In [1]:
#loading the libraries
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import lightgbm as gbm

In [2]:
os.chdir('D:/janata_hack/')

In [3]:
ls

 Volume in drive D is New Volume
 Volume Serial Number is 9C5A-0CCF

 Directory of D:\janata_hack

10-05-2020  02:33    <DIR>          .
10-05-2020  02:33    <DIR>          ..
10-05-2020  02:33           160,246 rf_test.csv
09-05-2020  02:09           115,182 sample_submission_sxfcbdx.csv
09-05-2020  02:09         1,453,325 test_KaymcHn.csv
09-05-2020  02:09         1,817,701 train_jqd04QH.csv
               4 File(s)      3,546,454 bytes
               2 Dir(s)  308,895,301,632 bytes free


In [4]:
#reading the training and testing data
train_data = pd.read_csv('train_jqd04QH.csv')
test_data = pd.read_csv('test_KaymcHn.csv')

In [5]:
#shape of training and testing data
print("Shape of training data:", train_data.shape)
print("Shape of testing data:", test_data.shape)

Shape of training data: (18359, 14)
Shape of testing data: (15021, 13)


In [6]:
#combining test and train into one dataframe because some variables might have more or less categories in the train or test data
train_data['source']='train'
test_data['source']='test'
final_df = pd.concat([train_data, test_data],ignore_index=True,sort=True)
data = final_df.copy()

In [7]:
print("Shape of data", data.shape)

Shape of data (33380, 15)


In [8]:
#check for datatypes
data.dtypes

city                       object
city_development_index    float64
company_size               object
company_type               object
education_level            object
enrolled_university        object
enrollee_id                 int64
experience                 object
gender                     object
last_new_job               object
major_discipline           object
relevent_experience        object
source                     object
target                    float64
training_hours              int64
dtype: object

In [9]:
#checking for missing value percentage
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
print(missing_value_df)

                        percent_missing
city                           0.000000
city_development_index         0.000000
company_size                  26.452966
company_type                  28.067705
education_level                2.552427
enrolled_university            1.860395
enrollee_id                    0.000000
experience                     0.308568
gender                        22.426603
last_new_job                   2.010186
major_discipline              15.671061
relevent_experience            0.000000
source                         0.000000
target                        45.000000
training_hours                 0.000000


In [10]:
#check for no of missing values
print(data.isnull().sum())

city                          0
city_development_index        0
company_size               8830
company_type               9369
education_level             852
enrolled_university         621
enrollee_id                   0
experience                  103
gender                     7486
last_new_job                671
major_discipline           5231
relevent_experience           0
source                        0
target                    15021
training_hours                0
dtype: int64


In [11]:
#converting categorical bins to ordinal data
def coding(col, codeDict):
    colCoded = pd.Series(col, copy=True)
    for key, value in codeDict.items():
        colCoded.replace(key, value, inplace=True)
    return colCoded

In [12]:
#Replacing 10/49 by 10-49 for consistency and converting it into a categorical ordering 
data.company_size.replace('10/49', '10-49', inplace=True)
data['company_size'] = coding(data['company_size'], {'<10':1,'10-49':2,'50-99':3,'100-500':4,\
                                                     '500-999':5, '1000-4999':6, '5000-9999':7, '10000+':8})
print(data.company_size.value_counts())

3.0    5697
4.0    4845
8.0    3666
2.0    2664
6.0    2513
1.0    2464
5.0    1639
7.0    1062
Name: company_size, dtype: int64


In [13]:
#Checking company type
print(data.company_type.value_counts())

Pvt Ltd                18114
Funded Startup          1880
Public Sector           1792
Early Stage Startup     1029
NGO                      973
Other                    223
Name: company_type, dtype: int64


In [14]:
#Company size and company type can be correlated logically
data.groupby('company_type')['company_size'].value_counts()

company_type         company_size
Early Stage Startup  1.0              511
                     2.0              288
                     3.0              185
                     4.0               38
                     5.0                2
Funded Startup       3.0              745
                     4.0              403
                     2.0              356
                     1.0              267
                     5.0              100
NGO                  4.0              334
                     3.0              144
                     6.0              130
                     8.0               80
                     5.0               69
                     1.0               62
                     2.0               55
                     7.0               54
Other                3.0               39
                     8.0               38
                     4.0               33
                     6.0               26
                     1.0               21


In [15]:
#Missing values as -999 for company size and mode for companytype
data['company_size'].fillna(-1, inplace=True)
data['company_type'].fillna('Pvt Ltd', inplace=True)

In [16]:
#Replacing company type and size according to the distributions, 
#Like replacing company size as 1(<10) for early startup company type in case of missing values
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'Early Stage Startup'), 1, data['company_size'])
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'Funded Startup'), 3, data['company_size'])
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'NGO'), 4, data['company_size'])
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'Other'), 6, data['company_size'])
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'Public Sector'), 6, data['company_size'])
data['company_size'] = np.where((data['company_size'] == -1) & 
                                (data['company_type'] == 'Pvt Ltd'), 3, data['company_size'])

In [17]:
#Analyzing enrolled university
data['enrolled_university'].fillna('no_enrollment', inplace=True)
print(data['enrolled_university'].value_counts())

no_enrollment       25508
Full time course     5752
Part time course     2120
Name: enrolled_university, dtype: int64


In [18]:
#Analyzing education level
data['education_level'] = coding(data['education_level'], {'Primary School':1, 'High School':2, 
                                                          'Graduate':3, 'Masters':4, 'Phd':5})
print(data['education_level'].value_counts())

3.0    19512
4.0     7797
2.0     3708
5.0      881
1.0      630
Name: education_level, dtype: int64


In [19]:
#Analyzing experience
#Looking at the distribution dividing into 6 buckets with intervals of 5
data['experience'] = coding(data['experience'], {'<1':1,'1':1,'2':1,'3':1,'4':1,'5':1,'6':2,'7':2,'8':2,'9':2,'10':2,
'11':3,'12':3,'13':3,'14':3,'15':3,'16':4,'17':4,'18':4,'19':4,'20':4,'>20':5})
print(data['experience'].value_counts())

1.0    10162
2.0     8716
5.0     6150
3.0     5182
4.0     3067
Name: experience, dtype: int64


In [20]:
#Education and experience can be correlated logically
data.groupby('education_level')['experience'].value_counts()

education_level  experience
1.0              1.0            429
                 2.0            113
                 5.0             36
                 3.0             35
                 4.0             13
2.0              1.0           2026
                 2.0            926
                 3.0            307
                 5.0            265
                 4.0            176
3.0              1.0           5868
                 2.0           5388
                 5.0           3393
                 3.0           3081
                 4.0           1736
4.0              2.0           1983
                 5.0           1909
                 3.0           1559
                 1.0           1341
                 4.0            979
5.0              5.0            447
                 3.0            138
                 2.0            126
                 4.0            115
                 1.0             52
Name: experience, dtype: int64

In [21]:
#Replacing missing values
data['education_level'].fillna(3, inplace=True)
data['experience'].fillna(-1, inplace=True)

In [22]:
#Replacing education and experience according to distribution
data['experience'] = np.where((data['experience'] == -1) & 
                                (data['education_level'] == 1), 1, data['experience'])
data['experience'] = np.where((data['experience'] == -1) & 
                                (data['education_level'] == 2), 1, data['experience'])
data['experience'] = np.where((data['experience'] == -1) & 
                                (data['education_level'] == 3), 1, data['experience'])
data['experience'] = np.where((data['experience'] == -1) & 
                                (data['education_level'] == 4), 2, data['experience'])
data['experience'] = np.where((data['experience'] == -1) & 
                                (data['education_level'] == 5), 5, data['experience'])

In [23]:
#Replacing gender na values with mode
data['gender'].fillna('Male', inplace=True)
print(data['gender'].value_counts())

Male      30948
Female     2085
Other       347
Name: gender, dtype: int64


In [24]:
#Last new job
data['last_new_job'] = coding(data['last_new_job'], {'never':0,'1':1,'2':2,'3':3,'4':4,'>4':5})
data['last_new_job'].fillna(1, inplace=True)
print(data['last_new_job'].value_counts())

1.0    14484
5.0     6023
2.0     5133
0.0     3943
3.0     1922
4.0     1875
Name: last_new_job, dtype: int64


In [25]:
#Major discipline
data['major_discipline'].fillna('STEM', inplace=True)
print(data['major_discipline'].value_counts())

STEM               30086
Humanities          1212
Other                678
Business Degree      585
Arts                 432
No Major             387
Name: major_discipline, dtype: int64


In [26]:
#Relevant experience
data['relevent_experience']=data['relevent_experience'].map(lambda x: 0 if x == 'No relevent experience' else 1)

In [27]:
#Separte train and test data
data['source']=data['source'].map(lambda x: 0 if x == 'train' else 1)

In [28]:
#Checking for null_values
data.isnull().sum()

city                          0
city_development_index        0
company_size                  0
company_type                  0
education_level               0
enrolled_university           0
enrollee_id                   0
experience                    0
gender                        0
last_new_job                  0
major_discipline              0
relevent_experience           0
source                        0
target                    15021
training_hours                0
dtype: int64

In [29]:
data.head()

Unnamed: 0,city,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,gender,last_new_job,major_discipline,relevent_experience,source,target,training_hours
0,city_149,0.689,4.0,Pvt Ltd,3.0,no_enrollment,23798,1.0,Male,1.0,STEM,1,0,0.0,106
1,city_83,0.923,1.0,Funded Startup,3.0,no_enrollment,29166,3.0,Male,1.0,STEM,1,0,0.0,69
2,city_16,0.91,3.0,Public Sector,3.0,no_enrollment,46,2.0,Male,2.0,STEM,1,0,0.0,4
3,city_64,0.666,3.0,Pvt Ltd,3.0,no_enrollment,18527,3.0,Male,1.0,STEM,1,0,0.0,26
4,city_100,0.887,3.0,Pvt Ltd,4.0,no_enrollment,21751,2.0,Male,2.0,STEM,0,0,1.0,88


In [30]:
data.training_hours.describe()

count    33380.000000
mean        65.565638
std         60.363572
min          1.000000
25%         23.000000
50%         47.000000
75%         89.000000
max        336.000000
Name: training_hours, dtype: float64

In [31]:
#check for correlation
data.corr()

Unnamed: 0,city_development_index,company_size,education_level,enrollee_id,experience,last_new_job,relevent_experience,source,target,training_hours
city_development_index,1.0,0.100313,0.032743,-0.033087,0.288075,0.173196,0.062545,-0.002066,-0.1357,0.003372
company_size,0.100313,1.0,0.146638,0.000804,0.125745,0.141082,0.070138,-0.007316,-0.021184,-0.005375
education_level,0.032743,0.146638,1.0,-0.019462,0.277382,0.202814,0.230039,-0.002872,-0.002285,-0.00475
enrollee_id,-0.033087,0.000804,-0.019462,1.0,-0.024008,-0.020753,-0.034115,-0.004458,0.033873,0.00238
experience,0.288075,0.125745,0.277382,-0.024008,1.0,0.45217,0.317712,-0.004133,-0.082359,-0.002724
last_new_job,0.173196,0.141082,0.202814,-0.020753,0.45217,1.0,0.23798,-0.002153,-0.029527,0.001234
relevent_experience,0.062545,0.070138,0.230039,-0.034115,0.317712,0.23798,1.0,-0.001661,-0.074088,0.005862
source,-0.002066,-0.007316,-0.002872,-0.004458,-0.004133,-0.002153,-0.001661,1.0,,-0.006106
target,-0.1357,-0.021184,-0.002285,0.033873,-0.082359,-0.029527,-0.074088,,1.0,-0.004735
training_hours,0.003372,-0.005375,-0.00475,0.00238,-0.002724,0.001234,0.005862,-0.006106,-0.004735,1.0


In [32]:
#dummies for categorical stuff and city cuz
gender_d=pd.get_dummies(data.gender, prefix='gender').iloc[:, 1:]
enrolled_university_d=pd.get_dummies(data.enrolled_university, prefix='enrolled_university')
major_discipline_d=pd.get_dummies(data.major_discipline, prefix='major_discipline').iloc[:, 1:]
company_type_d=pd.get_dummies(data.company_type, prefix='company_type').iloc[:, 1:]

In [33]:
#dropping columns for whom dummy was created and city as it does not seem useful
data.drop(['gender','enrolled_university','major_discipline','company_type','city'], axis=1, inplace=True)
data.shape

(33380, 10)

In [34]:
data=pd.concat([data, gender_d,enrolled_university_d,major_discipline_d,company_type_d], axis=1)
data.head()

Unnamed: 0,city_development_index,company_size,education_level,enrollee_id,experience,last_new_job,relevent_experience,source,target,training_hours,...,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd
0,0.689,4.0,3.0,23798,1.0,1.0,1,0,0.0,106,...,0,0,0,0,1,0,0,0,0,1
1,0.923,1.0,3.0,29166,3.0,1.0,1,0,0.0,69,...,0,0,0,0,1,1,0,0,0,0
2,0.91,3.0,3.0,46,2.0,2.0,1,0,0.0,4,...,0,0,0,0,1,0,0,0,1,0
3,0.666,3.0,3.0,18527,3.0,1.0,1,0,0.0,26,...,0,0,0,0,1,0,0,0,0,1
4,0.887,3.0,4.0,21751,2.0,2.0,0,0,1.0,88,...,0,0,0,0,1,0,0,0,0,1


In [35]:
#segregating train and test data like how they were 
train = data.loc[data['source']==0]
test = data.loc[data['source']==1]

In [36]:
train=train.drop('source',axis=1)
test=test.drop(['source','target'],axis=1)

In [37]:
#defining the independent and dependent variables for test and train
x_train=train.drop(['enrollee_id','target'],axis=1)
y_train=train['target']
x_test=test.drop('enrollee_id',axis=1)

#applying random forest
rf=RandomForestClassifier(n_estimators=100, n_jobs=-1)

#fitting a model
rf.fit(x_train,y_train)

#prediction
pred_rand=rf.predict(x_test)

#accuracy of the training data
rf.score(x_train,y_train)

0.9954790565934963

In [38]:
#exporting to csv
predictions = pd.DataFrame()
predictions['enrollee_id'] = test['enrollee_id']
predictions['target'] = pred_rand
predictions.to_csv('rf_test.csv', index=False)

In [39]:
#applying logisitc regression
logreg=LogisticRegression()

#fitting a model
logreg.fit(x_train,y_train)

#prediction
pred_log=logreg.predict(x_test)

#accuracy of the training data
logreg.score(x_train,y_train)



0.867912195653358

In [40]:
#exporting to csv
predictions = pd.DataFrame()
predictions['enrollee_id'] = test['enrollee_id']
predictions['target'] = pred_log
predictions.to_csv('logistic_test.csv', index=False)

In [41]:
#applying decision tree
clf=DecisionTreeClassifier()

#fitting a model
clf.fit(x_train,y_train)

#prediction
pred_dec=clf.predict(x_test)

#accuracy of the training data
clf.score(x_train,y_train)

0.995533525791165

In [42]:
#exporting to csv
predictions = pd.DataFrame()
predictions['enrollee_id'] = test['enrollee_id']
predictions['target'] = pred_dec
predictions.to_csv('decisiontree_test.csv', index=False)

In [43]:
xg=XGBClassifier(min_child_weight=10,colsample_bytree=0.8,max_depth=8,nthread=4,eta=0.1,subsample=0.8,alpha=0,n_jobs=-1)
xg.fit(x_train,y_train)
pred_xgb=xg.predict(x_test)
xg.score(x_train,y_train)

0.8698730867694319

In [44]:
predictions = pd.DataFrame()
predictions['enrollee_id'] = test['enrollee_id']
predictions['target'] = pred_xgb
predictions.to_csv('xgb_test.csv', index=False)

In [45]:
params = {"objective" : "binary",
        "num_leaves" : 120,
        "learning_rate" : 0.05,
        "bagging_fraction" : 0.8,
        "feature_fraction" : 0.9,
        "bagging_frequency" : 5,
        "verbosity" : -1,
        "max depth" : 10,
        "boosting_type" : "gbdt",
        "metric": "binary_logloss"
    }

train_data=gbm.Dataset(x_train,y_train)
model= gbm.train(params, train_data, 100)
pred_gbm=model.predict(x_test)

In [46]:
#exporting to csv
predictions = pd.DataFrame()
predictions['enrollee_id'] = test['enrollee_id']
predictions['target'] = pred_gbm
predictions.to_csv('gbm_test.csv', index=False)