In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)
from sklearn.tree import export_text

In [2]:
train = pd.read_csv("train_jqd04QH.csv")
test = pd.read_csv("test_KaymcHn.csv")
sample = pd.read_csv("sample_submission_sxfcbdx.csv")

In [78]:
train

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,Less_than_10,Funded Startup,1,69,0
2,46,city_16,0.910,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,Male,No relevent experience,no_enrollment,Masters,STEM,8,<function mean at 0x0000000005391CA8>,Pvt Ltd,2,88,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18354,25366,city_103,0.920,Male,Has relevent experience,Full time course,Graduate,STEM,5,Less_than_10,Pvt Ltd,1,71,0
18355,25545,city_160,0.920,Male,No relevent experience,no_enrollment,Graduate,Humanities,15,50-99,Pvt Ltd,1,160,0
18356,11514,city_114,0.926,Male,Has relevent experience,no_enrollment,Masters,STEM,11,50-99,Pvt Ltd,3,18,0
18357,1689,city_75,0.939,Male,Has relevent experience,no_enrollment,Graduate,STEM,Greater_than_20,10-49,Pvt Ltd,3,41,0


In [4]:
train["city"].value_counts()

city_103    4358
city_21     1672
city_16     1654
city_114    1472
city_160     827
            ... 
city_121       3
city_129       3
city_111       3
city_171       2
city_140       1
Name: city, Length: 123, dtype: int64

In [5]:
train["training_hours"].value_counts()

28     307
12     291
18     285
20     269
22     263
      ... 
244      6
234      6
272      5
238      4
286      4
Name: training_hours, Length: 241, dtype: int64

In [6]:
train.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,18359.0,18359.0,18359.0,18359.0
mean,16729.360096,0.84714,65.899014,0.132088
std,9643.749725,0.110189,60.8853,0.338595
min,1.0,0.448,1.0,0.0
25%,8378.5,0.796,23.0,0.0
50%,16706.0,0.91,47.0,0.0
75%,25148.5,0.92,89.0,0.0
max,33380.0,0.949,336.0,1.0


In [7]:
train.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [8]:
train["gender"].value_counts()

Male      12884
Female     1188
Other       189
Name: gender, dtype: int64

In [9]:
train["gender"] = train["gender"].replace({np.NaN:"Male"})

In [10]:
train.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       0
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [11]:
train["enrolled_university"].value_counts()

no_enrollment       13659
Full time course     3187
Part time course     1171
Name: enrolled_university, dtype: int64

In [12]:
train["enrolled_university"] = train["enrolled_university"].replace({np.NaN:"no_enrollment"})

In [13]:
train.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       0
relevent_experience          0
enrolled_university          0
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [14]:
train["education_level"].value_counts()

Graduate          10769
Masters            4319
High School        2032
Phd                 459
Primary School      323
Name: education_level, dtype: int64

In [15]:
train["education_level"] = train["education_level"].replace({np.NaN:"Graduate"})

In [16]:
train["major_discipline"].value_counts()

STEM               13738
Humanities           688
Other                343
Business Degree      307
Arts                 239
No Major             206
Name: major_discipline, dtype: int64

In [17]:
train["major_discipline"] = train["major_discipline"].replace({np.NaN:"STEM"})

In [18]:
train["experience"].value_counts()

>20    3437
5      1309
4      1250
3      1159
6      1125
2       992
9       979
10      967
7       950
8       755
15      695
11      667
14      602
16      549
12      497
1       452
<1      416
13      412
17      347
19      308
18      286
20      146
Name: experience, dtype: int64

In [19]:
train["experience"] = train["experience"].replace({np.NaN:np.mean})

In [20]:
train["experience"] = train["experience"].replace({"<1":"Less_than_1year",">20":"Greater_than_20"})

In [21]:
train["company_size"].value_counts()

50-99        3120
100-500      2698
10000+       2044
10/49        1466
1000-4999    1399
<10          1360
500-999       902
5000-9999     591
Name: company_size, dtype: int64

In [22]:
train["company_size"] = train["company_size"].replace({"10000+":"Greater_than_10000","<10":"Less_than_10","10/49":"10-49"})

In [23]:
train["company_size"] = train["company_size"].replace({np.NaN:np.mean})

In [24]:
train["company_type"].value_counts()

Pvt Ltd                10051
Funded Startup          1038
Public Sector            996
Early Stage Startup      582
NGO                      534
Other                    119
Name: company_type, dtype: int64

In [25]:
train["company_type"] = train["company_type"].replace({np.NaN:"Pvt Ltd"})

In [26]:
train["last_new_job"].value_counts()

1        7567
>4       3339
2        2835
never    2186
4        1038
3        1027
Name: last_new_job, dtype: int64

In [27]:
train["last_new_job"] = train["last_new_job"].replace({">4":"Greater_than_4"})

In [28]:
train["last_new_job"] = train["last_new_job"].replace({np.NaN:1})

In [29]:
train.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [30]:
train["training_hours"].value_counts()

28     307
12     291
18     285
20     269
22     263
      ... 
244      6
234      6
272      5
238      4
286      4
Name: training_hours, Length: 241, dtype: int64

In [31]:
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,Less_than_10,Funded Startup,1,69,0
2,46,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,Male,No relevent experience,no_enrollment,Masters,STEM,8,<function mean at 0x0000000005391CA8>,Pvt Ltd,2,88,1


In [32]:
columns_to_encode = ["city","gender","education_level","relevent_experience","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job"]

In [33]:
dummy = pd.get_dummies(train[columns_to_encode])

In [34]:
model_df = pd.concat([train.drop(["enrollee_id","city","gender","education_level","relevent_experience","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job"],axis=1),dummy],axis=1)

In [35]:
model_df.head()

Unnamed: 0,city_development_index,training_hours,target,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,...,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_1.1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_Greater_than_4,last_new_job_never
0,0.689,106,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0.923,69,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.91,4,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,0.666,26,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,0.887,88,1,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [36]:
X = model_df.drop(["target"],axis=1)
y = model_df["target"]

In [37]:
y.value_counts()

0    15934
1     2425
Name: target, dtype: int64

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=101)

In [39]:
#Since there is class imbalance here we are oversampling the minority class

In [40]:
z = pd.concat([X_train,y_train],axis=1)

In [41]:
#Seperate minority and majority class

In [42]:
notlooking_for_jobchange = z[z.target==0]
looking_for_jobchange = z[z.target==1]

In [43]:
from sklearn.utils import resample

In [44]:
looking_for_jobchange_upsampled = resample(looking_for_jobchange,replace = True,n_samples = len(notlooking_for_jobchange), random_state = 27)

In [45]:
upsampled = pd.concat([notlooking_for_jobchange,looking_for_jobchange_upsampled])

In [46]:
upsampled.target.value_counts()

1    12750
0    12750
Name: target, dtype: int64

In [47]:
upsampled.head()

Unnamed: 0,city_development_index,training_hours,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,city_city_105,...,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_1.1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_Greater_than_4,last_new_job_never,target
5309,0.802,101,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11152,0.624,96,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
14194,0.866,19,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4561,0.647,33,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4427,0.754,30,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [48]:
x = upsampled.drop(["target"],axis=1)
Y = upsampled["target"]

In [49]:
#Again splitting the oversampled data into train and test

In [50]:
x_train,x_test,Y_train,Y_test = train_test_split(x,Y,test_size=0.3,random_state=101)

In [51]:
#Logistic Regression

In [52]:
log = LogisticRegression()
log.fit(x_train,Y_train)
pred = log.predict(x_test)
print(accuracy_score(pred,Y_test))

0.6469281045751634


In [53]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log, x_train, Y_train, cv=10)
scores

array([0.65098039, 0.62857143, 0.64145658, 0.64705882, 0.65378151,
       0.64145658, 0.62857143, 0.65154062, 0.62577031, 0.63641457])

In [54]:
#Decision Tree

In [56]:
def fit_predict(train, test, Y_train, Y_test, scaler, max_depth,
               criterion ="entropy",max_features=1, min_samples_split=2):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    dt = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth,
                                random_state=42, max_features=max_features,
                                 min_samples_split = min_samples_split)
    dt.fit(train_scaled,Y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(Y_test, y_pred))

In [55]:
tree = DecisionTreeClassifier()
tree.fit(x_train,Y_train)
predtree = tree.predict(x_test)
print(accuracy_score(Y_test,predtree))

0.8984313725490196


In [None]:
#Maxdepth Tuning

In [57]:
max_depth = []
for i in range(1,30):
    print("Accuracy using maxdepth =",i, end = ": ")
    fit_predict(x_train,x_test, Y_train, Y_test, StandardScaler(),i)

Accuracy using maxdepth = 1: 0.4945098039215686
Accuracy using maxdepth = 2: 0.494640522875817
Accuracy using maxdepth = 3: 0.494640522875817
Accuracy using maxdepth = 4: 0.49712418300653594
Accuracy using maxdepth = 5: 0.49856209150326797
Accuracy using maxdepth = 6: 0.49751633986928107
Accuracy using maxdepth = 7: 0.5104575163398692
Accuracy using maxdepth = 8: 0.5124183006535947
Accuracy using maxdepth = 9: 0.4992156862745098
Accuracy using maxdepth = 10: 0.5019607843137255
Accuracy using maxdepth = 11: 0.5006535947712418
Accuracy using maxdepth = 12: 0.5023529411764706
Accuracy using maxdepth = 13: 0.5094117647058823
Accuracy using maxdepth = 14: 0.5083660130718954
Accuracy using maxdepth = 15: 0.5094117647058823
Accuracy using maxdepth = 16: 0.5122875816993464
Accuracy using maxdepth = 17: 0.5086274509803922
Accuracy using maxdepth = 18: 0.5258823529411765
Accuracy using maxdepth = 19: 0.5406535947712419
Accuracy using maxdepth = 20: 0.537516339869281
Accuracy using maxdepth = 21:

In [None]:
#Here maxdepth = 27

In [None]:
#Max Feature

In [60]:
for i in np.arange(0.1,1.0,0.1):
    print("Accuracy using maxfeature :",i, end = ": ")
    fit_predict(x_train, x_test, Y_train, Y_test, StandardScaler(), max_depth = 27, max_features=i)

Accuracy using maxfeature : 0.1: 0.7881045751633987
Accuracy using maxfeature : 0.2: 0.7806535947712419
Accuracy using maxfeature : 0.30000000000000004: 0.8145098039215686
Accuracy using maxfeature : 0.4: 0.8226143790849674
Accuracy using maxfeature : 0.5: 0.8158169934640523
Accuracy using maxfeature : 0.6: 0.8283660130718954
Accuracy using maxfeature : 0.7000000000000001: 0.8308496732026144
Accuracy using maxfeature : 0.8: 0.8205228758169935
Accuracy using maxfeature : 0.9: 0.84


In [None]:
#max_feature = 0.9

In [None]:
#Minsamplesplit

In [62]:
for i in range(2,20):
    print("Accuracy using Min_sample_split =", i, end = ': ')
    fit_predict(x_train, x_test, Y_train, Y_test, StandardScaler(), max_depth = 27, max_features = 0.9, min_samples_split = i)

Accuracy using Min_sample_split = 2: 0.84
Accuracy using Min_sample_split = 3: 0.8329411764705882
Accuracy using Min_sample_split = 4: 0.8274509803921568
Accuracy using Min_sample_split = 5: 0.8220915032679739
Accuracy using Min_sample_split = 6: 0.8113725490196079
Accuracy using Min_sample_split = 7: 0.8198692810457516
Accuracy using Min_sample_split = 8: 0.8077124183006535
Accuracy using Min_sample_split = 9: 0.8118954248366013
Accuracy using Min_sample_split = 10: 0.8105882352941176
Accuracy using Min_sample_split = 11: 0.8031372549019608
Accuracy using Min_sample_split = 12: 0.8
Accuracy using Min_sample_split = 13: 0.8049673202614379
Accuracy using Min_sample_split = 14: 0.8026143790849674
Accuracy using Min_sample_split = 15: 0.7915032679738562
Accuracy using Min_sample_split = 16: 0.7945098039215687
Accuracy using Min_sample_split = 17: 0.7884967320261438
Accuracy using Min_sample_split = 18: 0.7895424836601307
Accuracy using Min_sample_split = 19: 0.7887581699346405


In [None]:
#Here minsamplesplit = 2

In [None]:
#Criteriontuning

In [63]:
for i in ["entropy","gini"] :
    print("Accuracy using Criterion =", i, end = ': ')
    fit_predict(x_train, x_test, Y_train, Y_test, StandardScaler(), max_depth = 27, max_features = 0.9, min_samples_split = 2,
                criterion = i)

Accuracy using Criterion = entropy: 0.84
Accuracy using Criterion = gini: 0.8616993464052287


In [65]:
tree1 = DecisionTreeClassifier(criterion = "gini",max_depth =27,
                             max_features =0.9, min_samples_split = 2, random_state = 42)

In [66]:
tree1.fit(x_train,Y_train)

DecisionTreeClassifier(max_depth=27, max_features=0.9, random_state=42)

In [83]:
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score

In [68]:
pred_tree1 = tree1.predict(x_test)
print(accuracy_score(Y_test,pred_tree1))
print(roc_auc_score(Y_test,pred_tree1))
print(confusion_matrix(Y_test,pred_tree1))
print(classification_report(Y_test,pred_tree1))

0.8616993464052287
0.8624894309953454
[[3057  810]
 [ 248 3535]]
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      3867
           1       0.81      0.93      0.87      3783

    accuracy                           0.86      7650
   macro avg       0.87      0.86      0.86      7650
weighted avg       0.87      0.86      0.86      7650



In [None]:
#Random Forest

In [69]:
rf = RandomForestClassifier()
rf.fit(x_train, Y_train)
y_predrf = rf.predict(x_test)
print("RandomForest Accuracy:",accuracy_score(Y_test, y_predrf))

RandomForest Accuracy: 0.9588235294117647


In [59]:
from sklearn.model_selection import GridSearchCV

In [71]:
params = {"n_estimators" : [200, 300, 500,900], "max_depth" : [2, 4, 6], "min_samples_leaf" : [2, 3 ,5]}

In [72]:
gsv = GridSearchCV(rf, params, n_jobs=-1,verbose=3)

In [73]:
gsv.fit(x_train, Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  5.8min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 4, 6], 'min_samples_leaf': [2, 3, 5],
                         'n_estimators': [200, 300, 500, 900]},
             verbose=3)

In [74]:
gsv.best_params_

{'max_depth': 6, 'min_samples_leaf': 2, 'n_estimators': 500}

In [75]:
rf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            max_depth=6, max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=2, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
                            oob_score=True, random_state=101, verbose=0, warm_start=False)

In [80]:
rf1.fit(x_train,Y_train)
pred_rf1 = rf1.predict(x_test)
print(accuracy_score(Y_test, pred_rf1))

0.6495424836601307


In [81]:
print(roc_auc_score(Y_test,pred_rf1))
print(confusion_matrix(Y_test,pred_rf1))
print(classification_report(Y_test,pred_rf1))

0.6492375243704892
[[2618 1249]
 [1432 2351]]
              precision    recall  f1-score   support

           0       0.65      0.68      0.66      3867
           1       0.65      0.62      0.64      3783

    accuracy                           0.65      7650
   macro avg       0.65      0.65      0.65      7650
weighted avg       0.65      0.65      0.65      7650



In [82]:
rf1.oob_score_

0.6491876750700281

In [83]:
FI = rf1.feature_importances_

In [85]:
sorted(list(zip(FI,x_train.columns)),reverse=True)

[(0.18282906699211812, 'city_development_index'),
 (0.11142796081742305, 'city_city_21'),
 (0.10136320678771811, 'company_size_<function mean at 0x00000000053B4CA8>'),
 (0.05188135321672432, 'enrolled_university_no_enrollment'),
 (0.04775487969724027, 'enrolled_university_Full time course'),
 (0.039432939712215766, 'relevent_experience_Has relevent experience'),
 (0.03907913864250712, 'relevent_experience_No relevent experience'),
 (0.029834063939587693, 'experience_Greater_than_20'),
 (0.023423716830524414, 'training_hours'),
 (0.01719272383680361, 'experience_1'),
 (0.015329772825490477, 'city_city_16'),
 (0.014502791873576105, 'experience_Less_than_1year'),
 (0.013690652270957407, 'city_city_136'),
 (0.012763324642408478, 'education_level_Graduate'),
 (0.012586772652337489, 'last_new_job_1'),
 (0.012316521617764175, 'city_city_114'),
 (0.011114446601043238, 'company_size_100-500'),
 (0.010315762146453107, 'last_new_job_never'),
 (0.009873937453346129, 'company_size_50-99'),
 (0.0092

In [None]:
#KNN

In [86]:
scaler = StandardScaler()

In [87]:
x_train_std = scaler.fit_transform(x_train)

In [88]:
from sklearn.neighbors import KNeighborsClassifier

In [89]:
knn = KNeighborsClassifier(n_neighbors=1)

In [91]:
knn.fit(x_train_std,Y_train)

KNeighborsClassifier(n_neighbors=1)

In [92]:
x_test_std = scaler.transform(x_test)

In [93]:
pred_knn = knn.predict(x_test_std)

In [94]:
print(accuracy_score(pred_knn,Y_test))
print(classification_report(Y_test, pred_knn),confusion_matrix(Y_test, pred_knn))

0.9111111111111111
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      3867
           1       0.85      0.99      0.92      3783

    accuracy                           0.91      7650
   macro avg       0.92      0.91      0.91      7650
weighted avg       0.92      0.91      0.91      7650
 [[3216  651]
 [  29 3754]]


In [None]:
#Tuning hyperparameter

In [95]:
params_knn = {"n_neighbors":range(1,30)}

In [96]:
gsvk = GridSearchCV(knn,params_knn,verbose=3, n_jobs=-1)

In [97]:
gsvk.fit(x_train, Y_train)

Fitting 5 folds for each of 29 candidates, totalling 145 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 145 out of 145 | elapsed:  2.8min finished


GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=1), n_jobs=-1,
             param_grid={'n_neighbors': range(1, 30)}, verbose=3)

In [98]:
gsvk.best_params_

{'n_neighbors': 1}

In [99]:
knnr1 = KNeighborsClassifier(n_neighbors=1)

In [101]:
knnr1.fit(x_train_std,Y_train)
pred_knnr1 = knnr1.predict(x_test_std)
print(accuracy_score(pred_knnr1,Y_test))
print(classification_report(Y_test, pred_knnr1))
print(confusion_matrix(Y_test, pred_knnr1))

0.9111111111111111
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      3867
           1       0.85      0.99      0.92      3783

    accuracy                           0.91      7650
   macro avg       0.92      0.91      0.91      7650
weighted avg       0.92      0.91      0.91      7650

[[3216  651]
 [  29 3754]]


In [None]:
#Adaboost

In [102]:
from sklearn.ensemble import AdaBoostClassifier

In [104]:
dtree = DecisionTreeClassifier(criterion ="gini",max_depth =1)

In [105]:
adabst = AdaBoostClassifier(base_estimator = dtree)

In [106]:
adabst.fit(x_train,Y_train)
pred_ada = adabst.predict(x_test)
print(accuracy_score(Y_test,pred_ada))

0.6481045751633987


In [108]:
paramsada = {'n_estimators': [200,500,700,900]}

In [111]:
gsvada = GridSearchCV(adabst, paramsada,verbose=3,n_jobs=-1)

In [112]:
gsvada.fit(x_train,Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.0min finished


GridSearchCV(estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1)),
             n_jobs=-1, param_grid={'n_estimators': [200, 500, 700, 900]},
             verbose=3)

In [113]:
gsvada.best_params_

{'n_estimators': 500}

In [114]:
adabst1 = AdaBoostClassifier(base_estimator = dtree, n_estimators = 500,
                            random_state = 42)

In [115]:
adabst1.fit(x_train, Y_train)
pred_adabst1 = adabst1.predict(x_test)
print(accuracy_score(pred_adabst1,Y_test))
print(classification_report(Y_test, pred_adabst1))
print(confusion_matrix(Y_test, pred_adabst1))

0.649281045751634
              precision    recall  f1-score   support

           0       0.65      0.67      0.66      3867
           1       0.65      0.63      0.64      3783

    accuracy                           0.65      7650
   macro avg       0.65      0.65      0.65      7650
weighted avg       0.65      0.65      0.65      7650

[[2587 1280]
 [1403 2380]]


In [None]:
#GradientBoostClassifier

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

In [53]:
gbc = GradientBoostingClassifier()

In [55]:
gbc.fit(x_train,Y_train)
pred_gbc = gbc.predict(x_test)
print(accuracy_score(pred_gbc,Y_test))

0.6675816993464052


In [57]:
paramsgbc = {'n_estimators': [200,500,700,900], 'max_depth': [4, 5, 8], 'min_samples_leaf': [2, 3, 5]}

In [60]:
gsvgbc = GridSearchCV(gbc, paramsgbc, verbose=3, n_jobs=-1)

In [61]:
gsvgbc.fit(x_train,Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 47.6min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 90.2min finished


GridSearchCV(estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'max_depth': [4, 5, 8], 'min_samples_leaf': [2, 3, 5],
                         'n_estimators': [200, 500, 700, 900]},
             verbose=3)

In [62]:
gsvgbc.best_params_

{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 900}

In [63]:
gbc2 = GradientBoostingClassifier(n_estimators = 900, max_depth = 8, min_samples_leaf = 2, min_samples_split = 2, random_state=42)

In [66]:
gbc2.fit(x_train, Y_train)
predict_gbc2 = gbc2.predict(x_test)
print(accuracy_score(Y_test, predict_gbc2))
print(roc_auc_score(Y_test,predict_gbc2))
print(classification_report(Y_test, predict_gbc2))
print(confusion_matrix(Y_test, predict_gbc2))

0.9333333333333333
0.9339569225519335
              precision    recall  f1-score   support

           0       0.99      0.88      0.93      3867
           1       0.89      0.99      0.94      3783

    accuracy                           0.93      7650
   macro avg       0.94      0.93      0.93      7650
weighted avg       0.94      0.93      0.93      7650

[[3392  475]
 [  35 3748]]


In [None]:
#LGBM Classifier

In [92]:
from lightgbm import LGBMModel,LGBMClassifier

In [93]:
lgb = LGBMClassifier()

In [95]:
lgb.fit(x_train, Y_train)
predict_lgb = lgb.predict(x_test)
print(accuracy_score(Y_test, predict_lgb))

0.7411764705882353


In [96]:
paramslgb = {'n_estimators': [200,500,700,900], 'max_depth': [4, 5, 8], 'min_samples_leaf': [2, 3, 5]}

In [97]:
gsvlgb = GridSearchCV(lgb, paramslgb, verbose=3, n_jobs=-1)

In [98]:
gsvlgb.fit(x_train, Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.8min finished


GridSearchCV(estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'max_depth': [4, 5, 8], 'min_samples_leaf': [2, 3, 5],
                         'n_estimators': [200, 500, 700, 900]},
             verbose=3)

In [99]:
gsvlgb.best_params_

{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 900}

In [100]:
lgb1 = LGBMClassifier(max_depth = 8, min_samples_split = 2, min_samples_leaf = 2, n_estimators = 900)

In [101]:
lgb1.fit(x_train, Y_train)
predict_lgb1 = lgb1.predict(x_test)
print(accuracy_score(Y_test, predict_lgb1))
print(roc_auc_score(Y_test,predict_lgb1))
print(classification_report(Y_test, predict_lgb1))
print(confusion_matrix(Y_test, predict_lgb1))

0.88
0.8807176443880353
              precision    recall  f1-score   support

           0       0.94      0.82      0.87      3867
           1       0.83      0.95      0.89      3783

    accuracy                           0.88      7650
   macro avg       0.89      0.88      0.88      7650
weighted avg       0.89      0.88      0.88      7650

[[3153  714]
 [ 204 3579]]


In [None]:
#SVM

In [80]:
from sklearn.svm import SVC

In [103]:
sv = SVC()

In [104]:
sv.fit(x_train, Y_train)
predict_sv = sv.predict(x_test)
print(accuracy_score(Y_test, predict_sv))

0.5551633986928105


In [105]:
paramssvc = {'C':[1.0,10.0,100.0,0.1],'gamma':[1,0.1,0.01,0.001]}

In [106]:
gsvsvc = GridSearchCV(sv, paramssvc, verbose=3, n_jobs=-1)

In [107]:
gsvsvc.fit(x_train, Y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 76.4min finished


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1.0, 10.0, 100.0, 0.1],
                         'gamma': [1, 0.1, 0.01, 0.001]},
             verbose=3)

In [108]:
gsvsvc.best_params_

{'C': 1.0, 'gamma': 1}

In [81]:
svc1 = SVC(C = 1.0 , gamma = 1)

In [84]:
svc1.fit(x_train, Y_train)
predict_svc1 = svc1.predict(x_test)
print(accuracy_score(Y_test, predict_svc1))
print(roc_auc_score(Y_test,predict_svc1))
print(classification_report(Y_test, predict_svc1))
print(confusion_matrix(Y_test, predict_svc1))

0.9954248366013072
0.9953912679873027
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3867
           1       1.00      0.99      1.00      3783

    accuracy                           1.00      7650
   macro avg       1.00      1.00      1.00      7650
weighted avg       1.00      1.00      1.00      7650

[[3861    6]
 [  29 3754]]


In [None]:
#Test data Analaysis

In [51]:
test.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,16548,city_33,0.448,,No relevent experience,Full time course,Graduate,STEM,<1,1000-4999,Public Sector,,15
1,12036,city_28,0.939,Male,No relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1.0,94
2,11061,city_103,0.92,Male,No relevent experience,Full time course,Graduate,STEM,3,,,1.0,17
3,5032,city_104,0.924,Male,No relevent experience,no_enrollment,Phd,STEM,>20,50-99,Pvt Ltd,2.0,76
4,17599,city_77,0.83,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,<10,Pvt Ltd,2.0,65


In [52]:
test.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    3388
relevent_experience          0
enrolled_university        279
education_level            395
major_discipline          2393
experience                  44
company_size              4051
company_type              4330
last_new_job               304
training_hours               0
dtype: int64

In [54]:
test["gender"].value_counts()

Male      13966
Female      897
Other       158
Name: gender, dtype: int64

In [53]:
test["gender"] = test["gender"].replace({np.NaN:"Male"})

In [55]:
test["enrolled_university"].value_counts()

no_enrollment       11228
Full time course     2565
Part time course      949
Name: enrolled_university, dtype: int64

In [56]:
test["enrolled_university"] = test["enrolled_university"].replace({np.NaN:"no_enrollment"})

In [57]:
test["education_level"].value_counts()

Graduate          8743
Masters           3478
High School       1676
Phd                422
Primary School     307
Name: education_level, dtype: int64

In [58]:
test["education_level"] = test["education_level"].replace({np.NaN:"Graduate"})

In [59]:
test["major_discipline"].value_counts()

STEM               11117
Humanities           524
Other                335
Business Degree      278
Arts                 193
No Major             181
Name: major_discipline, dtype: int64

In [60]:
test["major_discipline"] = test["major_discipline"].replace({np.NaN:"STEM"})

In [61]:
test["experience"].value_counts()

>20    2713
5      1091
3      1007
4       990
6       914
7       829
10      786
2       764
9       743
8       668
15      549
11      523
14      478
16      415
12      408
<1      366
1       366
13      351
17      312
19      282
18      247
20      175
Name: experience, dtype: int64

In [63]:
test["experience"] = test["experience"].replace({np.NaN:np.mean})

In [62]:
test["experience"] = test["experience"].replace({"<1":"Less_than_1year",">20":"Greater_than_20"})

In [64]:
test["company_size"].value_counts()

50-99        2577
100-500      2147
10000+       1622
10/49        1198
1000-4999    1114
<10          1104
500-999       737
5000-9999     471
Name: company_size, dtype: int64

In [65]:
test["company_size"] = test["company_size"].replace({"10000+":"Greater_than_10000","<10":"Less_than_10","10/49":"10-49"})

In [66]:
test["company_size"] = test["company_size"].replace({np.NaN:np.mean})

In [67]:
test["company_type"].value_counts()

Pvt Ltd                8063
Funded Startup          842
Public Sector           796
Early Stage Startup     447
NGO                     439
Other                   104
Name: company_type, dtype: int64

In [68]:
test["company_type"] = test["company_type"].replace({np.NaN:"Pvt Ltd"})

In [69]:
test["last_new_job"].value_counts()

1        6246
>4       2684
2        2298
never    1757
3         895
4         837
Name: last_new_job, dtype: int64

In [70]:
test["last_new_job"] = test["last_new_job"].replace({">4":"Greater_than_4"})

In [71]:
test.isna().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                      0
relevent_experience         0
enrolled_university         0
education_level             0
major_discipline            0
experience                  0
company_size                0
company_type                0
last_new_job              304
training_hours              0
dtype: int64

In [72]:
test["last_new_job"] = test["last_new_job"].replace({np.NaN:1})

In [73]:
columns_to_encode1 = ["city","gender","education_level","relevent_experience","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job"]

In [74]:
dummy1 = pd.get_dummies(test[columns_to_encode1])

In [75]:
model_df1 = pd.concat([test.drop(["enrollee_id","city","gender","education_level","relevent_experience","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job"],axis=1),dummy1],axis=1)

In [76]:
model_df1

Unnamed: 0,city_development_index,training_hours,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,city_city_105,...,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_1.1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_Greater_than_4,last_new_job_never
0,0.448,15,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0.939,94,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0.920,17,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
3,0.924,76,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
4,0.830,65,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15016,0.762,68,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
15017,0.624,320,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
15018,0.910,13,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
15019,0.666,38,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [77]:
sample

Unnamed: 0,enrollee_id,target
0,16548,0
1,12036,0
2,11061,0
3,5032,0
4,17599,0
...,...,...
15016,11308,0
15017,14612,0
15018,33346,0
15019,14506,0


In [None]:
#Predicting using SVM model

In [85]:
pred_sample1 = svc1.predict(model_df1)

In [86]:
sample["target"] = pred_sample1

In [87]:
sample.to_csv('sample_HRAnalyticsss.csv', header=True, index=False)