**Importing Library**

In [None]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.3.0-py2.py3-none-any.whl (82 kB)
[?25l[K     |████                            | 10 kB 15.7 MB/s eta 0:00:01[K     |████████                        | 20 kB 19.6 MB/s eta 0:00:01[K     |████████████                    | 30 kB 6.6 MB/s eta 0:00:01[K     |████████████████                | 40 kB 8.3 MB/s eta 0:00:01[K     |████████████████████            | 51 kB 6.1 MB/s eta 0:00:01[K     |████████████████████████        | 61 kB 7.1 MB/s eta 0:00:01[K     |████████████████████████████    | 71 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 81 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████████| 82 kB 313 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.3.0


In [None]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")        
import matplotlib
import seaborn as sns
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce
from imblearn.over_sampling import  SMOTE
from sklearn.linear_model import LogisticRegression

In [None]:
aug_train = pd.read_csv('aug_train.csv')
aug_train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


**Dropping Uneeded Columns**

We will drop the uneeded columns based on our findings on the EDA part.

In [None]:
aug_train.drop(columns=['enrollee_id','city','gender','company_size','company_type','training_hours'], axis = 1, inplace=True)

In [None]:
aug_train.head()

Unnamed: 0,city_development_index,relevent_experience,enrolled_university,education_level,major_discipline,experience,last_new_job,target
0,0.92,Has relevent experience,no_enrollment,Graduate,STEM,>20,1,1.0
1,0.776,No relevent experience,no_enrollment,Graduate,STEM,15,>4,0.0
2,0.624,No relevent experience,Full time course,Graduate,STEM,5,never,0.0
3,0.789,No relevent experience,,Graduate,Business Degree,<1,never,1.0
4,0.767,Has relevent experience,no_enrollment,Masters,STEM,>20,4,0.0


**Handling missing values**

Next, we will try to handle the missing values. We can drop some of the Nan entries from columns which have a little missing data (~around 2%), and we will try to impute major_discipline with mode, since it's missing quite a lot of data.

In [None]:
def count_percent(data):
  df_cols = pd.DataFrame({'Count Missing': data.isnull().sum(),
                        'Percent Missing': data.isnull().sum()*100/data.shape[0]})
  return df_cols

count_percent(aug_train)

Unnamed: 0,Count Missing,Percent Missing
city_development_index,0,0.0
relevent_experience,0,0.0
enrolled_university,386,2.014824
education_level,460,2.401086
major_discipline,2813,14.683161
experience,65,0.339284
last_new_job,423,2.207955
target,0,0.0


In [None]:
aug_train.dropna(subset=['enrolled_university', 'education_level','last_new_job','experience'], inplace = True)

In [None]:
aug_train['major_discipline'].fillna(aug_train['major_discipline'].mode().iloc[0],inplace = True)


In [None]:
count_percent(aug_train)

Unnamed: 0,Count Missing,Percent Missing
city_development_index,0,0.0
relevent_experience,0,0.0
enrolled_university,0,0.0
education_level,0,0.0
major_discipline,0,0.0
experience,0,0.0
last_new_job,0,0.0
target,0,0.0


**Data Correction**



**Feature Engineering**

In [None]:
scl = ['city_development_index']

In [None]:
ordinal_mapping = [
    {'col':'education_level',
    'mapping':{'Primary School':1,'High School':2, 'Graduate':3, 'Masters':4, 'Phd':5}},
    {'col':'enrolled_university',
    'mapping':{'no_enrollment':1,'Part time course':2, 'Full time course':3}},
    {'col':'experience',
    'mapping':{'<1':1,'1':2, '2':3, '3':4, '4':5,'5':6,'6':7,'7':8,'8':9,'9':10,'10':11,'11':12,'12':13,'13':14,'14':15,'15':16,'16':17,'17':18,'18':19,'19':20,'20':21,'>20':22}},
    {'col':'last_new_job',
    'mapping':{'never':1,'1':2, '2':3, '3':4, '4':5,'>4':6}}    
    ]

ordinal_encoder = ce.OrdinalEncoder(cols = ['education_level','enrolled_university','experience','last_new_job'],mapping = ordinal_mapping)

In [None]:
transformer = ColumnTransformer([
                                 ('one hot encoder',OneHotEncoder(),['relevent_experience','major_discipline']),
                                 ('binary encoder',ordinal_encoder,['education_level','enrolled_university','experience','last_new_job']),
                                 ('Standard scaler',StandardScaler(),scl)
                                ], remainder = 'passthrough')

In [None]:
X = aug_train.drop(columns=['target'], axis = 1)
y = aug_train['target']

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, 
    y, 
    stratify = y,
    test_size = 0.2,)

In [None]:
X_train_preprocessed = transformer.fit_transform(X_train_val)
X_test_preprocessed = transformer.transform(X_test)

In [None]:
X_train_preprocessed = pd.DataFrame(X_train_preprocessed)
X_test_preprocessed = pd.DataFrame(X_test_preprocessed)
X_train_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,13.0,2.0,0.639619
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,22.0,2.0,-0.636333
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,11.0,2.0,-1.699625
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,3.0,2.0,0.582365
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,6.0,1.0,0.377885
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14406,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,22.0,1.0,0.721411
14407,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,1.0,14.0,2.0,-1.094366
14408,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,14.0,5.0,-1.699625
14409,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,1.0,-1.094366


In [None]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_Has relevent experience', 'x0_No relevent experience',
       'x1_Arts', 'x1_Business Degree', 'x1_Humanities', 'x1_No Major',
       'x1_Other', 'x1_STEM'], dtype=object)

In [None]:
transformer.transformers_[1][1].get_feature_names()

['education_level', 'enrolled_university', 'experience', 'last_new_job']

In [None]:
features = list(['x0_Has relevent experience', 'x0_No relevent experience',
       'x1_Arts', 'x1_Business Degree', 'x1_Humanities', 'x1_No Major',
       'x1_Other', 'x1_STEM','education_level', 'enrolled_university', 'experience', 'last_new_job','city_development_index'])

In [None]:
X_train_preprocessed.columns = features
X_test_preprocessed.columns = features
X_train_preprocessed

Unnamed: 0,x0_Has relevent experience,x0_No relevent experience,x1_Arts,x1_Business Degree,x1_Humanities,x1_No Major,x1_Other,x1_STEM,education_level,enrolled_university,experience,last_new_job,city_development_index
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,13.0,2.0,0.639619
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,22.0,2.0,-0.636333
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,11.0,2.0,-1.699625
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,3.0,2.0,0.582365
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,6.0,1.0,0.377885
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14406,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,22.0,1.0,0.721411
14407,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,1.0,14.0,2.0,-1.094366
14408,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,14.0,5.0,-1.699625
14409,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,1.0,-1.094366


In [51]:
X_train_preprocessed.shape

(14411, 13)

**Handling Imbalance**

Oversampling using SMOTE

In [None]:
smote = SMOTE()
X_over, y_over = smote.fit_resample(X_train_preprocessed, y_train_val)

In [52]:
X_over.shape

(21748, 13)

In [53]:
y_train_val.value_counts()

0.0    10874
1.0     3537
Name: target, dtype: int64

In [54]:
y_over.value_counts()

1.0    10874
0.0    10874
Name: target, dtype: int64

**Preprocessed data splitting**


In [None]:
X_train_over, X_val_over, y_train_over, y_val_over = train_test_split(
    X_over,
    y_over, 
    stratify = y_over,
    test_size = 0.20)

In [57]:
dt = DecisionTreeClassifier(max_depth = 10)
dt.fit(X_train_over, y_train_over)
y_pred = dt.predict(X_val_over)

In [58]:
recall_score(y_val_over,y_pred)

0.6694252873563218

In [59]:
accuracy_score(y_val_over,y_pred)

0.7390804597701149

In [60]:
precision_score(y_val_over,y_pred)

0.7777777777777778

In [49]:
f1_score(y_val_over,y_pred)

0.7213033818810171

In [42]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train_over, y_train_over)
y_pred = rf.predict(X_val_over)

In [43]:
recall_score(y_val_over,y_pred)

0.7567816091954023

In [44]:
accuracy_score(y_val_over,y_pred)

0.7806896551724138

In [45]:
f1_score(y_val_over,y_pred)

0.7753179463024022

In [None]:
lr = LogisticRegression()
lr.fit(X_train_over, y_train_over)
y_pred = lr.predict(X_val_over)

In [None]:
recall_score(y_val_over,y_pred)

0.6372413793103449

In [None]:
accuracy_score(y_val_over,y_pred)

0.6841379310344827

In [None]:
f1_score(y_val_over,y_pred)

0.6685962373371925