#### <p style="text-align: center; font-size: 20px">1. Setup</p>

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, log_loss, plot_roc_curve, auc, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white", color_codes=True)
sns.set_style('ticks')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

#### <p style="text-align: center; font-size: 20px">2. Loading data</p>

In [None]:
# Read the three CSV files from the Kaggle input directory:
# the labelled training set, the unlabelled test set and the submission template.
train = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
sample_submission = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')

#### <p style="text-align: center; font-size: 20px">3. Exploratory Data Analysis</p>

<b>3.1 General Analysis</b>

In [None]:
# Report the (rows, columns) shape of every loaded frame.
for label, frame in (
    ('Rows and Columns in train dataset:', train),
    ('Rows and Columns in test dataset:', test),
    ('The shape of sample submission dataset: ', sample_submission),
):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Preview the submission template.
sample_submission.head()

In [None]:
# Total number of missing cells per dataset (per-column counts summed up).
train_missing = train.isnull().sum().sum()
test_missing = test.isnull().sum().sum()
print('Missing value in train dataset:', train_missing)
print('Missing value in test dataset:', test_missing)

<b>3.2 Features and Target Distribution</b>

Target 
- 0 - Not looking for job change
- 1 – Looking for a job change
  
As you can see, here we have imbalanced data.

In [None]:
# Class balance of the target column.
target_counts = train.target.value_counts()

plt.figure(figsize=(6, 4))
# x/y are passed by keyword: recent seaborn releases no longer accept
# positional data vectors.
sns.barplot(x=target_counts.index.astype(int),
            y=target_counts.values, palette='bwr')
plt.ylabel('Number of rows', fontsize=12)
plt.xlabel('Target', fontsize=12)
plt.show();

Gender

In [None]:
# Gender counts, split by target class.
plt.figure(figsize=(6, 4))
# The column is passed as x= explicitly; recent seaborn releases treat the
# first positional argument as `data`, not as the x vector.
sns.countplot(x=train.gender, hue=train.target.astype(int), palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Gender', fontsize=12)
plt.show();

Experience

<span style="font-size:10px">Experience in years</span>

In [None]:
# Relevant-experience flag counts, split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.relevent_experience, hue=train.target.astype(int),
              palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Experience', fontsize=12)
plt.show();

In [None]:
# Raw experience values, ordered by frequency and split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.experience, hue=train.target.astype(int), palette='bwr',
              order=train.experience.value_counts().index)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Experience', fontsize=12)  # label typo ("Experienca") fixed
plt.xticks(rotation=45)
plt.show();

In [None]:
# Lookup table from a raw experience string to its bucket label.
_EXPERIENCE_BUCKETS = {'<1': '<1', '>20': '>20'}
_EXPERIENCE_BUCKETS.update({str(years): '1-10' for years in range(1, 11)})
_EXPERIENCE_BUCKETS.update({str(years): '11-20' for years in range(11, 21)})


def experience(x):
    """Map a raw experience value to a coarse bucket.

    Parameters
    ----------
    x : object
        Raw value of the ``experience`` column: one of the strings
        ``'<1'``, ``'1'`` ... ``'20'`` or ``'>20'``.

    Returns
    -------
    str or None
        ``'<1'``, ``'1-10'``, ``'11-20'`` or ``'>20'``. Anything else
        (including missing values such as NaN) yields ``None``.
    """
    # Non-string inputs (e.g. NaN floats) never matched any branch of the
    # original comparison chain, so they keep mapping to None.
    if not isinstance(x, str):
        return None
    return _EXPERIENCE_BUCKETS.get(x)

# Bucketed copy used only for plotting; `train` itself keeps the raw values.
train_experince = train.assign(experience=train['experience'].apply(experience))

In [None]:
# Bucketed experience, ordered by frequency and split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train_experince.experience, hue=train.target.astype(int),
              palette='bwr',
              order=train_experince.experience.value_counts().index)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Experience', fontsize=12)  # label typo ("Experienca") fixed
plt.xticks(rotation=45)
plt.show();

Enrolled university

In [None]:
# University enrolment counts, split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.enrolled_university, hue=train.target.astype(int),
              palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Enrolled', fontsize=12)
plt.show();

Education level

In [None]:
# Education-level counts, split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.education_level, hue=train.target.astype(int),
              palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Education', fontsize=12)
plt.show();

In [None]:
# Donut chart of the education-level shares.

# Count once and derive both labels and wedge sizes from the same result.
education_counts = train.education_level.value_counts()
labels = education_counts.index
sizes = education_counts

# Wedge colours (matplotlib cycles through them if there are more wedges).
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#f0f8ff']

# Pull every wedge slightly outwards. The list is sized from the data because
# matplotlib requires exactly one offset per wedge.
explode = [0.05] * len(sizes)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
        startangle=10, pctdistance=0.85, explode=explode,
        labeldistance=1.1)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.75, fc='white')
ax1.add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
fig1.tight_layout()
plt.show()

Major discipline

In [None]:
# Major-discipline counts, split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.major_discipline, hue=train.target.astype(int),
              palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Major discipline', fontsize=12)
plt.xticks(rotation=45)
plt.show();

Company type

In [None]:
# Company-type counts, split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.company_type, hue=train.target.astype(int),
              palette='bwr')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Company type', fontsize=12)
plt.xticks(rotation=45)
plt.show();

Company size

In [None]:
# Company-size counts, ordered by frequency and split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.company_size, palette='bwr',
              hue=train.target.astype(int),
              order=train.company_size.value_counts().index)
plt.ylabel('Count', fontsize=12)
# The axis shows company size; the previous label ("Company type") was a
# copy-paste leftover from the preceding plot.
plt.xlabel('Company size', fontsize=12)
plt.xticks(rotation=45)
plt.show();

Number of years between last and current job

In [None]:
# Donut chart of the `last_new_job` shares.

# Count once and derive both labels and wedge sizes from the same result.
last_job_counts = train.last_new_job.value_counts()
labels = last_job_counts.index
sizes = last_job_counts

# Wedge colours (matplotlib cycles through them if there are more wedges).
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#f0f8ff', '#E38C79']

# Pull every wedge slightly outwards. The list is sized from the data because
# matplotlib requires exactly one offset per wedge.
explode = [0.05] * len(sizes)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
        startangle=10, pctdistance=0.85, explode=explode,
        labeldistance=1.1)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.75, fc='white')
ax1.add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
fig1.tight_layout()
plt.show()

In [None]:
# `last_new_job` counts, ordered by frequency and split by target class.
plt.figure(figsize=(6, 4))
# x= keyword: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train.last_new_job, palette='bwr',
              hue=train.target.astype(int),
              order=train.last_new_job.value_counts().index)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Number of years', fontsize=12)
plt.xticks(rotation=45)
plt.show();

Training hours

In [None]:
# Distribution of training hours: density-normalised histogram with a KDE curve.
plt.figure(figsize=(16, 5))
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement that draws the same kind of histogram + KDE figure.
sns.histplot(x=train.training_hours, kde=True, stat='density', color='#202020')
plt.title('Distribution of training hours')
plt.xlabel('Hours', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.tight_layout()
plt.show()

#### <p style="text-align: center; font-size: 20px">4. Data Preprocessing</p>

Drop unused columns, bucket experience, and impute missing values

In [None]:
# Columns removed from both frames before modelling.
unused_columns = ['enrollee_id', 'city', 'city_development_index', 'training_hours']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Replace the raw experience values by their coarse bucket labels.
# NOTE: this cell is not idempotent - running it twice fails on the drop.
train['experience'] = train['experience'].apply(experience)
test['experience'] = test['experience'].apply(experience)

In [None]:
# Per-column missing-value counts before imputation.
for heading, frame in (('Train NA values: \n', train),
                       ('Test NA values: \n', test)):
    print(heading, frame.isnull().sum())

In [None]:
def impute_nan_most_frequent_category(DataFrame, ColName):
    """Fill the missing values of one column with its most frequent value.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame holding the column to impute.
    ColName : str
        Name of the column to impute.
    """
    modes = DataFrame[ColName].mode()

    # A column without any observed value has no mode; leave it untouched
    # instead of failing on the lookup below.
    if modes.empty:
        return

    # .mode() may return several values on ties; take the first one.
    most_frequent_category = modes.iloc[0]

    # Assign the filled column back to the frame. Calling
    # `fillna(..., inplace=True)` on `DataFrame[ColName]` operates on an
    # intermediate object and does not update the frame under pandas
    # Copy-on-Write.
    DataFrame[ColName] = DataFrame[ColName].fillna(most_frequent_category)

In [None]:
# Impute every object-typed column, each frame using its own most frequent value.
for frame in (train, test):
    for col in frame.select_dtypes(include="object").columns:
        impute_nan_most_frequent_category(frame, col)

In [None]:
# Per-column missing-value counts after imputation.
for heading, frame in (('Train NA values: \n', train),
                       ('Test NA values: \n', test)):
    print(heading, frame.isnull().sum())

Label Encoding

In [None]:
# Dtypes and non-null counts of the training frame at this point of the notebook.
train.info()

In [None]:
# Dtypes and non-null counts of the test frame at this point of the notebook.
test.info()

In [None]:
# Names of the object-typed columns of `train`; these are the ones label-encoded below.
objList = train.select_dtypes(include = "object").columns
print('Categorical columns: ', objList)

In [None]:
# Label-encode every categorical column with ONE mapping shared by train and
# test. Refitting the encoder on the test frame (as was done before) can give
# the same category different integer codes in the two frames whenever their
# category sets differ, which silently corrupts the test features.
train_encoded = train.copy()
test_encoded = test.copy()
for feat in objList:
    le = preprocessing.LabelEncoder()
    train_values = train[feat].astype(str)
    test_values = test[feat].astype(str)
    # Fit on the union so that categories present in only one frame are known.
    le.fit(pd.concat([train_values, test_values]))
    train_encoded[feat] = le.transform(train_values)
    test_encoded[feat] = le.transform(test_values)

In [None]:
# Dtypes of the encoded training frame.
train_encoded.info()

In [None]:
# Dtypes of the encoded test frame.
test_encoded.info()

In [None]:
# Separate the label column from the feature matrix.
y = train_encoded['target']
X = train_encoded.drop(columns=['target'])

#### <p style="text-align: center; font-size: 20px">4. Data Upsampling Using SMOTE</p>

In [None]:
# Class balance of the target before resampling.
class_counts = y.value_counts()

plt.figure(figsize=(6, 4))
# x/y are passed by keyword: recent seaborn releases no longer accept
# positional data vectors.
sns.barplot(x=class_counts.index.astype(int),
            y=class_counts.values, palette='bwr')
plt.ylabel('Number of rows', fontsize=12)
plt.xlabel('Target', fontsize=12)
plt.title('Before sampling')
plt.show()

In [None]:
# Oversample the training data with SMOTE (seeded for reproducibility).
# The previously created, never-used second SMOTE instance was removed.
# NOTE(review): the resampling is applied to the full data set before the
# train/test split further down, so synthetic samples derived from hold-out
# rows can end up in the training part and inflate the reported scores.
# Consider splitting first and resampling only the training part.
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# Class balance of the target after SMOTE resampling.
resampled_counts = y_smote.value_counts()

plt.figure(figsize=(6, 4))
# x/y are passed by keyword: recent seaborn releases no longer accept
# positional data vectors.
sns.barplot(x=resampled_counts.index.astype(int),
            y=resampled_counts.values, palette='bwr')
plt.ylabel('Number of rows', fontsize=12)
plt.xlabel('Target', fontsize=12)
plt.title('After sampling')
plt.show()


#### <p style="text-align: center; font-size: 20px">5. Feature selection</p>

In [None]:
# Score every feature with the ANOVA F-test and list them from best to worst.
from sklearn.feature_selection import SelectKBest, f_classif

# k='all' keeps every feature: the selector is used here only to obtain the
# per-feature scores, not to filter columns.
bestfeatures = SelectKBest(score_func=f_classif, k='all')
fit = bestfeatures.fit(X_smote, y_smote)

# One row per feature: its name and its F-score.
featureScores = pd.DataFrame({'Specs': X_smote.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  # at most the 10 best-scoring features

In [None]:
# Features removed from both the resampled training matrix and the test frame
# (presumably the weakest ones in the score table above - confirm).
dropped_features = ['company_type', 'gender', 'relevent_experience']

X_smote = X_smote.drop(columns=dropped_features)
test_encoded = test_encoded.drop(columns=dropped_features)

In [None]:
# Hold out 20% of the resampled data for evaluation (fixed seed).
split_parts = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

#### <p style="text-align: center; font-size: 20px">6. Model Selection</p>

In [None]:
def _scaled_pipeline(step_name, estimator):
    """Return a pipeline that standardises the features and then fits `estimator`."""
    return Pipeline(steps=[('scale', StandardScaler()), (step_name, estimator)])


# Each estimator step now carries its own name; previously all four were
# labelled "RF", a copy-paste leftover that made the pipelines misleading.
rf_pipe = _scaled_pipeline('rf', RandomForestClassifier(random_state=42))
ada_pipe = _scaled_pipeline('ada', AdaBoostClassifier(random_state=42, learning_rate=0.7))
svm_pipe = _scaled_pipeline('svm', SVC(random_state=42, kernel='rbf'))
xgb_pipe = _scaled_pipeline('xgb', XGBClassifier(random_state=42, n_jobs=2))

# 5-fold cross-validated ROC AUC on the training part for every candidate.
rf_roc_auc_cross_val_scores = cross_val_score(rf_pipe, X_train, y_train, cv=5, scoring='roc_auc')
ada_roc_auc_cross_val_scores = cross_val_score(ada_pipe, X_train, y_train, cv=5, scoring='roc_auc')
svm_roc_auc_cross_val_scores = cross_val_score(svm_pipe, X_train, y_train, cv=5, scoring='roc_auc')
xgb_roc_auc_cross_val_scores = cross_val_score(xgb_pipe, X_train, y_train, cv=5, scoring='roc_auc')

In [None]:
# Report the mean ROC AUC over the folds. The best single fold (max), which
# was printed before, systematically overstates the expected performance.
print('Rndmforest validation score: ', rf_roc_auc_cross_val_scores.mean())
print('Ada validation score: ', ada_roc_auc_cross_val_scores.mean())
print('SVM validation score: ', svm_roc_auc_cross_val_scores.mean())
print('Xgb validation score: ', xgb_roc_auc_cross_val_scores.mean())

In [None]:
# Per-fold ROC AUC of every candidate model, one panel per model.
cv_results = [
    ('Random Forest Cross Val Scores', rf_roc_auc_cross_val_scores),
    ('Adaboost Cross Val Scores', ada_roc_auc_cross_val_scores),
    ('SVM Cross Val Scores', svm_roc_auc_cross_val_scores),
    ('XGB Cross Val Scores', xgb_roc_auc_cross_val_scores),
]

# One row per model. The previous version requested a 3-row grid for four
# models (slot 3 was used twice) and called plt.show() after every panel,
# which emitted four separate, mostly empty figures.
fig, axes = plt.subplots(len(cv_results), 1, figsize=(8, 3 * len(cv_results)))
for ax, (title, scores) in zip(axes, cv_results):
    folds = list(range(len(scores)))
    sns.lineplot(x=folds, y=scores, ax=ax)
    ax.set_title(title)
    ax.set_xticks(folds)
    ax.set_xlabel('Fold Number')
    ax.set_ylabel('Roc_auc Score')

fig.tight_layout()
plt.show()

#### <p style="text-align: center; font-size: 20px">7. Model Evaluation</p>

In [None]:
def _positive_class_scores(fitted_pipe, features):
    """Return a continuous score for the positive class of a fitted pipeline.

    Uses the probability of the second class when the model exposes
    `predict_proba`, otherwise falls back to `decision_function`
    (e.g. SVC without probability estimates).
    """
    if hasattr(fitted_pipe, 'predict_proba'):
        return fitted_pipe.predict_proba(features)[:, 1]
    return fitted_pipe.decision_function(features)


# Fit every candidate on the training part and keep its hard predictions.
rf_pipe.fit(X_train, y_train)
rf_prediction = rf_pipe.predict(X_test)

ada_pipe.fit(X_train, y_train)
ada_prediction = ada_pipe.predict(X_test)

svm_pipe.fit(X_train, y_train)
svm_prediction = svm_pipe.predict(X_test)

xgb_pipe.fit(X_train, y_train)
xgb_prediction = xgb_pipe.predict(X_test)

# roc_auc_score expects (y_true, y_score). The arguments were swapped before,
# and hard 0/1 predictions were passed where continuous scores belong, which
# collapses the ROC curve to a single operating point.
for model_name, fitted_pipe in (('Random Forest', rf_pipe),
                                ('AdaBoost', ada_pipe),
                                ('SVM', svm_pipe),
                                ('XGB', xgb_pipe)):
    test_auc = roc_auc_score(y_test, _positive_class_scores(fitted_pipe, X_test))
    print('Roc_auc Score of {} Model On Test Set - {}'.format(model_name, test_auc))

#### <p style="text-align: center; font-size: 20px">8. Tuning Hyperparameters </p>

In [None]:
# Base estimator whose hyperparameters are tuned.
forest = RandomForestClassifier(random_state=42)

# Candidate values sampled by the randomized search.
params = {
        'n_estimators' : [100, 300, 500, 800, 1200],
        'max_depth' : [5, 8, 15, 25, 30],
        'min_samples_split' : [2, 5, 10, 15, 100],
        'min_samples_leaf' : [1, 2, 5, 10]
        }

# scoring='roc_auc' makes the search optimise the metric the notebook reports
# (the default would be accuracy). random_state makes the sampled candidates
# reproducible, and n_jobs=-1 runs the fits in parallel.
gridF = RandomizedSearchCV(forest, params, cv=5, verbose=1,
                           scoring='roc_auc', random_state=42, n_jobs=-1)

In [None]:
# Run the randomized hyperparameter search on the training part.
clf_grid = gridF.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the search.
clf_grid.best_params_

In [None]:
# Estimator built with the selected hyperparameters.
best_est = clf_grid.best_estimator_

In [None]:
# Hard class predictions of the tuned model on the hold-out part.
y_pred = best_est.predict(X_test)

In [None]:
# roc_auc_score expects (y_true, y_score); the arguments were swapped before.
# The probability of the positive class is used as the score, because hard
# labels reduce the ROC curve to a single operating point.
y_score = best_est.predict_proba(X_test)[:, 1]
print('Roc_auc Score of Random Forest Model On Test Set - {}'.format(roc_auc_score(y_test, y_score)))

#### <p style="text-align: center; font-size: 20px">9. Prediction </p>

In [None]:
# Final model used for the submission.
# NOTE(review): these values are hard-coded and min_samples_split=12 is not
# among the candidates of the randomized search - confirm they are intended
# rather than `clf_grid.best_params_`.
# random_state is fixed so that re-running the notebook reproduces the same
# submission.
model = RandomForestClassifier(n_estimators=800,
                               min_samples_split=12,
                               min_samples_leaf=2,
                               max_depth=15,
                               random_state=42)

In [None]:
# Train the final model on the complete resampled data set.
model.fit(X_smote, y_smote)

In [None]:
# Predict the class of every row of the encoded test frame.
prediction = model.predict(test_encoded)

In [None]:
# Write the predictions with item assignment. Attribute assignment only
# updates a column that already exists; otherwise it sets a plain Python
# attribute and leaves the frame unchanged.
sample_submission['target'] = prediction

In [None]:
# Preview the filled-in submission frame.
sample_submission.head()