In [161]:
# Importing all the required libraries

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier

In [162]:
# Read the raw training data; the CSV is looked up relative to the working directory
train = pd.read_csv(filepath_or_buffer='aug_train.csv')

In [163]:
# Remove the columns that are not fed to the model:
#   - 'enrollee_id' is not required in the model
#   - 'city' is removed alongside it
# Reassigning (rather than dropping in place) keeps `train` bound to the reduced frame.
train = train.drop(columns=['enrollee_id', 'city'])

### Separating independent and dependent variables

In [164]:
# Dependent variable: the 'target' column
y = train['target']

# Independent variables: every column except 'target'
X = train.drop('target', axis=1)

In [165]:
# Encoding categorical variables (one-hot, dense output).
#
# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4.
# Try the current spelling first; an installation that predates it rejects the
# unknown keyword with TypeError, in which case the legacy spelling is used.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

X_cats = encoder.fit_transform(X[['gender','relevent_experience',
                                  'enrolled_university','education_level',
                                  'major_discipline','company_type',
                                  'last_new_job','experience','company_size']])

# Carry X's index over so that an index-aligned join with X's remaining columns
# matches rows correctly even when X does not have a default 0..n-1 index.
# Column labels are left as positional integers.
X_cats = pd.DataFrame(X_cats, index=X.index)

In [166]:
# Combine the non-encoded columns of X with the one-hot block in X_cats.
# First strip out the columns that were one-hot encoded; a missing column
# raises KeyError here.
X_numerical = X.drop(
    ['gender', 'relevent_experience',
     'enrolled_university', 'education_level',
     'major_discipline', 'company_type',
     'last_new_job', 'experience', 'company_size'],
    axis=1,
)
col_names = X_numerical.columns

# DataFrame.join aligns on the index; X is rebound to the combined frame.
X = X_numerical.join(X_cats)

In [167]:
# Replace every remaining missing value in the feature matrix with 0.
# NOTE(review): the original justification was that the target has far more
# "0" than "1"; that is a property of the label, not of the features being
# filled here — confirm that 0 is a sensible fill value for these columns.
X = X.fillna(0)

In [168]:
# Handle class imbalance by oversampling with SMOTE
from imblearn.over_sampling import SMOTE

# X mixes string and integer column labels at this point; make them all strings
X.columns = [str(label) for label in X.columns]

smote = SMOTE(random_state=0)

# NOTE(review): this resamples the full (X, y), i.e. before any train/test
# split; any hold-out evaluation should be done on rows that were not resampled.
X_smote, y_smote = smote.fit_resample(X, y)


In [169]:
# Splitting data into training and testing.
#
# The split is taken from the original rows (X, y) and SMOTE is then applied
# to the training part only. Splitting the already-resampled X_smote / y_smote
# instead leaks information: synthetic points generated from rows that land in
# the test set end up in the training set, and the test set itself contains
# synthetic rows, so scores measured on it are optimistic.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=101)

# Oversample the minority class within the training rows only; X_test / y_test
# stay untouched, real observations.
X_train, y_train = smote.fit_resample(X_train_orig, y_train_orig)

#### From the above output, we can conclude that RandomForest is the best algorithm to go forward

In [172]:
# Fit the random forest on the training split (X_train / y_train).
# NOTE(review): this step was previously described as running on the entire
# data, but only the training split is passed to fit() — confirm the intent.
#
# Earlier hyper-parameter set, kept for reference:
#model = RandomForestClassifier(n_estimators=800,
#                               min_samples_split=10,
#                               min_samples_leaf=2, 
#                               max_depth=30)
#
# max_features="sqrt" is what "auto" meant for classifiers. "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it is rejected as an
# invalid parameter; "sqrt" selects the same behaviour on old and new versions.
model = RandomForestClassifier(
    n_estimators=800,
    oob_score=True,       # also compute the out-of-bag score during fitting
    n_jobs=-1,            # use all available cores
    random_state=50,
    max_features="sqrt",
    max_leaf_nodes=30,
)
model.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=30, n_estimators=800, n_jobs=-1,
                       oob_score=True, random_state=50)

In [173]:
import pickle

# Serialise the fitted estimator to 'model_hr.pkl' in the working directory.
# Only `model` is written; the one-hot encoding applied to the features is not
# part of this file.
with open("model_hr.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)