In [60]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
import xgboost
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as graph
import plotly.tools as tool
%matplotlib inline

ModuleNotFoundError: No module named 'imblearn'

# Load dataset

In [15]:
dataset = pd.read_csv('/Users/sandeepmishra/Downloads/Qt4_csv.csv')

In [16]:
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,No,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,Yes,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,Yes,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Checking null values

In [17]:
dataset.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

## Extracting numeric and categorical columns

### Preparing target column

In [39]:
class_attrition = {'Yes':1, 'No':0}
target = dataset["Attrition"].apply(lambda x: class_attrition[x])
dataset["Class_Attrition"] = target

In [33]:
all_columns = dataset.columns
all_columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Class_Attrition'],
      dtype='object')

In [34]:
num_columns = dataset._get_numeric_data().columns
num_columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Class_Attrition'],
      dtype='object')

In [35]:
cat_columns = list(set(all_columns) - set(num_columns))
cat_columns

['JobRole',
 'Department',
 'BusinessTravel',
 'Attrition',
 'OverTime',
 'MaritalStatus',
 'Gender',
 'EducationField',
 'Over18']

# Checking correlation fo features

In [36]:
graphobj = graph.Heatmap(z= dataset[num_columns].astype(float).corr().values,x=dataset[num_columns].columns.values,y=dataset[num_columns].columns.values,colorscale='Viridis',reversescale = False,text = True ,opacity = 1.0)
graph_data = [graphobj]
graph_layout = go.Layout(title='Pearson Correlation of numerical features',xaxis = dict(ticks='', nticks=36),yaxis = dict(ticks='' ),width = 900, height = 700,)

figure = go.Figure(data=graph_data, layout=graph_layout)
py.iplot(figure, filename='feature-labelled')


## Selecting features and onehot assignment of categorical features and cancatination of numerical and categorical features

In [40]:
cat_dataset = dataset[cat_columns]
cat_dataset = cat_dataset.drop(['Attrition'], axis=1)

cat_dataset = pd.get_dummies(cat_dataset)
cat_dataset.head(5)

Unnamed: 0,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,Department_Human Resources,...,MaritalStatus_Single,Gender_Female,Gender_Male,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Over18_Y
0,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [41]:
num_dataset = dataset[num_columns]
num_dataset = num_dataset.drop(['Class_Attrition'], axis=1)
num_dataset.head(5)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2


In [52]:
filtered_dataset = pd.concat([num_dataset, cat_dataset], axis=1)

## Target class analysis 

In [50]:
target_graph = graph.Bar(x=dataset["Attrition"].value_counts().index.values,y= dataset["Attrition"].value_counts().values)
py.iplot([target_graph], filename='target_class_ration')


#### Need to do oversampling as target class is very skewed

## Model implementation

In [53]:
train_dataset, test_dataset, target_train, target_val = train_test_split(filtered_dataset, target, train_size= 0.7,random_state=3);

In [59]:
over_sampling = SMOTE(random_state=0);
smote_train_dataset, smote_target_dataset = oversampler.fit_sample(train_dataset,target_train);

NameError: name 'SMOTE' is not defined

### Randome forest classifier

In [56]:
random_forest = RandomForestClassifier()
random_forest.fit(train_dataset, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
random_forest_predictions = random_forest.predict(test_dataset)
accuracy_score(target_val, random_forest_predictions)

0.9160997732426304

In [62]:
!pip install imblearn
from imblearn.over_sampling import SMOTE

Collecting imblearn
  Using cached https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Installing collected packages: imblearn
Successfully installed imblearn-0.0


ModuleNotFoundError: No module named 'imblearn'