In [6]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sb

## Read in our Dataset

In [7]:
train = pd.read_csv ("train.csv")
test = pd.read_csv ("test.csv")

In [8]:
train.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [9]:
train["Gender"].unique()

array(['Female', 'Male'], dtype=object)

In [10]:
train.shape

(38312, 19)

In [11]:
train.dtypes

EmployeeNo                              object
Division                                object
Qualification                           object
Gender                                  object
Channel_of_Recruitment                  object
Trainings_Attended                       int64
Year_of_birth                            int64
Last_performance_score                 float64
Year_of_recruitment                      int64
Targets_met                              int64
Previous_Award                           int64
Training_score_average                   int64
State_Of_Origin                         object
Foreign_schooled                        object
Marital_Status                          object
Past_Disciplinary_Action                object
Previous_IntraDepartmental_Movement     object
No_of_previous_employers                object
Promoted_or_Not                          int64
dtype: object

In [12]:
train['EmployeeNo'].nunique()

38312

In [13]:
tst_id = test['EmployeeNo']

train.drop('EmployeeNo',axis=1, inplace=True)
test.drop('EmployeeNo',axis=1, inplace=True)


## ANALYZING ALL THE CATEGORICAL FEATURES

In [14]:
cat_cols=list(train.select_dtypes(include="object").columns)
num_cols=list(train.select_dtypes(include="object").columns)

In [15]:
cat_cols

['Division',
 'Qualification',
 'Gender',
 'Channel_of_Recruitment',
 'State_Of_Origin',
 'Foreign_schooled',
 'Marital_Status',
 'Past_Disciplinary_Action',
 'Previous_IntraDepartmental_Movement',
 'No_of_previous_employers']

In [16]:
for cat in cat_cols:
    print(cat)
    print(train[cat].nunique())
    print("-----------------------")
    

Division
9
-----------------------
Qualification
3
-----------------------
Gender
2
-----------------------
Channel_of_Recruitment
3
-----------------------
State_Of_Origin
37
-----------------------
Foreign_schooled
2
-----------------------
Marital_Status
3
-----------------------
Past_Disciplinary_Action
2
-----------------------
Previous_IntraDepartmental_Movement
2
-----------------------
No_of_previous_employers
7
-----------------------


## Label encoding 3 feature: Division, State of origin, and No of previous employee

In [17]:
labels_cols = ["Division", "State_Of_Origin", "No_of_previous_employers"]

In [18]:
#join train and test set
ntrain = train.shape[0]
ntest = test.shape[0]

combine_data = pd.concat([train, test],ignore_index=True, sort =False)

In [19]:
test.shape

(16496, 17)

## Label encoding using label encoder

In [20]:
from sklearn.preprocessing import LabelEncoder
#create an instance 
lb = LabelEncoder() #create an object before using

for cat in labels_cols: #looping each of the categorical features in te label_cols
    #perform the actual encoding by using fit_transform
    combine_data[cat] = lb.fit_transform(combine_data[cat]) 

In [21]:
combine_data.head()

Unnamed: 0,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,1,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,3,No,Married,No,No,0,0.0
1,2,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,3,Yes,Married,No,No,0,0.0
2,1,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,20,Yes,Married,No,No,0,0.0
3,1,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,26,Yes,Single,No,No,1,0.0
4,4,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,2,Yes,Married,No,No,1,0.0


## One hot enconding the other cats

In [22]:
combine_data = pd.get_dummies(combine_data)

In [23]:
combine_data.head()

Unnamed: 0,Division,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,No_of_previous_employers,...,Channel_of_Recruitment_Referral and Special candidates,Foreign_schooled_No,Foreign_schooled_Yes,Marital_Status_Married,Marital_Status_Not_Sure,Marital_Status_Single,Past_Disciplinary_Action_No,Past_Disciplinary_Action_Yes,Previous_IntraDepartmental_Movement_No,Previous_IntraDepartmental_Movement_Yes
0,1,2,1986,12.5,2011,1,0,41,3,0,...,0,1,0,1,0,0,1,0,1,0
1,2,2,1991,12.5,2015,0,0,52,3,0,...,0,0,1,1,0,0,1,0,1,0
2,1,2,1987,7.5,2012,0,0,42,20,0,...,0,0,1,1,0,0,1,0,1,0
3,1,3,1982,2.5,2009,0,0,42,26,1,...,0,0,1,0,0,1,1,0,1,0
4,4,3,1990,7.5,2012,0,0,77,2,1,...,0,0,1,1,0,0,1,0,1,0


## spliting back our data 

In [24]:
train = combine_data[0:ntrain]
test = combine_data[ntrain:]


In [25]:
train.shape, test.shape

((38312, 28), (16496, 28))

## Getting and dropping our Target (Promoted or Not)

In [26]:
p_target = train["Promoted_or_Not"]
train.drop("Promoted_or_Not", axis=1, inplace=True)
test.drop("Promoted_or_Not", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


cutting out data for testing with our target/result

In [27]:
#Split data into local train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, p_target, test_size=0.3)

In [28]:
X_train.shape, X_test.shape, y_test.shape, y_train.shape

((26818, 27), (11494, 27), (11494,), (26818,))

## MODELLING

In [29]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score

In [30]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
pred = rfc.predict(X_test)

In [32]:
f1_score(pred, y_test)

0.3277870216306156

In [33]:
gbc = GradientBoostingClassifier(n_estimators=700, max_depth=5)
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=700,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [34]:
pred = gbc.predict(X_test)

In [35]:
f1_score(pred, y_test)

0.49964513839602553

# Final training and prediction

In [36]:
gbc.fit(train, p_target)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=700,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [37]:
FinalPredtn = gbc.predict(test)

In [38]:
FinalPredtn

array([0., 0., 0., ..., 1., 0., 0.])

In [39]:
sample_sub = pd.read_csv("Silas_Godwin_AAU.csv")

FileNotFoundError: [Errno 2] File b'Silas_Godwin_AAU.csv' does not exist: b'Silas_Godwin_AAU.csv'

In [None]:
sample_sub.head()

In [None]:
sample_sub['EmployeeNo'] = tst_id
sample_sub['Promoted_or_Not'] = FinalPredtn.astype('int64')

In [None]:
sample_sub.head()

In [None]:
sample_sub.to_csv('Silas_Godwin_AAU.csv', index=False)