In [262]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline
sns.set_style('white')

# About the Data

Context
The data pertains to the recruitment industry in India for the years 2014-2016 and deals with candidate interview attendance for various clients. The details are largely self explanatory.

Content
The data have been collected by me and my fellow researchers over a period of over 2 years between September 2014 and January 2017.

There are a set of questions that are asked by a recruiter while scheduling the candidate. The answers to these determine whether expected attendance is yes, no or uncertain. 

The dataset can be found at https://www.kaggle.com/vishnusraghavan/the-interview-attendance-problem

# Preparing the Data

In [263]:
df = pd.read_csv("Interview.csv")

In [264]:
df.head()

Unnamed: 0,Date of Interview,Client name,Industry,Location,Position to be closed,Nature of Skillset,Interview Type,Name(Cand ID),Gender,Candidate Current Location,...,Are you clear with the venue details and the landmark.,Has the call letter been shared,Expected Attendance,Observed Attendance,Marital Status,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,13.02.2015,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 1,Male,Chennai,...,Yes,Yes,Yes,No,Single,,,,,
1,13.02.2015,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 2,Male,Chennai,...,Yes,Yes,Yes,No,Single,,,,,
2,13.02.2015,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 3,Male,Chennai,...,,,Uncertain,No,Single,,,,,
3,13.02.2015,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 4,Male,Chennai,...,Yes,Yes,Uncertain,No,Single,,,,,
4,13.02.2015,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 5,Male,Chennai,...,Yes,Yes,Uncertain,No,Married,,,,,


In [265]:
df = df.drop(['Date of Interview', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27'], 
             axis='columns')
df.head()

Unnamed: 0,Client name,Industry,Location,Position to be closed,Nature of Skillset,Interview Type,Name(Cand ID),Gender,Candidate Current Location,Candidate Job Location,...,Have you obtained the necessary permission to start at the required time,Hope there will be no unscheduled meetings,Can I Call you three hours before the interview and follow up on your attendance for the interview,Can I have an alternative number/ desk number. I assure you that I will not trouble you too much,Have you taken a printout of your updated resume. Have you read the JD and understood the same,Are you clear with the venue details and the landmark.,Has the call letter been shared,Expected Attendance,Observed Attendance,Marital Status
0,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 1,Male,Chennai,Hosur,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
1,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 2,Male,Chennai,Bangalore,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
2,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 3,Male,Chennai,Chennai,...,,Na,,,,,,Uncertain,No,Single
3,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 4,Male,Chennai,Chennai,...,Yes,Yes,No,Yes,No,Yes,Yes,Uncertain,No,Single
4,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 5,Male,Chennai,Bangalore,...,Yes,Yes,Yes,No,Yes,Yes,Yes,Uncertain,No,Married


In [266]:
df.columns = ['client_name', 'industry', 'location', 'position', 'skillset_nature', 'interview_type', 'candidate_id',
             'gender', 'cand_current_loc', 'cand_job_loc', 'interview_venue', 'candidate_native_loc', 'q1', 'q2',
             'q3', 'q4', 'q5', 'q6', 'q7', 'expected_attendance', 'observed_attendance', 'marital_status']
df.head()

Unnamed: 0,client_name,industry,location,position,skillset_nature,interview_type,candidate_id,gender,cand_current_loc,cand_job_loc,...,q1,q2,q3,q4,q5,q6,q7,expected_attendance,observed_attendance,marital_status
0,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 1,Male,Chennai,Hosur,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
1,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 2,Male,Chennai,Bangalore,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
2,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 3,Male,Chennai,Chennai,...,,Na,,,,,,Uncertain,No,Single
3,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 4,Male,Chennai,Chennai,...,Yes,Yes,No,Yes,No,Yes,Yes,Uncertain,No,Single
4,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 5,Male,Chennai,Bangalore,...,Yes,Yes,Yes,No,Yes,Yes,Yes,Uncertain,No,Married


In [267]:
df.fillna("no", inplace=True)
df.head()

Unnamed: 0,client_name,industry,location,position,skillset_nature,interview_type,candidate_id,gender,cand_current_loc,cand_job_loc,...,q1,q2,q3,q4,q5,q6,q7,expected_attendance,observed_attendance,marital_status
0,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 1,Male,Chennai,Hosur,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
1,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 2,Male,Chennai,Bangalore,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Single
2,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 3,Male,Chennai,Chennai,...,no,Na,no,no,no,no,no,Uncertain,No,Single
3,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 4,Male,Chennai,Chennai,...,Yes,Yes,No,Yes,No,Yes,Yes,Uncertain,No,Single
4,Hospira,Pharmaceuticals,Chennai,Production- Sterile,Routine,Scheduled Walkin,Candidate 5,Male,Chennai,Bangalore,...,Yes,Yes,Yes,No,Yes,Yes,Yes,Uncertain,No,Married


In [268]:
df = df.apply(lambda x: x.astype(str).str.lower())
df = df.apply(lambda x: x.astype(str).str.strip())
df.head()

Unnamed: 0,client_name,industry,location,position,skillset_nature,interview_type,candidate_id,gender,cand_current_loc,cand_job_loc,...,q1,q2,q3,q4,q5,q6,q7,expected_attendance,observed_attendance,marital_status
0,hospira,pharmaceuticals,chennai,production- sterile,routine,scheduled walkin,candidate 1,male,chennai,hosur,...,yes,yes,yes,yes,yes,yes,yes,yes,no,single
1,hospira,pharmaceuticals,chennai,production- sterile,routine,scheduled walkin,candidate 2,male,chennai,bangalore,...,yes,yes,yes,yes,yes,yes,yes,yes,no,single
2,hospira,pharmaceuticals,chennai,production- sterile,routine,scheduled walkin,candidate 3,male,chennai,chennai,...,no,na,no,no,no,no,no,uncertain,no,single
3,hospira,pharmaceuticals,chennai,production- sterile,routine,scheduled walkin,candidate 4,male,chennai,chennai,...,yes,yes,no,yes,no,yes,yes,uncertain,no,single
4,hospira,pharmaceuticals,chennai,production- sterile,routine,scheduled walkin,candidate 5,male,chennai,bangalore,...,yes,yes,yes,no,yes,yes,yes,uncertain,no,married


In [269]:
from sklearn.preprocessing import LabelEncoder

In [270]:
df.observed_attendance.value_counts()

yes    783
no     451
Name: observed_attendance, dtype: int64

In [271]:
le_client_name = LabelEncoder()
le_industry = LabelEncoder()
le_location = LabelEncoder()
le_position = LabelEncoder()
le_skillset_nature = LabelEncoder()
le_interview_type = LabelEncoder()
le_candidate_id = LabelEncoder()
le_gender = LabelEncoder()
le_cand_current_loc = LabelEncoder()
le_cand_job_loc = LabelEncoder()
le_interview_venue = LabelEncoder()
le_candidate_native_loc = LabelEncoder()
le_q1 = LabelEncoder()
le_q2 = LabelEncoder()
le_q3 = LabelEncoder()
le_q4 = LabelEncoder()
le_q5 = LabelEncoder()
le_q6 = LabelEncoder()
le_q7 = LabelEncoder()
le_expected_attendance = LabelEncoder()
le_observed_attendance = LabelEncoder()
le_marital_status = LabelEncoder()


In [272]:
df['client_name_n'] = le_client_name.fit_transform(df['client_name'])
df['industry_n'] = le_industry.fit_transform(df['industry'])
df['location_n'] = le_location.fit_transform(df['location'])
df['position_n'] = le_position.fit_transform(df['position'])
df['skillset_nature_n'] = le_skillset_nature.fit_transform(df['skillset_nature'])
df['interview_type_n'] = le_interview_type.fit_transform(df['interview_type'])
df['candidate_id_n'] = le_candidate_id.fit_transform(df['client_name'])
df['gender_n'] = le_gender.fit_transform(df['gender'])
df['cand_current_loc_n'] = le_cand_current_loc.fit_transform(df['cand_current_loc'])
df['cand_job_loc_n'] = le_cand_job_loc.fit_transform(df['cand_job_loc'])
df['interview_venue_n'] = le_interview_venue.fit_transform(df['interview_venue'])
df['candidate_native_loc_n'] = le_candidate_native_loc.fit_transform(df['candidate_native_loc'])
df['q1_n'] = le_q1.fit_transform(df['q1'])
df['q2_n'] = le_q2.fit_transform(df['q2'])
df['q3_n'] = le_q3.fit_transform(df['q3'])
df['q4_n'] = le_q4.fit_transform(df['q4'])
df['q5_n'] = le_q5.fit_transform(df['q5'])
df['q6_n'] = le_q6.fit_transform(df['q6'])
df['q7_n'] = le_q7.fit_transform(df['q7'])
df['expected_attendance_n'] = le_expected_attendance.fit_transform(df['expected_attendance'])
df['observed_attendance_n'] = le_observed_attendance.fit_transform(df['observed_attendance'])
df['marital_status_n'] = le_marital_status.fit_transform(df['marital_status'])


In [274]:
jobs = df.drop(['client_name', 'industry', 'location', 'position', 'skillset_nature', 'interview_type', 'candidate_id',
             'gender', 'cand_current_loc', 'cand_job_loc', 'interview_venue', 'candidate_native_loc', 'q1', 'q2',
             'q3', 'q4', 'q5', 'q6', 'q7', 'expected_attendance', 'observed_attendance', 'marital_status'], 
               axis='columns')
jobs.head(1)

Unnamed: 0,client_name_n,industry_n,location_n,position_n,skillset_nature_n,interview_type_n,candidate_id_n,gender_n,cand_current_loc_n,cand_job_loc_n,...,q1_n,q2_n,q3_n,q4_n,q5_n,q6_n,q7_n,expected_attendance_n,observed_attendance_n,marital_status_n
0,7,6,2,4,64,4,7,1,2,4,...,3,4,3,3,4,3,6,4,0,2


In [276]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = jobs.drop('observed_attendance_n', 1)

Y = jobs['observed_attendance_n']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

# Vanilla Logistic Regression

In [316]:
import statsmodels.api as sm
# Declare predictors.
X_statsmod = jobs.drop('observed_attendance_n', 1)

# The Statsmodels formulation requires a column with constant value 1 that
# will act as the intercept.
X_statsmod['intercept'] = 1 

# Declare and fit the model.
logit = sm.Logit(jobs['observed_attendance_n'], X_statsmod)
result = logit.fit(method='bfgs')

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
print(result.summary())

         Current function value: 0.560628
         Iterations: 35
         Function evaluations: 39
         Gradient evaluations: 39
                             Logit Regression Results                            
Dep. Variable:     observed_attendance_n   No. Observations:                 1234
Model:                             Logit   Df Residuals:                     1213
Method:                              MLE   Df Model:                           20
Date:                   Sun, 05 May 2019   Pseudo R-squ.:                  0.1460
Time:                           20:04:38   Log-Likelihood:                -691.82
converged:                         False   LL-Null:                       -810.13
                                           LLR p-value:                 5.614e-39
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
client_name_n             -0



In [315]:
# Calculate accuracy. First, get probability that each row will be admitted.
pred_statsmod = result.predict(X_statsmod)

# Code admission as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Accuracy table.
table = pd.crosstab(jobs['observed_attendance_n'], pred_y_statsmod)

print('\n Model Accuracy')
print(table)
print('\n Percentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))


 Model Accuracy
col_0                    0    1
observed_attendance_n          
0                      186  265
1                       89  694

 Percentage accuracy
0.713128038897893


# Ridge Regression


In [297]:
rr = LogisticRegression(penalty='l2')

# Fit the model.
fit = rr.fit(X, Y)

print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Model Accuracy')
print(pd.crosstab(pred_y_sklearn, Y))

print('\n Percentage accuracy')
print(rr.score(X, Y))

Coefficients
[[-6.38345410e-02 -1.50939856e-01  9.63954564e-01 -2.68305180e-01
   2.84256406e-04 -2.18076868e-01 -6.38345410e-02 -8.33723679e-02
  -1.07017736e+00  2.71490176e-01 -3.32144973e-01  3.27450094e-03
   4.33204752e-02  1.36613284e-01 -3.61310706e-01  4.46849030e-02
   1.09744775e-01 -3.02093231e-01  9.01453324e-02  1.51339168e+00
  -3.12362487e-03]]
[-1.19965552]

 Model Accuracy
observed_attendance_n    0    1
row_0                          
0                      188   89
1                      263  694

 Percentage accuracy
0.7155591572123177




# Lasso Regression

In [298]:
lasr = LogisticRegression(penalty='l1')

# Fit the model.
fit = lasr.fit(X, Y)

print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lasr.predict(X)

print('\n Model Accuracy')
print(pd.crosstab(pred_y_sklearn, Y))

print('\n Percentage accuracy')
print(lasr.score(X, Y))

Coefficients
[[-5.06356897e-02 -1.40869870e-01  8.01413506e-01 -2.57639728e-01
   5.91110173e-05 -2.10787616e-01 -6.55886523e-02 -5.54922518e-02
  -8.88400242e-01  2.33553482e-01 -2.75107385e-01  3.34336537e-03
   0.00000000e+00  7.58642394e-02 -2.26687534e-01  0.00000000e+00
   4.41042813e-02 -1.93869175e-01  6.84809326e-02  1.57466077e+00
   0.00000000e+00]]
[-1.55207034]

 Model Accuracy
observed_attendance_n    0    1
row_0                          
0                      185   86
1                      266  697

 Percentage accuracy
0.7147487844408428




# Conclusion

All 3 Logistic Regression Models had roughly the same accuracy of 71-72%. The most accurate was Ridge Regression with 71.5% then Lasso Regression with 71.4% and finally, Vanilla Logistic Regession with 71.3%.