In [33]:
import pandas as pd

In [34]:
hr_data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [35]:
hr_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [36]:
hr_data.shape

(1470, 35)

In [37]:
hr_data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [38]:
hr_data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [39]:
df = hr_data.dropna()
print(df)

      Age Attrition     BusinessTravel  DailyRate              Department  \
0      41       Yes      Travel_Rarely       1102                   Sales   
1      49        No  Travel_Frequently        279  Research & Development   
2      37       Yes      Travel_Rarely       1373  Research & Development   
3      33        No  Travel_Frequently       1392  Research & Development   
4      27        No      Travel_Rarely        591  Research & Development   
...   ...       ...                ...        ...                     ...   
1465   36        No  Travel_Frequently        884  Research & Development   
1466   39        No      Travel_Rarely        613  Research & Development   
1467   27        No      Travel_Rarely        155  Research & Development   
1468   49        No  Travel_Frequently       1023                   Sales   
1469   34        No      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  EmployeeCount  \
0       

In [40]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop single-value columns and id columns
    df = df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1)
    
    # Binary-encode binary columns
    df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
    df['OverTime'] = df['OverTime'].replace({'No': 0, 'Yes': 1})
    
    # Ordinal-encode the BusinessTravel column
    df['BusinessTravel'] = df['BusinessTravel'].replace({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})
    
    # One-hot encoding
    for column in ['Department', 'EducationField', 'JobRole', 'MaritalStatus']:
        df = onehot_encode(df, column=column)
    
    # Split df into X and y
    y = df['Attrition']
    X = df.drop('Attrition', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [42]:
X_train, X_test, y_train, y_test = preprocess_inputs(hr_data)

In [43]:
X_test

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
1291,-0.000742,-0.141311,-1.109428,0.112903,1.078220,1.165128,0.843888,-0.387850,0.384204,-0.063108,...,-0.469724,-0.255377,2.982320,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,-0.918742,1.448930
1153,-2.071956,1.722911,-0.681171,-0.764379,-0.892452,-0.669480,-1.184992,0.203698,0.384204,-0.965030,...,-0.469724,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,4.208217,-0.530514,-0.918742,1.448930
720,-0.763821,-0.141311,-1.680436,1.616814,0.092884,-1.586784,-1.184992,-0.880807,0.384204,-0.965030,...,-0.469724,-0.255377,-0.335309,-0.255377,2.017168,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
763,-0.327775,-0.141311,1.260750,0.112903,1.078220,0.247824,-1.184992,1.041724,0.384204,-0.965030,...,-0.469724,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,4.208217,-0.530514,1.088445,-0.690165
976,2.070473,-0.141311,1.349355,1.742140,0.092884,1.165128,0.843888,0.105107,0.384204,1.740736,...,-0.469724,-0.255377,2.982320,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398,-0.436787,1.722911,1.186912,-0.263075,-0.892452,1.165128,0.843888,-1.472354,0.384204,-0.063108,...,-0.469724,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,1.884966,-0.918742,-0.690165
351,-0.545798,-0.141311,0.593753,-0.889705,0.092884,0.247824,-1.184992,0.450176,0.384204,-0.965030,...,2.128910,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
41,-1.090854,-0.141311,1.031854,-0.889705,1.078220,1.165128,-1.184992,-1.620241,0.384204,-0.965030,...,2.128910,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,1.884966,-0.918742,-0.690165
1210,-0.436787,-0.141311,-1.362936,1.491488,0.092884,-0.669480,0.843888,0.647359,1.766532,-0.965030,...,2.128910,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165


In [44]:
y_test

1291    Yes
1153    Yes
720     Yes
763      No
976      No
       ... 
1398     No
351      No
41       No
1210     No
1256     No
Name: Attrition, Length: 441, dtype: object

In [45]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=42);
logreg.fit(X_train, y_train)
y_prediction = logreg.predict(X_test)

In [46]:
y_predict = logreg.predict(X_train) # — — — — — — — — -Prediction here are 0s and 1s
# — — — — — — — to get probability values use this snippet
y_predict_prob = logreg.predict_proba(X_train)
confusion_matrix(y_train,y_predict)
#-----------------------------------Regression Score--------------------------------------
print("Trainig accuracy",logreg.score(X_train,y_train))  
print()
print("Testing accuracy",logreg.score(X_test, y_test))
print()

Trainig accuracy 0.8979591836734694

Testing accuracy 0.8662131519274376



In [47]:
#Therefore final model is
model = LogisticRegression(random_state=42,penalty='l2', class_weight='balanced',C=0.25)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [48]:
y_predict

array(['No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
       'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes',
       'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No',
       'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
       'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
  

In [49]:
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
99,0.762337,-2.005534,-0.816540,1.742140,0.092884,-0.669480,0.843888,0.055811,0.384204,-0.063108,...,2.128910,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
785,0.326292,-0.141311,1.652087,1.366162,1.078220,-1.586784,0.843888,-0.239963,0.384204,0.838814,...,-0.469724,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
918,1.525416,1.722911,-1.436773,-0.012423,0.092884,1.165128,0.843888,0.844541,0.384204,2.642657,...,-0.469724,3.915780,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,1.884966,-0.918742,-0.690165
1335,0.217281,-0.141311,0.035051,1.240836,1.078220,1.165128,0.843888,-1.225876,0.384204,-0.063108,...,-0.469724,-0.255377,-0.335309,-0.255377,2.017168,-0.533521,-0.237630,1.884966,-0.918742,-0.690165
1182,-0.109753,-2.005534,0.180264,-1.015031,1.078220,1.165128,-1.184992,-1.620241,-0.998124,-0.063108,...,-0.469724,-0.255377,2.982320,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,-0.436787,1.722911,0.015361,-1.015031,1.078220,0.247824,-1.184992,0.893837,1.766532,-0.063108,...,-0.469724,-0.255377,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165
905,-0.872832,-0.141311,-0.311985,-1.015031,0.092884,1.165128,-1.184992,1.041724,-0.998124,1.740736,...,-0.469724,-0.255377,-0.335309,3.915780,-0.495745,-0.533521,-0.237630,1.884966,-0.918742,-0.690165
1096,0.326292,-0.141311,0.190109,-0.388401,-0.892452,0.247824,0.843888,-1.373763,0.384204,1.740736,...,-0.469724,3.915780,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,-0.918742,1.448930
235,0.653326,-0.141311,0.524838,0.864859,0.092884,1.165128,-1.184992,0.696654,0.384204,1.740736,...,-0.469724,3.915780,-0.335309,-0.255377,-0.495745,-0.533521,-0.237630,-0.530514,1.088445,-0.690165


In [51]:
import pickle
pickle.dump(y_prediction, open('pickel1', 'wb'))