# Classifier Evaluation Lab

* Copy and paste your model from Homework 5
* Add grid search and train
* Compare performance
* Which one is better? Explain.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Pull the balanced employee-turnover dataset from the course's GitHub repository
# and preview the first rows to confirm that it loaded.
df = pd.read_csv(
    'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv'
)
df.head()

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3


In [2]:
# Columns treated as continuous / count-like quantities.
numerical_vars = [
    'age',
    'commuting_distance',
    'last_raise_pct',
    'total_years_working',
    'years_at_company',
    'years_in_current_job',
    'years_since_last_promotion',
    'years_with_current_supervisor',
]

# Columns treated as categories (including the ordinal-coded rating/level columns).
categorical_vars = [
    'frequency_of_travel',
    'department',
    'education',
    'satisfaction_with_environment',
    'gender',
    'seniority_level',
    'position',
    'satisfaction_with_job',
    'married_or_single',
    'last_performance_rating',
]

In [3]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the two tenure columns left out of this model.
X = df.drop(columns=['years_at_company', 'left_company', 'years_in_current_job'])
# Target: whether the employee left the company.
y = df['left_company']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=124
)

In [4]:
# Numerical features that remain after dropping 'years_at_company' and
# 'years_in_current_job' from the feature matrix.
new_numerical_vars = [
    'age',
    'commuting_distance',
    'last_raise_pct',
    'total_years_working',
    'years_since_last_promotion',
    'years_with_current_supervisor',
]

In [5]:
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Numeric preprocessing: fill missing values with the column median, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('impute_missing', SimpleImputer(strategy='median')),
    ('standardize_num', StandardScaler()),
])

# Render estimators as diagrams when they are the cell's output.
set_config(display='diagram')
numerical_pipeline

In [7]:
# Sanity check: compare the shape of the numeric training columns before and
# after they pass through the numeric pipeline.
numeric_train = X_train[new_numerical_vars]
actual_numerical_rows_columns = numeric_train.shape
transformed_numerical_rows_columns = numerical_pipeline.fit_transform(numeric_train).shape

print("Actual numerical rows and columns:", actual_numerical_rows_columns)
print("Transformed numerical rows and columns:", transformed_numerical_rows_columns)

Actual numerical rows and columns: (800, 6)
Transformed numerical rows and columns: (800, 6)


In [8]:
# Categorical preprocessing: fill gaps with the most frequent level, then one-hot encode,
# dropping the first level of each feature.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a level unseen
# during fit is presumably encoded as all zeros, i.e. the same as the dropped first
# level -- confirm against the installed scikit-learn version.
categorical_pipeline = Pipeline(steps=[
    ('impute_missing_cats', SimpleImputer(strategy='most_frequent')),
    ('create_dummies_cats', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

set_config(display='diagram')
categorical_pipeline

In [9]:
# Number of columns the fitted categorical pipeline actually produces.
transformed_categorical_columns = categorical_pipeline.fit_transform(X_train[categorical_vars]).shape[1]

# Expected number of dummy columns, computed from the training data itself rather
# than read back from the encoder (comparing the encoder with itself can never fail).
# Each feature contributes one column per distinct level, minus the one level that
# drop='first' removes. nunique() ignores missing values, which the imputer replaces
# with an already-existing level, so they do not add a level of their own.
actual_categorical_columns = int((X_train[categorical_vars].nunique() - 1).sum())

print("Actual categorical columns:",actual_categorical_columns)
print("Transformed categorical columns:",transformed_categorical_columns)

Actual categorical columns: 30
Transformed categorical columns: 30


In [10]:
# Displays True when the two column counts agree.
actual_categorical_columns == transformed_categorical_columns

True

In [11]:
from sklearn.compose import ColumnTransformer
# Route each column group through its own preprocessing pipeline.
# Columns not listed in either group are not passed to any of the pipelines.
processing_pipeline = ColumnTransformer(
    transformers=[
        ('proc_numeric', numerical_pipeline, new_numerical_vars),
        ('create_dummies', categorical_pipeline, categorical_vars),
    ]
)

processing_pipeline

In [12]:
from sklearn.linear_model import LogisticRegression

# Baseline model: the preprocessing step followed by a logistic regression
# left at its default settings, fitted on the training split.
modeling_pipeline = Pipeline(steps=[
    ('data_processing', processing_pipeline),
    ('lm', LogisticRegression()),
])
modeling_pipeline.fit(X_train, y_train)

In [13]:
from sklearn.metrics import classification_report
# Baseline model scored on the same rows it was trained on.
predictions_train = modeling_pipeline.predict(X_train)
training_score = classification_report(y_true=y_train, y_pred=predictions_train)

print('Scores on Training Data\n')
print(training_score)

Scores on Training Data

              precision    recall  f1-score   support

          No       0.77      0.75      0.76       407
         Yes       0.75      0.77      0.76       393

    accuracy                           0.76       800
   macro avg       0.76      0.76      0.76       800
weighted avg       0.76      0.76      0.76       800



In [14]:
# Baseline model scored on the held-out test rows.
predictions_test = modeling_pipeline.predict(X_test)
testing_score = classification_report(y_true=y_test, y_pred=predictions_test)

print('Scores on Testing Data\n')
print(testing_score)

Scores on Testing Data

              precision    recall  f1-score   support

          No       0.68      0.63      0.66        93
         Yes       0.70      0.74      0.72       107

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.69       200
weighted avg       0.69      0.69      0.69       200



# Applying GridSearchCV

In [15]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the logistic-regression step ('lm') of the pipeline.
# NOTE: to tune class weighting as well, the key would be 'lm__class_weight'
# (e.g. [None, 'balanced']); it is not part of this search.
param_grid = {
    'lm__penalty': ['l2'],
    'lm__C': [0.01, 0.1, 1],
    'lm__solver': ['newton-cg', 'lbfgs', 'liblinear'],
}

# 10-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(
    estimator=modeling_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
gcv = grid_search.fit(X_train, y_train)

In [17]:
# Predictions of the fitted grid search on the training rows.
y_pred_gcv_train = gcv.predict(X=X_train)

In [18]:
# Per-class precision/recall/F1 of the grid-searched model on the training rows.
training_score_gcv = classification_report(y_true=y_train, y_pred=y_pred_gcv_train)

print('Scores on Training Data with GCV\n')
print(training_score_gcv)

Scores on Training Data with GCV

              precision    recall  f1-score   support

          No       0.77      0.75      0.76       407
         Yes       0.75      0.77      0.76       393

    accuracy                           0.76       800
   macro avg       0.76      0.76      0.76       800
weighted avg       0.76      0.76      0.76       800



In [19]:
# Predictions of the fitted grid search on the held-out test rows.
y_pred_gcv_test = gcv.predict(X=X_test)

In [20]:
# Per-class precision/recall/F1 of the grid-searched model on the held-out test rows.
testing_score_gcv = classification_report(y_true=y_test, y_pred=y_pred_gcv_test)

print('Scores on Testing Data with GCV\n')
print(testing_score_gcv)

Scores on Testing Data with GCV

              precision    recall  f1-score   support

          No       0.67      0.63      0.65        93
         Yes       0.70      0.73      0.71       107

    accuracy                           0.69       200
   macro avg       0.68      0.68      0.68       200
weighted avg       0.68      0.69      0.68       200



In [16]:
# Report the winning hyperparameter combination and the score the search recorded for it.
best_params, best_score = gcv.best_params_, gcv.best_score_
print("Best parameters:", best_params)
print("Best score:", best_score)

Best parameters: {'lm__C': 1, 'lm__penalty': 'l2', 'lm__solver': 'liblinear'}
Best score: 0.7125000000000001


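- GridSearchCV has not brought any drastic, noticeable change: test accuracy is 0.69 for both models, and the best cross-validated accuracy (0.71) is only about 2 points above it
- The accuracy is almost the same
- Precision and recall are also almost the same
- After tuning the model, GridSearchCV selected the best parameters (C=1, penalty='l2', solver='liblinear'); these match the LogisticRegression defaults except for the solver, which is why the scores barely change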