In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
data = pd.read_csv("/content/HR_comma_sep.csv")
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
data= pd.get_dummies(data)

In [4]:
x = data.drop('left',axis=1)

In [5]:
y =  data['left']

In [6]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.30,random_state=42,stratify = y)

In [7]:
LR_Model = Pipeline([('scalar',StandardScaler()),
                     ('pca',PCA(n_components=10)),
                     ('model',LogisticRegression(random_state=2))])


In [8]:
DT_Model = Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=10)),
                     ('model',DecisionTreeClassifier(random_state=2))])

In [9]:
RFC = Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=10)),
                     ('model',RandomForestClassifier(random_state=2))])

In [10]:
pipeline = [LR_Model,DT_Model,RFC]

In [11]:
Best_accuracy = 0.0
Best_classifier = 0
Best_pipeline = "" 

In [12]:
pip_dic = {0:'LogisticRegression',1:'DecisionTreeClassifier',2:'RandomForestClassifier'}

In [13]:
for pipe in pipeline:
  pipe.fit(x_train,y_train)

In [14]:
for i,model in enumerate(pipeline):
  print("{}Test Accuracy : {}".format(pip_dic[i],model.score(x_train,y_train)))

LogisticRegressionTest Accuracy : 0.7789313267930279
DecisionTreeClassifierTest Accuracy : 1.0
RandomForestClassifierTest Accuracy : 1.0


In [15]:
for i,model in enumerate(pipeline):
  if model.score(x_test,y_test)>Best_accuracy:
    Best_accuracy=model.score(x_test,y_test)
    Best_pipeline = model
    Best_classifier = i
print('Best Classifier with best accuracy :{}'.format(pip_dic[Best_classifier]))

Best Classifier with best accuracy :RandomForestClassifier


In [16]:
pipe_new = Pipeline([('classifier',RandomForestClassifier())])

In [19]:
Get_param = [{'classifier':[LogisticRegression()],
              'classifier__penalty':['L1,L2','elasticnet', 'none'],
              'classifier__C':np.logspace(0,4,10)},
             
            {'classifier':[LogisticRegression()],
              'classifier__penalty':['L2'],
              'classifier__C':np.logspace(0,4,10),
               'classifier__solver':['newton-cg','saga','sag','liblinear']}]

              # {'classifier':[RandomForestClassifier()],
              # 'classifier_n_estimators' : [24,50,75,100],
              # 'classifier_criterion' :['gini','entropy'],
              # 'classifier_max_depth' :[3,5,10]}]

             
            #  {'classifier':[RandomForestClassifier()],
            #  'classifier__estimators': [4, 6, 9], 
            #   'classifier__criterion': ['entropy', 'gini'], 'classifier__max_depth': [2, 3, 5, 10], 
            #   'classifier__min_samples_split': [2, 3, 5],'classifier__min_samples_leaf': [1,5,8]}]

             
            #  {'classifier':[RandomForestClassifier()],
            #   'classifier_n_estimator':[10,100,200],
            #   'classifier_max_depth':[5,8,10,15,25,30,50,100,200,210,None],
            #   'classifier_minimum_sample_leafs':[1,2,5,10,15,50,100,200,210,230],
            #   'classifier_maximum_leafs_node':[2,5,10]}]

In [20]:
grid = GridSearchCV(pipe_new,Get_param,cv=5,verbose=0,n_jobs=-1)
best_model =grid.fit(x_train,y_train)

In [21]:
best_model.best_params_

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='none',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 1.0,
 'classifier__penalty': 'none'}

In [27]:
LR_model = LogisticRegression(solver='saga',C=1.0)

In [28]:
LR_model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
y_pred = LR_Model.predict(x_test)

In [30]:
accuracy_score(y_test,y_pred)

0.7826666666666666