In [39]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Fetch the campus-placement CSV into `placement`, then work on a copy so the
# frame as loaded stays available unchanged.
placement = pd.read_csv(
    'https://raw.githubusercontent.com/FTDS-learning-materials/phase-1/master/w2/P1W2D4PM%20-%20Model%20Deployment%20-%20Campus.csv'
)
placement_copy = placement.copy()
placement_copy.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [3]:
# Replace missing salaries (e.g. on 'Not Placed' rows) with 0.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the `placement_copy['salary']` selection: that chained
# form emits a FutureWarning and will no longer update the frame in pandas 3.0.
placement_copy['salary'] = placement_copy['salary'].fillna(value=0)
print('Salary column with null values:', placement_copy['salary'].isnull().sum(), sep = '\n')

Salary column with null values:
0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  placement_copy['salary'].fillna(value=0, inplace=True)


In [4]:
# Remove the `sl_no`, `ssc_b` and `hsc_b` columns from `placement_copy` in place.
# Note: running this cell a second time raises KeyError because the columns are already gone.
placement_copy.drop(columns=['sl_no', 'ssc_b', 'hsc_b'], inplace=True)
placement_copy.head()

Unnamed: 0,gender,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,91.0,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,78.33,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,68.0,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,56.0,52.0,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,M,85.8,73.6,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [6]:
# Features: every remaining column except `status` and `salary`.
# Target: the `status` column.
X = placement_copy.drop(columns=['status', 'salary'])
y = placement_copy['status']

In [7]:
# Keep 80% of the rows for training; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# Report the shape of each of the four splits.
for caption, split in (
    ("Input Training:", X_train),
    ("Input Test:", X_test),
    ("Output Training:", y_train),
    ("Output Test:", y_test),
):
    print(caption, split.shape)

Input Training: (172, 10)
Input Test: (43, 10)
Output Training: (172,)
Output Test: (43,)


In [8]:
# Encode the target labels as integers.
# The encoder is fitted on the training labels only and then reused, unchanged,
# for the test labels, so both splits share a single label -> integer mapping.
# (Re-fitting on the test split could produce a different mapping whenever its
# set of labels differs from the training split.)
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [16]:
# Columns routed through the categorical preprocessing branch.
cat_col = [
    'gender',
    'workex',
    'specialisation',
    'hsc_s',
    'degree_t',
]

# Columns routed through the numeric preprocessing branch.
num_col = [
    'ssc_p',
    'hsc_p',
    'degree_p',
    'etest_p',
    'mba_p',
]

In [41]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' encodes a category that was never seen
# during fit as all zeros instead of raising, so the saved preprocessor keeps
# working on new data that contains an unexpected category.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own branch and concatenate the results.
pipeline_preproc = ColumnTransformer([
    ("num", num_pipeline, num_col),
    ("cat", cat_pipeline, cat_col),
])

# Fit on the training split only; the test split is transformed with what was
# learned from the training split.
X_train_preproc = pipeline_preproc.fit_transform(X_train)
X_test_preproc = pipeline_preproc.transform(X_test)

In [18]:
# Show the preprocessed training matrix as this cell's output.
X_train_preproc

array([[-0.37802451, -0.57006221, -1.30846321, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.2509715 , -0.29435751, -0.05053648, ...,  0.        ,
         0.        ,  1.        ],
       [-0.37802451,  0.07324875, -0.05053648, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.58624017,  0.07324875,  1.62669918, ...,  1.        ,
         0.        ,  0.        ],
       [-0.19831137, -0.12893469,  0.43865726, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.05968066,  0.90036285, -0.10644433, ...,  0.        ,
         0.        ,  1.        ]])

In [42]:
# Persist the fitted preprocessing transformer to disk.
joblib.dump(value=pipeline_preproc, filename="pipeline.pkl")

['pipeline.pkl']

In [20]:
# Logistic regression with default settings, trained on the preprocessed training data.
logreg_model = LogisticRegression()
logreg_model.fit(X=X_train_preproc, y=y_train)

In [25]:
# Predict encoded labels for the preprocessed test features and display them.
y_test_pred = logreg_model.predict(X=X_test_preproc)
y_test_pred

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1])

In [26]:
# Per-class precision / recall / F1 for the current test predictions.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90        15
           1       0.93      0.96      0.95        28

    accuracy                           0.93        43
   macro avg       0.93      0.92      0.92        43
weighted avg       0.93      0.93      0.93        43



In [27]:
# Tune the logistic-regression solver and regularisation parameter C with
# 5-fold cross-validation on the preprocessed training data.
# (GridSearchCV is imported in the notebook's top import cell.)
param_grid = [
    {
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'C': [0.1, 1.0, 10.0],
    }
]

logreg = LogisticRegression()
grid_search = GridSearchCV(
    logreg,
    param_grid,
    cv=5,
    scoring='precision',      # candidates are ranked by precision
    return_train_score=True,  # also record training-fold scores in cv_results_
)

grid_search.fit(X_train_preproc, y_train)

In [31]:
# Show the winning estimator and its hyper-parameters, one per line.
print(grid_search.best_estimator_, grid_search.best_params_, sep='\n')

LogisticRegression(solver='liblinear')
{'C': 1.0, 'solver': 'liblinear'}


In [43]:
# Take the best estimator from the grid search and save it on its own.
final_model = grid_search.best_estimator_

joblib.dump(value=final_model, filename="model_aja.pkl")

['model_aja.pkl']

In [35]:
# Best estimator found by the grid search.
final_model = grid_search.best_estimator_

# Bundle preprocessing and classifier into one estimator that accepts the raw
# feature frame. Pipeline keeps references to the step objects it is given, so
# unfitted clones (same hyper-parameters) are used here: fitting the pipeline
# then leaves `pipeline_preproc` and `grid_search.best_estimator_` untouched
# instead of silently re-fitting those shared objects.
final_pipeline = Pipeline([
    ("preprocessor", clone(pipeline_preproc)),
    ("model", clone(final_model)),
])

final_pipeline.fit(X_train, y_train)

In [37]:
# Evaluate the end-to-end pipeline directly on the raw test features.
y_test_pred = final_pipeline.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90        15
           1       0.93      0.96      0.95        28

    accuracy                           0.93        43
   macro avg       0.93      0.92      0.92        43
weighted avg       0.93      0.93      0.93        43



In [38]:

# Save the complete fitted pipeline (preprocessing + model) to disk.
joblib.dump(value=final_pipeline, filename="my_model.pkl")

['my_model.pkl']