## Retriving Data

In [85]:
import pandas as pd
pd.options.mode.chained_assignment = None #default = 'warn'
df = pd.read_csv('student_records.csv')

In [86]:
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend,Honest
0,Henry,A,y,90,85,Yes,y
1,Jones,C,n,85,51,Yes,n
2,David,F,y,10,17,No,y
3,Hari,E,y,72,80,Yes,y
4,Ram,D,n,20,66,No,n
5,Shyam,C,y,92,35,No,y
6,Prasanna,A,n,87,48,Yes,n
7,Madhav,B,n,50,92,No,n
8,Sita,C,y,65,83,No,y
9,Krishna,D,y,35,40,Yes,y


## Data Preparation
### Feature Extraction and Engineering

In [87]:
feature_names = ['OverallGrade','Honest', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]
training_features

Unnamed: 0,OverallGrade,Honest,ResearchScore,ProjectScore
0,A,y,90,85
1,C,n,85,51
2,F,y,10,17
3,E,y,72,80
4,D,n,20,66
5,C,y,92,35
6,A,n,87,48
7,B,n,50,92
8,C,y,65,83
9,D,y,35,40


In [88]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,Yes
4,No
5,No
6,Yes
7,No
8,No
9,Yes


In [89]:
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categorial_feature_names = ['OverallGrade', 'Honest']

In [90]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

ss.fit(training_features[numeric_feature_names])
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])
training_features

Unnamed: 0,OverallGrade,Honest,ResearchScore,ProjectScore
0,A,y,1.022437,1.058577
1,C,n,0.848553,-0.364017
2,F,y,-1.759705,-1.786611
3,E,y,0.396455,0.849372
4,D,n,-1.411937,0.263598
5,C,y,1.091991,-1.033473
6,A,n,0.918107,-0.48954
7,B,n,-0.368634,1.351464
8,C,y,0.153018,0.974895
9,D,y,-0.890285,-0.824268


In [91]:
training_features = pd.get_dummies(training_features, columns =categorial_feature_names)
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_D,OverallGrade_E,OverallGrade_F,Honest_n,Honest_y
0,1.022437,1.058577,1,0,0,0,0,0,0,1
1,0.848553,-0.364017,0,0,1,0,0,0,1,0
2,-1.759705,-1.786611,0,0,0,0,0,1,0,1
3,0.396455,0.849372,0,0,0,0,1,0,0,1
4,-1.411937,0.263598,0,0,0,1,0,0,1,0
5,1.091991,-1.033473,0,0,1,0,0,0,0,1
6,0.918107,-0.48954,1,0,0,0,0,0,1,0
7,-0.368634,1.351464,0,1,0,0,0,0,1,0
8,0.153018,0.974895,0,0,1,0,0,0,0,1
9,-0.890285,-0.824268,0,0,0,1,0,0,0,1


In [92]:
categorical_engineered_features = list(set(training_features.columns)-set(numeric_feature_names))
categorical_engineered_features

['OverallGrade_A',
 'OverallGrade_B',
 'Honest_n',
 'OverallGrade_F',
 'OverallGrade_C',
 'OverallGrade_D',
 'Honest_y',
 'OverallGrade_E']

## Modeling

In [93]:
from sklearn.linear_model import LogisticRegression
import numpy as np

lr = LogisticRegression()
model = lr.fit(training_features, np.array(outcome_labels['Recommend']))
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Model Evaluation

In [94]:
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

In [95]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print("Accuracy: ", float(accuracy_score(actual_labels, pred_labels))* 100, '%')
print("Classification Stats:")
print(classification_report(actual_labels, pred_labels))

Accuracy:  80.0 %
Classification Stats:
              precision    recall  f1-score   support

          No       0.80      0.80      0.80         5
         Yes       0.80      0.80      0.80         5

    accuracy                           0.80        10
   macro avg       0.80      0.80      0.80        10
weighted avg       0.80      0.80      0.80        10



## Model Deployment

In [96]:
import joblib
import os
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')

In [97]:
joblib.dump(model, r'Model/model.pickle')
joblib.dump(ss, r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

### Prediction in Action

In [98]:
model = joblib.load(r'Model/model.pickle')
scaler1 = joblib.load(r'Scaler/Scaler.pickle')

In [104]:
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Honest': 'n', 'ResearchScore': 30, 'ProjectScore': 20},
{'Name': 'Thomas', 'OverallGrade': 'A', 'Honest': 'y', 'ResearchScore': 78, 'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Honest', 'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Honest,ResearchScore,ProjectScore
0,Nathan,F,n,30,20
1,Thomas,A,y,78,80


In [105]:
prediction_features = new_data[feature_names]
prediction_features[numeric_feature_names] = ss.transform(prediction_features[numeric_feature_names])
prediction_features = pd.get_dummies(prediction_features, columns = categorial_feature_names)
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Honest_n,Honest_y
0,-1.064169,-1.661088,0,1,1,0
1,0.605116,0.849372,1,0,0,1


In [106]:
current_categorial_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features)- current_categorial_engineered_features
for feature in missing_features:
    prediction_features[feature] = [0] * len(prediction_features)
    
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Honest_n,Honest_y,OverallGrade_D,OverallGrade_E,OverallGrade_C,OverallGrade_B
0,-1.064169,-1.661088,0,1,1,0,0,0,0,0
1,0.605116,0.849372,1,0,0,1,0,0,0,0


In [107]:
predictions = model.predict(prediction_features)
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Honest,ResearchScore,ProjectScore,Recommend
0,Nathan,F,n,30,20,No
1,Thomas,A,y,78,80,Yes
