## Data Retrieval

In [30]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [31]:
df=pd.read_csv('student_records.csv')

In [32]:
pd.options.mode.chained_assignment=None #default warn

In [33]:
df.shape

(8, 6)

## Data preparation

In [34]:
# get features and corresponding outcomes
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [35]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [36]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [37]:
#list down features with there characterstics onbase type of data
numeric_feature_names=['ResearchScore','ProjectScore']
categorical_feature_names=['OverallGrade','Obedient']

## Feature Scaling for Numeric Data

In [38]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()

#training scaler on num feature
ss.fit(training_features[numeric_feature_names] )

#Scale numeric Features after this,
training_features[numeric_feature_names]=ss.transform(training_features[numeric_feature_names])

#lets view the data
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


## Feature Engineering for Categorical Data

In [39]:
training_features=pd.get_dummies(training_features,columns=categorical_feature_names)
#lets see the data
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [40]:
#get dummies have made rows in coulmns and if person is persent it shows 1 else shows 0

## Modeling Phase

In [41]:
from sklearn.linear_model import LogisticRegression
import numpy as np

### Training the model

In [42]:
lr=LogisticRegression()
model=lr.fit(training_features,np.array(outcome_labels['Recommend']))
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Evaluating the Newly Training Model

In [46]:
pred_labels=model.predict(training_features)
actual_labels=np.array(outcome_labels['Recommend'])

In [47]:
#evaluate model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print('Accuracy',float(accuracy_score(actual_labels,pred_labels)))

Accuracy 1.0


In [48]:
print('classification stats  for it')
print(classification_report(actual_labels,pred_labels))

classification stats  for it
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

   micro avg       1.00      1.00      1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



## Doing it right with Train,test and split

Lets join the dataset for making a  single df for the same

In [49]:
dfnew=pd.concat([training_features,outcome_labels],join_axes=[df.index],axis=1)
dfnew.head()

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y,Recommend
0,0.899583,1.37665,1,0,0,0,0,0,1,Yes
1,0.730648,-0.091777,0,0,1,0,0,1,0,Yes
2,-1.80339,-1.560203,0,0,0,0,1,1,0,No
3,0.392776,0.772004,0,1,0,0,0,0,1,No
4,-1.465519,-0.998746,0,0,0,1,0,1,0,No


In [50]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(dfnew.drop('Recommend',axis=1),dfnew['Recommend'],test_size=0.20,random_state=101)

In [51]:
y_train.head()

5    Yes
4     No
6     No
1    Yes
3     No
Name: Recommend, dtype: object

## Modeling Phase with new DataFrame

In [52]:
from sklearn.linear_model import LogisticRegression
import numpy as np

training the model

In [53]:
lr=LogisticRegression()
model2=lr.fit(X_train,np.array(y_train))
model2

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Evaluating the newly trained model`

In [54]:
pred_labels=model2.predict(X_test)
actual_labels=np.array(y_test)

In [55]:
#evaluate model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print('Accuracy',float(accuracy_score(actual_labels,pred_labels)))

Accuracy 1.0


In [56]:
print('classification stats  for it')
print(classification_report(actual_labels,pred_labels))

classification stats  for it
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         1
         Yes       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



## Model Deployment

In [57]:
from sklearn.externals import joblib
import os
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')
joblib.dump(model2, r'Model/model.pickle')
joblib.dump(ss, r'Scaler/scaler.pickle')



['Scaler/scaler.pickle']

pickeling is knwn as object serialization

## Lets Setup the Deployment Enviormnent and Test It

In [59]:
#load the objects of the model back into ram
model=joblib.load(r'Model/model.pickle')
scaler=joblib.load(r'Scaler/scaler.pickle')

In [60]:
model,scaler

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False),
 StandardScaler(copy=True, with_mean=True, with_std=True))

In [63]:
## data retrieval
new_data = pd.DataFrame([{'Name': 'Praffulla',
                          'OverallGrade': 'A',
                          'Obedient': 'Y', 
                          'ResearchScore': 80, 
                          'ProjectScore': 100},
                  {'Name': 'Vishwesh', 
                   'OverallGrade': 'A', 
                   'Obedient': 'Y', 
                   'ResearchScore': 78, 
                   'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Praffulla,A,Y,80,100
1,Vishwesh,A,Y,78,80


In [64]:
#data prepraration
prediction_features=new_data[feature_names]
#scaling
prediction_features[numeric_feature_names]=scaler.transform(prediction_features[numeric_feature_names])
# engineering categorical variables
prediction_features = pd.get_dummies(prediction_features, columns=categorical_feature_names)

# view feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,Obedient_Y
0,0.561712,2.024485,1,1
1,0.494137,1.160705,1,1


In [65]:
# get list of new categorical features
categorical_engineered_features = list(set(feature_names) - set(numeric_feature_names))

In [66]:
# add missing categorical feature columns
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features
for feature in missing_features:
    # add zeros since feature is absent in these data samples
    prediction_features[feature] = [0] * len(prediction_features) 

# view final feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,Obedient_Y,Obedient,OverallGrade
0,0.561712,2.024485,1,1,0,0
1,0.494137,1.160705,1,1,0,0
