#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [40]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

import warnings

#### Import the CSV Data as Pandas DataFrame

In [41]:
df = pd.read_csv('data/outlier_removed.csv')

#### Show Top 5 Records

In [42]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.67
1,female,group C,some college,standard,completed,69,90,88,247,82.33
2,female,group B,master's degree,standard,none,90,95,93,278,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.33
4,male,group C,some college,standard,none,76,78,75,229,76.33


#### droping total score columns, As was showing multicollineariy with average

In [43]:
df.drop(columns=['total score'], inplace=True)

#### Preparing X and Y variables

In [44]:
X = df.drop(columns=['average'],axis=1)

In [45]:
y = df['average']

In [46]:
num_features = [feature for feature in X.columns if X[feature].dtype != 'O']
num_features

['math score', 'reading score', 'writing score']

In [47]:
ohe_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course']
ohe_columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course']

In [48]:
scale = ColumnTransformer(transformers=[
    ('scale', StandardScaler(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop= 'first'),[0,1,2,3,4])
],remainder='passthrough')

#### Using Linear Regression

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 1.0
MAE 9.452756038237047e-15


#### Using Ridge Regression

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Ridge(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0001
 
Model performance for Test set
R2: 1.0000
MAE : 0.0001


#### Using Lasso Regression

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=728)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Lasso(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)

# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0600
 
Model performance for Test set
R2: 1.0000
MAE : 0.0601


#### Using Support Vecor Regression

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=480)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',SVR(C=1,kernel='linear',gamma='auto',max_iter=54))
    ])
pipe.fit(X_train, y_train)
# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0181
 
Model performance for Test set
R2: 1.0000
MAE : 0.0177


#### Using KNeighborsRegressor

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 0.9972
MAE: 0.5027
 
Model performance for Test set
R2: 0.9861
MAE : 0.7432


#### Using DecisionTreeRegressor

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0000
 
Model performance for Test set
R2: 0.9764
MAE : 1.5943


#### Using RandomForestRegressor

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',RandomForestRegressor())
    ])
pipe.fit(X_train, y_train)
# Make predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred) # Calculate r2score
model_train_mae = mean_absolute_error(y_train, y_train_pred) # Calculate MAE

# Test set performance
model_test_r2score = r2_score(y_test, y_test_pred) # Calculate r2score
model_test_mae = mean_absolute_error(y_test, y_test_pred) # Calculate MAE

  
print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))
print(" ")

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 0.9983
MAE: 0.3233
 
Model performance for Test set
R2: 0.9940
MAE : 0.7400


####  Save the trained model as a pickle file.

In [58]:
import pickle

pickle.dump(pipe, open('models/model.pkl', 'wb'))

#### Preparing data to create batch prediction

In [59]:
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)

#### Initiating databse connection

In [74]:
import pymongo
client = pymongo.MongoClient("mongodb+srv://sbadgujar1019:<password>0@cluster0.nkgkcn8.mongodb.net/") #create your own and change password

#### Checking database name

In [75]:
db = client.batch_data
print(db)

Database(MongoClient(host=['ac-r0u5u1n-shard-00-01.nkgkcn8.mongodb.net:27017', 'ac-r0u5u1n-shard-00-02.nkgkcn8.mongodb.net:27017', 'ac-r0u5u1n-shard-00-00.nkgkcn8.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-848xj7-shard-0', tls=True), 'batch_data')


#### Creating student batch data collection

In [76]:
coll = db['student performance']

In [77]:
db.list_collection_names()

[]

#### Inserting all records in database

In [78]:
coll.insert_many(parsed)

InsertManyResult([ObjectId('65ec3171c7508c3663498cc9'), ObjectId('65ec3171c7508c3663498cca'), ObjectId('65ec3171c7508c3663498ccb'), ObjectId('65ec3171c7508c3663498ccc'), ObjectId('65ec3171c7508c3663498ccd'), ObjectId('65ec3171c7508c3663498cce'), ObjectId('65ec3171c7508c3663498ccf'), ObjectId('65ec3171c7508c3663498cd0'), ObjectId('65ec3171c7508c3663498cd1'), ObjectId('65ec3171c7508c3663498cd2'), ObjectId('65ec3171c7508c3663498cd3'), ObjectId('65ec3171c7508c3663498cd4'), ObjectId('65ec3171c7508c3663498cd5'), ObjectId('65ec3171c7508c3663498cd6'), ObjectId('65ec3171c7508c3663498cd7'), ObjectId('65ec3171c7508c3663498cd8'), ObjectId('65ec3171c7508c3663498cd9'), ObjectId('65ec3171c7508c3663498cda'), ObjectId('65ec3171c7508c3663498cdb'), ObjectId('65ec3171c7508c3663498cdc'), ObjectId('65ec3171c7508c3663498cdd'), ObjectId('65ec3171c7508c3663498cde'), ObjectId('65ec3171c7508c3663498cdf'), ObjectId('65ec3171c7508c3663498ce0'), ObjectId('65ec3171c7508c3663498ce1'), ObjectId('65ec3171c7508c3663498c