![image info](https://ineuron.ai/images/ineuron-logo.png)

#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = 'data/cleaned.csv'
df = pd.read_csv(DATA_PATH)

#### Show Top 5 Records

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


#### Dropping the total score column, as it showed multicollinearity with average

In [4]:
# Remove 'total score' from the frame used by the rest of the notebook.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time (when the column is
# already gone) no longer raises KeyError.
df = df.drop(columns=['total score'], errors='ignore')

#### Preparing X and Y variables

In [5]:
# Feature matrix: every column of df except the target 'average'.
X = df.drop(columns=['average'])

In [6]:
# Regression target: the 'average' column as a Series.
y = df.loc[:, 'average']

In [7]:
# Names of the columns of X whose dtype is not object ('O'), in column order.
num_features = [name for name, dtype in X.dtypes.items() if dtype != 'O']
num_features

['math score', 'reading score', 'writing score']

In [8]:
# Categorical columns intended for one-hot encoding.
ohe_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
ohe_columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course']

In [51]:
# Step 1 - one-hot encode the columns at positions 0-4 of X and pass the rest
# through unchanged. drop='first' removes one dummy column per encoded feature.
# The encoder keeps its default (sparse) output and the ColumnTransformer is
# asked to densify the stacked result via sparse_threshold=0. This produces
# the same dense array as the former OneHotEncoder(sparse=False) while also
# running on scikit-learn >= 1.4, where the `sparse` keyword was removed.
# Positional (integer) column selectors are kept so the fitted pipeline
# accepts plain arrays as well as DataFrames.
tnf = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first'), [0, 1, 2, 3, 4]),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Step 2 - standardise columns 0-10 of the array produced by `tnf`; any other
# columns are passed through unchanged.
# NOTE(review): ColumnTransformer places transformed columns first and
# passthrough columns last, so this slice appears to cover one-hot dummy
# columns rather than the numeric score columns - confirm that is intended.
scale = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), slice(0, 11)),
    ],
    remainder='passthrough',
)

#### Using Linear Regression

In [10]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=198
)

# Encode -> scale -> ordinary least squares, fitted on the training rows only.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', LinearRegression(fit_intercept=True)),
])
pipe.fit(X_train, y_train)

# Score the held-out rows.
# NOTE(review): 'average' looks like the mean of the three score columns that
# remain in X, which would explain a perfect fit - confirm this is intended.
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_true=y_test, y_pred=y_pred))
print('MAE', mean_absolute_error(y_true=y_test, y_pred=y_pred))

R2 SCORE 1.0
MAE 7.917476198469688e-15


#### Using Ridge Regression

In [11]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=198
)

# Encode -> scale -> Ridge, fitted on the training rows only.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', Ridge(fit_intercept=True)),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test)

# R2 and mean absolute error on the training rows ...
model_train_r2score, model_train_mae = (
    r2_score(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
)
# ... and on the held-out rows.
model_test_r2score, model_test_mae = (
    r2_score(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
)

print(
    'Model performance for Training set',
    "R2: {:.4f}".format(model_train_r2score),
    "MAE: {:.4f}".format(model_train_mae),
    sep='\n',
)
print(
    'Model performance for Test set',
    "R2: {:.4f}".format(model_test_r2score),
    "MAE : {:.4f}".format(model_test_mae),
    sep='\n',
)

Model performance for Training set
R2: 1.0000
MAE: 0.0001
Model performance for Test set
R2: 1.0000
MAE : 0.0001


#### Using Lasso Regression

In [12]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=728
)

# Encode -> scale -> Lasso, fitted on the training rows only.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', Lasso(fit_intercept=True)),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test)

# R2 and mean absolute error on the training rows ...
model_train_r2score, model_train_mae = (
    r2_score(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
)
# ... and on the held-out rows.
model_test_r2score, model_test_mae = (
    r2_score(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
)

print(
    'Model performance for Training set',
    "R2: {:.4f}".format(model_train_r2score),
    "MAE: {:.4f}".format(model_train_mae),
    sep='\n',
)
print(
    'Model performance for Test set',
    "R2: {:.4f}".format(model_test_r2score),
    "MAE : {:.4f}".format(model_test_mae),
    sep='\n',
)

Model performance for Training set
R2: 1.0000
MAE: 0.0600
Model performance for Test set
R2: 1.0000
MAE : 0.0601


#### Using Support Vector Regression

In [13]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=480
)

# Encode -> scale -> linear-kernel SVR, fitted on the training rows only.
# max_iter is written as the integer 50000 instead of the float 5e4: recent
# scikit-learn releases validate estimator parameters and reject a float here,
# while the iteration cap itself is unchanged.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', SVR(C=1, kernel='linear', gamma='auto', max_iter=50000)),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# R2 and mean absolute error on the training rows.
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# R2 and mean absolute error on the held-out rows.
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0363
Model performance for Test set
R2: 1.0000
MAE : 0.0345


#### Using KNeighborsRegressor

In [14]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=458
)

# Encode -> scale -> k-nearest-neighbours regressor (default settings),
# fitted on the training rows only.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', KNeighborsRegressor()),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test)

# R2 and mean absolute error on the training rows ...
model_train_r2score, model_train_mae = (
    r2_score(y_train, y_train_pred),
    mean_absolute_error(y_train, y_train_pred),
)
# ... and on the held-out rows.
model_test_r2score, model_test_mae = (
    r2_score(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
)

print(
    'Model performance for Training set',
    "R2: {:.4f}".format(model_train_r2score),
    "MAE: {:.4f}".format(model_train_mae),
    sep='\n',
)
print(
    'Model performance for Test set',
    "R2: {:.4f}".format(model_test_r2score),
    "MAE : {:.4f}".format(model_test_mae),
    sep='\n',
)

Model performance for Training set
R2: 0.9972
MAE: 0.5027
Model performance for Test set
R2: 0.9861
MAE : 0.7432


#### Using DecisionTreeRegressor

In [15]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=657
)

# Encode -> scale -> decision tree, fitted on the training rows only.
# The tree is seeded (same seed as the split) so that a Restart & Run All
# reproduces the reported metrics; an unseeded tree may break ties between
# equally good splits differently from run to run.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', DecisionTreeRegressor(random_state=657)),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# R2 and mean absolute error on the training rows.
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# R2 and mean absolute error on the held-out rows.
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 1.0000
MAE: 0.0000
Model performance for Test set
R2: 0.9756
MAE : 1.6524


#### Using RandomForestRegressor

In [52]:
# Hold-out split. NOTE(review): train_size=0.3 fits on 30% of the rows and
# scores on the remaining 70% - confirm test_size=0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=256
)

# Encode -> scale -> random forest, fitted on the training rows only.
# The forest is seeded (same seed as the split): bootstrap sampling is random,
# so without a seed every run trains a different model and reports different
# metrics. `pipe` from this cell is what gets pickled further down, so the
# saved artifact should be reproducible.
pipe = Pipeline(steps=[
    ('tnf', tnf),
    ('scale', scale),
    ('model', RandomForestRegressor(random_state=256)),
])
pipe.fit(X_train, y_train)

# Predictions for both partitions.
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# R2 and mean absolute error on the training rows.
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# R2 and mean absolute error on the held-out rows.
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print('Model performance for Training set')
print("R2: {:.4f}".format(model_train_r2score))
print("MAE: {:.4f}".format(model_train_mae))

print('Model performance for Test set')
print("R2: {:.4f}".format(model_test_r2score))
print("MAE : {:.4f}".format(model_test_mae))

Model performance for Training set
R2: 0.9985
MAE: 0.3127
Model performance for Test set
R2: 0.9940
MAE : 0.7531


####  Save the trained model as a pickle file.

In [22]:
# Serialise the most recently fitted pipeline (`pipe`) to disk.
# `pickle` is already imported in the notebook's import cell. The `with`
# block guarantees the file handle is flushed and closed even if pickling
# raises, instead of leaking the handle returned by a bare open().
with open('models/model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

#### Preparing data to create batch prediction

In [32]:
import json

# Serialise the held-out feature rows to a JSON string with one object per
# row, then parse it back into a list of plain dicts.
result = X_test.to_json(orient='records')
parsed = json.loads(s=result)

#### Initiating database connection

In [24]:
import os

import pymongo

# The connection string carries a username and password, so it must not be
# stored in the notebook. Supply it through the environment instead, e.g.
#   MONGODB_URI="mongodb+srv://<user>:<password>@<cluster-host>/<database>?retryWrites=true&w=majority"
# NOTE(review): the credentials that were previously hard-coded here should be
# rotated, since they have been committed with the notebook.
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )
client = pymongo.MongoClient(MONGODB_URI)

#### Checking database name

In [25]:
# Handle to the 'batch_data' database on the connected client; printing it
# shows the client and database it is bound to.
db = client['batch_data']
print(db)

Database(MongoClient(host=['cluster0-shard-00-01.oxgpt.mongodb.net:27017', 'cluster0-shard-00-00.oxgpt.mongodb.net:27017', 'cluster0-shard-00-02.oxgpt.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-th8ou3-shard-0', tls=True), 'batch_data')


#### Creating student batch data collection

In [28]:
# Handle to the 'student_batch_data' collection inside `db`.
coll = db.student_batch_data

In [30]:
# Show the names of the collections that currently exist in `db`.
db.list_collection_names()

['hear_batch_data',
 'classification_batch',
 'regression_batch_data',
 'regression_batch',
 'student_batch_data']

#### Inserting all records in database

In [33]:
# Bulk-insert the records parsed from X_test into the collection.
# NOTE(review): re-running this cell presumably inserts the same documents
# again - confirm duplicates are acceptable or clear the collection first.
coll.insert_many(parsed)

<pymongo.results.InsertManyResult at 0x1bfa3b23fa0>