In [54]:
import numpy as np
import pandas as pd # type: ignore


In [55]:
df = pd.read_csv('Student-Performance-csv_T4qDx.csv')

In [56]:
df.dtypes

Hours Studied                         int64
Previous Scores                       int64
Extracurricular Activities           object
Sleep Hours                           int64
Sample Question Papers Practiced      int64
Performance Index                   float64
dtype: object

In [57]:
df.head() # as we can see there is a column whose type is not numeric we have to convert it into numeric datatype
# for conversion we are going to use label encoder from sklearn.(unique and fast)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [58]:
from sklearn.preprocessing import LabelEncoder,StandardScaler


In [59]:
df["Extracurricular Activities"] # convert it into 1,0
level_encoder = LabelEncoder()
newECA = level_encoder.fit_transform(df['Extracurricular Activities'])
df['Extracurricular Activities'] = newECA

In [60]:
df # now the data is ready to perform algorithm you have x1,x2,x3,x4,x4 and corresponding performance idex(y1)


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


In [61]:
x = df[["Hours Studied","Previous Scores","Extracurricular Activities",	"Sleep Hours","Sample Question Papers Practiced"]]

In [62]:
x

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5
...,...,...,...,...,...
9995,1,49,1,4,2
9996,7,64,1,8,5
9997,6,83,1,8,5
9998,9,97,1,7,0


In [63]:
y = df["Performance Index"]

In [64]:
# once you have x and y, split the data in training data and testing data
from sklearn.model_selection import train_test_split

In [65]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2) # test_size= 0.2 mean 20 percent is testData

In [66]:
x_train # for training the data it is good for transform it in small scale like Z-score
# we are going to use StandardScaler() class

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
1838,6,97,1,9,7
9365,8,94,0,5,9
147,1,62,0,4,4
4333,1,84,0,6,8
9114,7,74,0,9,1
...,...,...,...,...,...
2203,2,49,0,4,8
1104,8,59,0,9,2
5196,4,83,0,8,4
8992,3,98,1,4,3


In [67]:
scaler = StandardScaler()
x_trained_scaled = scaler.fit_transform(x_train) # z = x-u/std 
# 5 column each :Hours Studied	,Previous Scores,Extracurricular Activities, Sleep Hours, Sample Question Papers Practiced


In [68]:
x_test_scaled = scaler.fit_transform(x_test) # z = x-u/std 

In [69]:
# there is inbuilt library linear_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score 
#r2_score is used for measuring accuracy

In [70]:
# now we train in just one line
model = LinearRegression()
model.fit(x_trained_scaled,y_train)

In [71]:
y_pred = model.predict(x_test_scaled) # you can pass with scaling in the small scale
y_pred

array([41.10059624, 59.21376041, 68.67010169, ..., 28.99564831,
       51.79622473, 62.87561082], shape=(2000,))

In [72]:
x_test_scaled

array([[ 1.17329378, -1.42190683,  1.00904087,  1.45824285,  1.55355034],
       [-1.14852769,  0.66913263,  1.00904087,  0.27938848,  0.16303762],
       [-0.76155744,  1.07572363, -0.99104014,  1.45824285, -1.57510329],
       ...,
       [ 0.39935329, -1.59616012, -0.99104014, -0.89946588,  0.16303762],
       [ 0.01238305, -0.08596496, -0.99104014, -1.48889306, -0.87984692],
       [-1.14852769,  0.84338592,  1.00904087,  0.27938848,  1.20592216]],
      shape=(2000, 5))

In [73]:
model.predict([[ 1.13779098, -1.1782377 , -0.96946594,  1.44613782,  1.1685167 ]])

array([44.33377741])

In [74]:
mse = mean_squared_error(y_test,y_pred) 
mse # 4.422 mse that means low error
absError = mean_absolute_error(y_test,y_pred)
absError, mse 

(1.62359978804124, 4.164236167984236)

In [75]:
r2_score(y_test,y_pred) # your model is 98.79 percent accurate

0.9883189132605517

In [78]:
# now our model is stored in a variable , make a file of it if i close the notebook then everything gonna be flush
# pickle library is used for saving the model in .pkl
import pickle
with open("student_lr_final_model.pkl","wb") as file:
        pickle.dump((model,scaler,level_encoder),file) # if you have done some preprocessing , pass that preprocessing to it so that when the user send the data then it first preprocess then the preprocess data goes to our model and based on that it can predict.
