In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os #importing the necessary libraries

In [None]:
# Fetch the dataset through kagglehub, then show where it landed and which files it holds.
dataset_handle = "nikhil7280/student-performance-multiple-linear-regression"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)
downloaded_files = os.listdir(path)
print(downloaded_files)

Path to dataset files: /kaggle/input/student-performance-multiple-linear-regression
['Student_Performance.csv']


In [None]:
# Build the CSV location from the directory returned by kagglehub.dataset_download
# instead of hard-coding the Kaggle-only absolute path, so the notebook also runs
# wherever the dataset happens to be downloaded.
df = pd.read_csv(os.path.join(path, 'Student_Performance.csv'))

In [None]:
# preview the first rows to check that the file loaded as expected
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [None]:
df.isnull().sum() #count missing values per column: there are none

Unnamed: 0,0
Hours Studied,0
Previous Scores,0
Extracurricular Activities,0
Sleep Hours,0
Sample Question Papers Practiced,0
Performance Index,0


In [None]:
df.duplicated().sum() #127 fully duplicated rows

np.int64(127)

In [None]:
df.shape #10,000 rows in total, so dropping the 127 duplicates has little impact

(10000, 6)

In [None]:
df.drop_duplicates(inplace=True) #dropping the duplicated rows

In [None]:
# confirm the row count after removing the duplicates
df.shape

(9873, 6)

In [None]:
df.dtypes #check the dtypes: every column must be int or float before fitting

Unnamed: 0,0
Hours Studied,int64
Previous Scores,int64
Extracurricular Activities,object
Sleep Hours,int64
Sample Question Papers Practiced,int64
Performance Index,float64


In [None]:
df['Extracurricular Activities'].unique() #this column is categorical (Yes/No), so it has to be mapped to 1/0

array(['Yes', 'No'], dtype=object)

In [None]:
# Encode the Yes/No answers as 1/0 so the column can be used as a numeric feature.
activity_codes = {'Yes': 1, 'No': 0}
df['Extracurricular Activities'] = df['Extracurricular Activities'].map(activity_codes)

In [None]:
df['Extracurricular Activities'] = df['Extracurricular Activities'].astype(int) #now converting to int

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score #sklearn libraries for train test split , lr, and r2 score

In [None]:
# Separate the target column from the feature matrix.
target_column = 'Performance Index'
y = df[target_column]
x = df.drop(columns=[target_column])

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2) #hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible

In [None]:
# create the scikit-learn linear regression model
LR = LinearRegression()

In [None]:
# fit the model on the training split
LR.fit(x_train,y_train)

In [None]:
# predict the target for the held-out test rows
predictions = LR.predict(x_test)

In [None]:
# R2 of the scikit-learn model on the test split
r2 = r2_score(y_test,predictions)
print("R2 score is ", r2) #excellent score; unfortunately we cannot visualize the fit

R2 score is  0.9894961318217909


In [None]:
# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n test rows and p predictors.
# p is read from x_test instead of being hard-coded as 1, which understated the penalty
# for a model that uses several features.
n_samples, n_features = x_test.shape
adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
print("Adjusted R2 score", adjusted_r2)

Adjacent R2 Error 0.989490808016328


In [None]:
LR.coef_ #fitted coefficients (the beta values), one per feature

array([2.84941335, 1.0180357 , 0.60411544, 0.48164095, 0.19413131])

In [None]:
LR.intercept_ #fitted intercept (beta 0)

np.float64(-34.04218455349236)

<h1> Using a custom-made multiple linear regression (MLR) </h1>

In [None]:
class mylr:
  """Multiple linear regression fitted with the ordinary least squares normal equation.

  Attributes:
    coef: array of feature coefficients (beta 1 onwards); None until fit() has run.
    intercept: intercept term (beta 0); None until fit() has run.
  """

  def __init__(self):
    # This was spelled `_init_`, which Python never calls as a constructor, so the
    # attributes below did not exist on a freshly created object.
    self.coef = None
    self.intercept = None

  def fit(self, x_train, y_train):
    """Estimate the intercept and coefficients from the training data.

    Args:
      x_train: 2-D array-like of numeric features, one row per sample.
      y_train: array-like of targets, one entry per row of x_train.

    Raises:
      numpy.linalg.LinAlgError: if X^T X is singular (for example with perfectly
        collinear features), so the normal equation has no unique solution.
    """
    # Prepend a column of ones so that the first beta acts as the intercept.
    design = np.insert(np.asarray(x_train, dtype=float), 0, 1, axis=1)
    targets = np.asarray(y_train, dtype=float)

    # Normal equation: (X^T X) beta = X^T y. Solving the linear system gives the same
    # betas as inv(X^T X) . X^T . y without forming the explicit inverse.
    betas = np.linalg.solve(design.T.dot(design), design.T.dot(targets))

    self.intercept = betas[0]  # beta 0, added to every prediction
    self.coef = betas[1:]      # remaining betas, one per feature column

  def predict(self, x_test):
    """Return predictions for x_test as dot(x_test, coef) + intercept.

    Args:
      x_test: 2-D array-like with the same feature columns, in the same order, as
        the data passed to fit().

    Raises:
      AttributeError: if called before fit().
    """
    if self.coef is None or self.intercept is None:
      # Same exception type the unfitted object produced before, with a clear message.
      raise AttributeError("mylr is not fitted yet; call fit() before predict()")
    return np.dot(x_test, self.coef) + self.intercept


In [None]:
# instantiate the hand-written regression class
lrs = mylr()

In [None]:
# fit the hand-written model on the same training split
lrs.fit(x_train,y_train)

In [None]:
# predict the test rows with the hand-written model
predictionsm = lrs.predict(x_test)

In [None]:
r22 = r2_score(y_test,predictionsm)
print("R2 score from Coded model ", r22) #matches the sklearn score up to floating-point rounding, which is consistent with sklearn also fitting by ordinary least squares
print("R2 score from Sklearn model ", r2)
#both turn out to be the same

R2 score from Coded model  0.9894961318217907
R2 score from Sklearn model  0.9894961318217909


In [None]:
def _adjusted_r2(r2_value, n_samples, n_features):
  """Return adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)."""
  return 1 - (1 - r2_value) * (n_samples - 1) / (n_samples - n_features - 1)

# n and p come from x_test; the predictor count was previously hard-coded as 1,
# which understated the penalty for a model that uses several features.
n_test_rows, n_test_features = x_test.shape
print("Adjusted R2 from SKLearn Model", _adjusted_r2(r2, n_test_rows, n_test_features))
print("Adjusted R2 from Coded Model Object", _adjusted_r2(r22, n_test_rows, n_test_features))

Adjacent R2 Error from SKLearn Model 0.989490808016328
Adjacent R2 Error from Coded Model Object 0.9894908080163279


In [None]:
# compare the fitted coefficients of both models
print("Coefficient of SKlearn Model", LR.coef_)
print("Coefficient of Coded Model", lrs.coef) #same values as sklearn

Coefficient of SKlearn Model [2.84941335 1.0180357  0.60411544 0.48164095 0.19413131]
Coefficient of Coded Model [2.84941335 1.0180357  0.60411544 0.48164095 0.19413131]


In [None]:
# compare the fitted intercepts of both models
print("Intercept of SKlearn Model", LR.intercept_)
print("Intercept of Coded Model", lrs.intercept) #same up to floating-point rounding, consistent with sklearn also using the OLS method

Intercept of SKlearn Model -34.04218455349236
Intercept of Coded Model -34.04218455349268
