In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
students_data = pd.read_csv('Student_Performance.csv')

In [3]:
students_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [4]:
students_data.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [5]:
students_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [6]:
students_data.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

### No NaN values in dataset.

## Lets encode the Extracurricular Activities feature since it is categorial.

In [7]:
encoder = OneHotEncoder(sparse_output=False, drop='first')
col_data = students_data[['Extracurricular Activities']]
col_data

Unnamed: 0,Extracurricular Activities
0,Yes
1,No
2,Yes
3,Yes
4,No
...,...
9995,Yes
9996,Yes
9997,Yes
9998,Yes


In [8]:
encoded_col = encoder.fit_transform(col_data)
new_col_names = encoder.get_feature_names_out(['Extracurricular Activities'])
encoded_col_df = pd.DataFrame(encoded_col, columns = new_col_names, index = students_data.index)
students_data = pd.concat([students_data, encoded_col_df], axis=1)

In [9]:
X = students_data.drop(columns = ['Performance Index', 'Extracurricular Activities'])
Y = students_data['Performance Index']

In [10]:
X.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Extracurricular Activities_Yes
0,7,99,9,1,1.0
1,4,82,4,2,0.0
2,8,51,7,2,1.0
3,5,52,5,2,1.0
4,7,75,8,5,0.0


### Splitting the data into test and train sets.

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

### Scaling data.

In [12]:
scaler = StandardScaler()

In [13]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

## Making my own gradient descent regressor for multi-linear regression.

In [14]:
class GDR:
    def __init__(self, lr, epochs):
        self.m = None
        self.b = None
        self.lr = lr
        self.epochs = epochs
        
    def fit(self, X, Y):
        N_samples, N_features = X.shape
        self.m = np.zeros(N_features)
        self.b = 0
        for i in range(self.epochs):
            Y_preds = np.dot(X, self.m) + self.b
            b_slope_loss = -(2 / N_samples) * sum(Y - Y_preds)
            m_slope_loss = -(2 / N_samples) * np.dot((Y - Y_preds), X)

            self.b -= self.lr * b_slope_loss
            self.m -= self.lr * m_slope_loss
        print(self.m,self.b)
        
    def predict(self,X):
        return np.dot(X, self.m) + self.b

    def r2_score(self, X, Y):
        Y_preds = self.predict(X)
        ss_res = np.sum((Y - Y_preds) ** 2)
        ss_tot = np.sum((Y - np.mean(Y)) ** 2)
        r2_score = 1 - (ss_res / ss_tot)
        return r2_score
        

In [15]:
gdr = GDR(0.001, 10000)

In [16]:
gdr.fit(X_train_scaled, Y_train)

[ 7.40134077 17.63727123  0.81003107  0.54884173  0.30429076] 55.31149988825492


In [17]:
Y_preds = gdr.predict(X_test_scaled)

### Using self-built r2 score to score the prediction.

In [18]:
print(gdr.r2_score(X_test_scaled, Y_test))

0.9884832863801489
