# Linear Regression

### Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### Data Processing

In [17]:
data = pd.read_csv("student-mat.csv", sep=";", usecols=['age', 'Medu', 'Fedu', 'freetime', 'health', 'absences', 'guardian', 'G1', 'G2', 'G3'])

In [18]:
data

Unnamed: 0,age,Medu,Fedu,guardian,freetime,health,absences,G1,G2,G3
0,18,4,4,mother,3,3,6,5,6,6
1,17,1,1,father,3,3,4,5,5,6
2,15,1,1,mother,3,3,10,7,8,10
3,15,4,2,mother,2,5,2,15,14,15
4,16,3,3,father,3,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,other,5,4,11,9,9,9
391,17,3,1,mother,4,2,3,14,16,16
392,21,1,1,other,5,3,3,10,8,7
393,18,3,2,mother,4,5,0,11,12,10


In [23]:
data_encoded = pd.get_dummies(data, columns=['guardian'], prefix='guardian', drop_first=True)

In [24]:
data_encoded

Unnamed: 0,age,Medu,Fedu,freetime,health,absences,G1,G2,G3,guardian_mother,guardian_other
0,18,4,4,3,3,6,5,6,6,True,False
1,17,1,1,3,3,4,5,5,6,False,False
2,15,1,1,3,3,10,7,8,10,True,False
3,15,4,2,2,5,2,15,14,15,True,False
4,16,3,3,3,5,4,6,10,10,False,False
...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,5,4,11,9,9,9,False,True
391,17,3,1,4,2,3,14,16,16,True,False
392,21,1,1,5,3,3,10,8,7,False,True
393,18,3,2,4,5,0,11,12,10,True,False


In [25]:
data_encoded = data_encoded.astype(int)

In [26]:
data_encoded

Unnamed: 0,age,Medu,Fedu,freetime,health,absences,G1,G2,G3,guardian_mother,guardian_other
0,18,4,4,3,3,6,5,6,6,1,0
1,17,1,1,3,3,4,5,5,6,0,0
2,15,1,1,3,3,10,7,8,10,1,0
3,15,4,2,2,5,2,15,14,15,1,0
4,16,3,3,3,5,4,6,10,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,5,4,11,9,9,9,0,1
391,17,3,1,4,2,3,14,16,16,1,0
392,21,1,1,5,3,3,10,8,7,0,1
393,18,3,2,4,5,0,11,12,10,1,0


In [4]:
X, y = np.array(data.drop(['G3'], axis=1)), np.array(data['G3'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(acc)

0.7977542373906126


In [10]:
prediction = model.predict(X_test)

In [11]:
print(model.coef_)
print(model.intercept_)

[-0.13849376  0.09521323 -0.10656259 -0.00595075  0.03967904  0.03044606
  0.14513003  0.97715622]
0.4837244533633047


In [12]:
y_mean = sum(y_test) / len(y_test)

# Step 2: residual sum of squares (SS_res)
ss_res = 0
for yi, yhat in zip(y_test, prediction):
    ss_res += (yi - yhat) ** 2

# Step 3: total sum of squares (SS_tot)
ss_tot = 0
for yi in y_test:
    ss_tot += (yi - y_mean) ** 2

# Step 4: R² score
r2 = 1 - (ss_res / ss_tot)

print("R² score:", r2)

R² score: 0.7977542373906126
