## Linear Regression

This notebook is a walk-through of Linear Regression using the **Student Performance** dataset from the [UCI-ML repository](https://archive.ics.uci.edu/ml/datasets/Student+Performance). The data attributes include student grades and demographic, social, and school-related features; the data was collected using school reports and questionnaires.

*This notebook accompanies the article* (TODO: add the article title and link).

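The notebook will load the data, one-hot encode the categorical columns, ordinal-encode the ordinal columns, fit a linear regression model to predict `G3`, and evaluate it on a held-out test set using $R^2$, mean squared error, and mean absolute error.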

In [1]:
import pandas as pd

In [2]:
# The source CSV uses ';' as its field separator, so it must be passed explicitly.
df_raw = pd.read_csv(
    "Data/student-mat.csv",
    sep=";",
)

In [3]:
# Preview the first five rows, transposed so each column of the wide frame is a row.
df_raw.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
school,GP,GP,GP,GP,GP
sex,F,F,F,F,F
age,18,17,15,15,16
address,U,U,U,U,U
famsize,GT3,GT3,LE3,GT3,GT3
Pstatus,A,T,T,T,T
Medu,4,1,1,4,3
Fedu,4,1,1,2,3
Mjob,at_home,at_home,at_home,health,other
Fjob,teacher,other,other,services,other


In [6]:
# Show every column name as a plain Python list.
list(df_raw.columns)

['school',
 'sex',
 'age',
 'address',
 'famsize',
 'Pstatus',
 'Medu',
 'Fedu',
 'Mjob',
 'Fjob',
 'reason',
 'guardian',
 'traveltime',
 'studytime',
 'failures',
 'schoolsup',
 'famsup',
 'paid',
 'activities',
 'nursery',
 'higher',
 'internet',
 'romantic',
 'famrel',
 'freetime',
 'goout',
 'Dalc',
 'Walc',
 'health',
 'absences',
 'G1',
 'G2',
 'G3']

In [7]:
# Inspect per-column dtypes: `object` columns hold string categories,
# `int64` columns are already numeric.
df_raw.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [6]:
# One-hot encode the non-numeric columns. With drop_first=True each k-level
# categorical yields k-1 indicator columns (its first level is dropped).
df_transformed = pd.get_dummies(
    data=df_raw,
    drop_first=True,
)

In [7]:
# Preview the first five encoded rows, transposed for readability.
df_transformed.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
age,18,17,15,15,16
Medu,4,1,1,4,3
Fedu,4,1,1,2,3
traveltime,2,1,1,1,1
studytime,2,2,2,3,2
failures,0,0,3,0,0
famrel,4,5,4,3,4
freetime,3,3,3,2,3
goout,4,3,2,2,2
Dalc,1,1,2,1,1


In [8]:
# Integer-coded columns that are treated as ordered levels and passed to the
# ordinal encoder.
ordinal_features = [
    'Medu',
    'Fedu',
    'traveltime',
    'studytime',
    'famrel',
    'freetime',
    'goout',
    'Dalc',
    'Walc',
    'health',
]

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Work on a copy so `df_transformed` stays untouched and this cell can be re-run.
df_ord_enc = df_transformed.copy()

# Learn the sorted set of levels per ordinal column, then replace each value by
# its 0-based position in that set (the encoder returns floats).
ord_enc = OrdinalEncoder()
ord_enc.fit(df_transformed[ordinal_features])
df_ord_enc[ordinal_features] = ord_enc.transform(df_transformed[ordinal_features])

In [10]:
# Preview the first five rows after ordinal encoding, transposed for readability.
df_ord_enc.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
age,18.0,17.0,15.0,15.0,16.0
Medu,4.0,1.0,1.0,4.0,3.0
Fedu,4.0,1.0,1.0,2.0,3.0
traveltime,1.0,0.0,0.0,0.0,0.0
studytime,1.0,1.0,1.0,2.0,1.0
failures,0.0,0.0,3.0,0.0,0.0
famrel,3.0,4.0,3.0,2.0,3.0
freetime,2.0,2.0,2.0,1.0,2.0
goout,3.0,2.0,1.0,1.0,1.0
Dalc,0.0,0.0,1.0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the target is `G3` and every other
# column is used as a feature.
# random_state is fixed so that "Restart & Run All" reproduces the same split,
# and therefore the same fitted model and metrics. Without it each execution
# shuffles differently and the reported scores drift between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_ord_enc.drop(columns=['G3']),
    df_ord_enc['G3'],
    test_size=0.1,
    random_state=42,
)

In [14]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with scikit-learn's default settings on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# Predict the target for the held-out rows; reused by the metric cells below.
y_pred = lin_reg.predict(X=X_test)

In [16]:
# R^2 of the fitted model on the held-out split.
# The value is stored as `test_r2` rather than `r2_score` so it cannot shadow
# the `sklearn.metrics.r2_score` function imported further down; with the old
# name, re-running this cell after that import made `r2_score(...)` fail with
# "'float' object is not callable".
test_r2 = lin_reg.score(X_test, y_test)
print(test_r2)

0.781236071059925


In [53]:
from sklearn.metrics import r2_score

In [54]:
# Coefficient of determination of the predictions on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.789801541140167

In [55]:
from sklearn.metrics import mean_squared_error

In [56]:
# Mean squared error of the predictions on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

3.5187222013136035

In [17]:
from sklearn import metrics

In [20]:
# Mean absolute error of the predictions on the held-out split.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.286392888192313