In [5]:
import pandas as pd

df = pd.read_csv('./data/student/student-mat.csv', sep=';')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [6]:
'''
1. Perform regression using different machine learning models and compare the results on the testing data. 
'''

# 1.1. Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = df[['G1', 'G2']]
y = df['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Linear Regression')
print('MSE: ', mean_squared_error(y_test, y_pred))

# 1.2. Decision Tree
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('Decision Tree')
print('MSE: ', mean_squared_error(y_test, y_pred))

# 1.3. svm regression
from sklearn.svm import SVR 

svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

print('Support vector Regression')
print('MSE', mean_squared_error(y_test, y_pred))

Linear Regression
MSE:  2.2476304555298463
Decision Tree
MSE:  3.231066149733501
Support vector Regression
MSE 2.4086417869551293


In [7]:

'''
2. Try all means to increase/decrease the model complexity and study the training time and model accuracies for the training and testing data. 
'''

from sklearn.svm import SVR

svr_linear = SVR(kernel='linear')
svr_linear.fit(X_train, y_train)
y_pred = svr.predict(X_test)

print('Support vector Regression with linear kernel')
print('mean squared error', mean_squared_error(y_test, y_pred))

svr_poly = SVR(kernel='poly', degree=3)
svr_poly.fit(X_train, y_train)
y_pred = svr_poly.predict(X_test)

print('Support vector Regression with polynomial kernel')
print('mean squared error', mean_squared_error(y_test, y_pred))

svr_rbf = SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)
y_pred = svr_rbf.predict(X_test)

print('Support vector Regression with rbf kernel')
print('mean squared error', mean_squared_error(y_test, y_pred))

svm_sigmoid = SVR(kernel='sigmoid')
svm_sigmoid.fit(X_train, y_train)
y_pred = svm_sigmoid.predict(X_test)

print('Support vector Regression with sigmoid kernel')
print('mean squared error', mean_squared_error(y_test, y_pred))


Support vector Regression with linear kernel
mean squared error 2.4086417869551293
Support vector Regression with polynomial kernel
mean squared error 5.022753532327914
Support vector Regression with rbf kernel
mean squared error 2.4086417869551293
Support vector Regression with sigmoid kernel
mean squared error 30.159943084292703


In [11]:
'''
3. Augment the training data by adding noise to the training data to create new noisy samples and observe the effect of the error on the test data. Equivalently, augment the test data by adding noise and note the change in the average error. 
'''
import numpy as np

X_train = X_train + np.random.normal(0, 0.1, X_train.shape)

svr_rbf = SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)
y_pred = svr_rbf.predict(X_test)

print('Support vector Regression with rbf kernel')
print('mean squared error', mean_squared_error(y_test, y_pred))


Support vector Regression with rbf kernel
mean squared error 2.429756801065798


In [12]:
'''
4. Show the change in errors for different numbers of input attributes.  
Show the change in errors for different numbers of input data size.  
'''

from sklearn.metrics import r2_score

data_size_results = []

train_sizes = np.linspace(0.1, 1.0, 10)

for train_size in train_sizes:
    num_samples = int(train_size * X_train.shape[0])
    X_train_subset = X_train[:num_samples]
    y_train_subset = y_train[:num_samples]

    lr_model_subset = LinearRegression()
    lr_model_subset.fit(X_train_subset, y_train_subset)

    y_test_pred_full = lr_model_subset.predict(X_test)
    mse_full = mean_squared_error(y_test, y_test_pred_full)
    r2_full = r2_score(y_test, y_test_pred_full)

    data_size_results.append((num_samples, mse_full, r2_full))

data_size_results_df = pd.DataFrame(data_size_results, columns=['Training Samples', 'MSE', 'R^2'])

data_size_results_df

Unnamed: 0,Training Samples,MSE,R^2
0,31,2.743227,0.859156
1,63,2.38496,0.87755
2,94,2.366014,0.878523
3,126,2.270129,0.883446
4,158,2.238462,0.885072
5,189,2.226272,0.885697
6,221,2.216206,0.886214
7,252,2.228291,0.885594
8,284,2.252516,0.88435
9,316,2.25049,0.884454
