# Introduction to Machine Learning - Task 1a

Group name: Cbbayes

Team members: mcolomer (mcolomer@student.ethz.ch), pratsink (pratsink@student.ethz.ch) and scastro (scastro@student.ethz.ch)

Fall 2021

## Library import

In [27]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np

## Opening the data

In [28]:
# Read the training set from the data folder
path = "data/"          # Path to the data folder
filename = "train.csv"  # Train data file name
train_data = pd.read_csv(path + filename)

# Column 0 is used as the regression target, columns 1 to 13 as the features
y = train_data.iloc[:, 0].values
X = train_data.iloc[:, 1:14].values

## k-fold cross-validation and ridge regression


In [34]:
def ridge_regression(X_train, y_train, X_test, alpha):
    """
    Fits a ridge regression model on the training data and predicts
    the targets of the test data
    :X_train: training features
    :y_train: training targets
    :X_test: test features
    :alpha: strength of the ridge regularisation
    :return: predictions of the fitted model for X_test
    """
    # fit() returns the fitted estimator, so it can be bound directly
    model = Ridge(alpha=alpha).fit(X_train, y_train)
    return model.predict(X_test)


def cross_validation_analysis(X, y, n_kfold, alpha_parameters):
    """
    Performs the k-fold cross-validation analysis of
    the ridge regression
    :X: features, indexable by an array of row indices
    :y: targets, aligned with the rows of X
    :n_kfold: number of splits of the k-fold cross-validation
    :alpha_parameters: ridge strength, or sequence of ridge strengths,
        all evaluated on the same folds
    :return: RMSE averaged over the folds; an array with one entry per
        ridge strength (in the given order), or a single value when a
        scalar ridge strength was passed
    """
    # Accept a single strength as well as a sequence of strengths
    scalar_input = np.ndim(alpha_parameters) == 0
    alphas = [alpha_parameters] if scalar_input else alpha_parameters

    cv = KFold(n_splits=n_kfold)
    # errors[i] collects the per-fold RMSE obtained with alphas[i]
    errors = [[] for _ in alphas]
    for train_index, test_index in cv.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        # Every strength is scored on the same split so results are comparable
        for fold_errors, alpha in zip(errors, alphas):
            y_pred = ridge_regression(X_train, y_train, X_test, alpha)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            fold_errors.append(rmse)

    # Average over the folds (axis 1), keeping one value per strength
    mean_errors = np.mean(errors, axis=1)
    return mean_errors[0] if scalar_input else mean_errors

## Analyse the data

In [35]:
# Ridge strengths to evaluate
alpha_parameters = [0.1, 1, 10, 100, 200]

# RMSE averaged over a 10-fold cross-validation, one value per ridge strength
RSME_alphas = cross_validation_analysis(X, y, 10, alpha_parameters)

# Output file: one RMSE per line, in the order of alpha_parameters.
# Enough decimals are written to keep the results distinguishable; with a
# single decimal the scores of several strengths collapse to the same value.
np.savetxt('output.csv', RSME_alphas, fmt='%.12f')


In [36]:
# Show the cross-validated RMSE of each ridge strength, in the order of alpha_parameters
RSME_alphas

array([5.50180945, 5.49983874, 5.48363149, 5.63664214, 5.72123372])