## Evaluate complete data

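In this notebook, we calculate evaluation error metrics (MSE, RMSE, explained variance, and MAE) for the complete datasets, using a linear regression model scored with 5-fold cross-validation.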

In [11]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import explained_variance_score as ev
from math import sqrt

In [12]:
# Module-level objects shared by perform_crossvalidation():
# the linear model that is re-fitted on every fold ...
lin = LinearRegression()
# ... and the 5-fold splitter that yields the train/validation row indices.
k_fold = KFold(n_splits=5, random_state=None)
def perform_crossvalidation(df, output_variable):
    """Cross-validate a linear regression on ``df`` and print the mean metrics.

    For every split produced by the module-level ``k_fold``, the feature
    columns are standardised with a ``StandardScaler`` fitted on the training
    rows only, the module-level ``lin`` model is fitted on the training rows,
    and the validation rows are scored. The per-fold scores are averaged
    over all folds and printed on a single line.

    Args:
        df: DataFrame holding the feature columns and the target column.
        output_variable: Name of the target column in ``df``; every other
            column is used as a feature.

    Returns:
        None. The averaged metrics are printed as ``mse``, ``rmse``, ``ev``
        (explained variance), ``mae`` and ``dif`` (training MSE minus
        validation MSE).
    """

    def mean(values):
        return sum(values) / len(values)

    # Split features and target once instead of once per fold. The column is
    # dropped by keyword: the positional form ``df.drop(name, 1)`` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    features = df.drop(columns=output_variable)
    target = df[output_variable]

    results_mse = []
    results_rmse = []
    results_ev = []
    results_mae = []
    results_dif = []
    for train_indices, cv_indices in k_fold.split(df):
        X_train = features.iloc[train_indices]
        y_train = target.iloc[train_indices]
        X_val = features.iloc[cv_indices]
        y_val = target.iloc[cv_indices]

        # The scaler is fitted on the training rows only and then applied to
        # both parts, so the validation rows do not influence the scaling.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = pd.DataFrame(data=scaler.transform(X_train), columns=X_train.columns)
        X_val = pd.DataFrame(data=scaler.transform(X_val), columns=X_val.columns)

        lin.fit(X_train, y_train)
        pred = lin.predict(X_val)

        # Validation MSE feeds three of the metrics; compute it once.
        val_mse = mse(y_val, pred)
        results_mse.append(val_mse)
        results_rmse.append(sqrt(val_mse))
        results_ev.append(ev(y_val, pred))
        results_mae.append(mae(y_val, pred))
        # Training MSE minus validation MSE for this fold.
        results_dif.append(mse(y_train, lin.predict(X_train)) - val_mse)

    print('mse: {}, rmse: {}, ev: {}, mae: {}, dif: {}'.format(mean(results_mse),
                                                      mean(results_rmse),
                                                      mean(results_ev),
                                                      mean(results_mae),
                                                      mean(results_dif)))

In [13]:
# Forest fires data (tab-separated file); the target column is 'area'.
df_ff = pd.read_csv('Data/forest_fires.txt', sep='\t')
perform_crossvalidation(df=df_ff, output_variable='area')

mse: 4160.6025751093675, rmse: 50.39409776536349, ev: -0.785222922286694, mae: 21.47722507742217


In [14]:
# Slump test data (tab-separated file); the target column is 'SLUMP(cm)'.
df_st = pd.read_csv('Data/slump_test.txt', sep='\t')
perform_crossvalidation(df=df_st, output_variable='SLUMP(cm)')

mse: 62.468300175745206, rmse: 7.841669613615873, ev: 0.14597547928759338, mae: 6.482574773050887


In [None]:
# Red wine quality data (tab-separated file); the target column is 'quality'.
df_rwq = pd.read_csv('Data/red_wine_quality.txt', sep='\t')
perform_crossvalidation(df=df_rwq, output_variable='quality')

In [None]:
# School alcohol consumption data (tab-separated file); the target column is 'Dalc'.
df_sac = pd.read_csv('Data/school_alcohol_consumption.txt', sep='\t')
perform_crossvalidation(df=df_sac, output_variable='Dalc')

In [16]:
# Custom dataset "poor_little" (tab-separated file); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_poor_little.txt', sep='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 0.010110726381626738, rmse: 0.10055144357437054, ev: 0.9999858989122185, mae: 0.08061058384105854


In [17]:
# Custom dataset "poor_much" (tab-separated file); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_poor_much.txt', sep='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 24.602855969289422, rmse: 4.959966904760806, ev: 0.99176344658143, mae: 3.9603103483557036


In [18]:
# Custom dataset "rich_little" (tab-separated file); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_rich_little.txt', sep='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 0.00978710161916523, rmse: 0.09891211936593731, ev: 0.9999999999985677, mae: 0.07904486715523447


In [19]:
# Custom dataset "rich_much" (tab-separated file); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_rich_much.txt', sep='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 24.866799186237536, rmse: 4.986148967423025, ev: 0.9999996356182048, mae: 3.9608002240642954
