## Evaluate complete data

In this notebook, we compute 5-fold cross-validated error metrics (MSE, R², and MAE) of a linear regression model on each of the complete datasets.

In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as ev

In [8]:
# Estimator and fold splitter shared by every call to perform_crossvalidation below.
# No `shuffle` argument is passed, so KFold's default applies.
lin = LinearRegression()
k_fold = KFold(random_state=None, n_splits=5)
def perform_crossvalidation(df, output_variable):
    """Cross-validate a linear regression on ``df`` and print the mean fold scores.

    For every (train, validation) split produced by the module-level
    ``k_fold``, the module-level ``lin`` estimator is refit on the training
    rows and scored on the validation rows with three metrics: mean squared
    error, R^2 and mean absolute error. The unweighted mean of each metric
    over the folds is printed on a single line.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the target column; every other column is used
        as a feature (presumably all numeric -- verify for new data sets).
    output_variable : str
        Name of the target column in ``df``.

    Returns
    -------
    None
        The averaged scores are printed, not returned.
    """
    # Split features/target once, outside the fold loop. `columns=` replaces the
    # positional axis argument (`df.drop(label, 1)`), which pandas 2.0 rejects.
    features = df.drop(columns=output_variable)
    target = df[output_variable]

    results_mse = []
    results_ev = []
    results_mae = []
    for train_indices, cv_indices in k_fold.split(df):
        lin.fit(features.iloc[train_indices], target.iloc[train_indices])
        pred = lin.predict(features.iloc[cv_indices])
        actual = target.iloc[cv_indices]
        results_mse.append(mse(actual, pred))
        # NOTE: `ev` is the notebook's import alias for sklearn's r2_score.
        results_ev.append(ev(actual, pred))
        results_mae.append(mae(actual, pred))

    print('mse: {}, r2: {}, mae: {}'.format(sum(results_mse) / len(results_mse),
                                            sum(results_ev) / len(results_ev),
                                            sum(results_mae) / len(results_mae)))

In [9]:
# Forest-fires data (tab-separated); the regression target is the 'area' column.
df_ff = pd.read_csv('Data/forest_fires.txt', delimiter='\t')
perform_crossvalidation(df=df_ff, output_variable='area')

mse: 4160.6025751093675, r2: -1.8710414674700513, mae: 21.477225077422165


In [11]:
# Slump-test data (tab-separated); the regression target is the 'SLUMP(cm)' column.
df_st = pd.read_csv('Data/slump_test.txt', delimiter='\t')
perform_crossvalidation(df=df_st, output_variable='SLUMP(cm)')

mse: 62.4683001757452, r2: 0.08927307721048724, mae: 6.482574773050885


In [13]:
# Custom data set, 'poor_small' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_poor_small.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 0.009882298302416548, r2: 0.9999999999984173, mae: 0.0792094551988691


In [14]:
# Custom data set, 'poor_medium' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_poor_medium.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 0.9810124061413189, r2: 0.9999999997392284, mae: 0.7901179139608527


In [15]:
# Custom data set, 'poor_large' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_poor_large.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 25.212702413687953, r2: 0.9999996851502756, mae: 3.9943339122478236


In [16]:
# Custom data set, 'rich_small' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_rich_small.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 0.01015222533775766, r2: 0.999999999998451, mae: 0.0802525668823947


In [17]:
# Custom data set, 'rich_medium' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_rich_medium.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 1.0000864263167935, r2: 0.999999999399113, mae: 0.7955176684396112


In [18]:
# Custom data set, 'rich_large' variant (tab-separated); the target column is 'y'.
df = pd.read_csv('Data/custom_dataset_rich_large.txt', delimiter='\t')
perform_crossvalidation(df=df, output_variable='y')

mse: 25.549638196673108, r2: 0.9999999754920941, mae: 4.049909569642503
