In [1]:
%matplotlib inline
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import svm

In [2]:
# Import cleaned data
df_train = pd.read_csv('cleaned_train_concrete.csv')
df_test = pd.read_csv('cleaned_test_concrete.csv')

In [3]:
# Split data into labels and features
X = df_train.drop(labels=['csMPa'], axis=1)
y = df_train['csMPa']
X_test = df_test.drop(labels=['csMPa'], axis=1)
y_test = df_test['csMPa']

In [4]:
# Scale data
scaler = MinMaxScaler() # Other options are RobustScaler(), StandardScaler()
X_train_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Use train-test split for training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.4, random_state=42)

In [6]:
# Fit models and test their efficiency
mae, mse, rmse, r2, algo = [], [], [], [], []

In [7]:
# Linear Regression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_val)
mae.append(mean_absolute_error(y_val, y_pred))
mse.append(mean_squared_error(y_val, y_pred))
rmse.append(np.sqrt(mse[-1]))
r2.append(r2_score(y_val, y_pred))
algo.append('Linear Regression')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

# Use cross-validation for training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {'mae' : 'neg_mean_absolute_error', 'mse' : 'neg_mean_squared_error', 
          'rmse' : 'neg_root_mean_squared_error', 'r2' : 'r2'}
scores = cross_validate(regressor,X_train_scaled,np.array(y),scoring=scoring,cv=cv)
mae.append(np.mean(np.absolute(scores['test_mae'])))
mse.append(np.mean(np.absolute(scores['test_mse'])))
rmse.append(np.mean(np.absolute(scores['test_rmse'])))
r2.append(np.mean(np.absolute(scores['test_r2'])))
algo.append('Linear Regression KFold')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

Model:  Linear Regression
Mean Absolute Error:  8.327635117307333
Mean Squared Error:  109.15333656855329
Root Mean Squared Error:  10.447647417890463
R2 Score:  0.625170585128179 

Model:  Linear Regression KFold
Mean Absolute Error:  8.345327674996469
Mean Squared Error:  110.55233978182946
Root Mean Squared Error:  10.506426967760149
R2 Score:  0.6214594816675862 



In [8]:
# Ridge Regression
regressor = Ridge()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_val)
mae.append(mean_absolute_error(y_val, y_pred))
mse.append(mean_squared_error(y_val, y_pred))
rmse.append(np.sqrt(mse[-1]))
r2.append(r2_score(y_val, y_pred))
algo.append('Ridge Regression')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

# Use cross-validation for training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {'mae' : 'neg_mean_absolute_error', 'mse' : 'neg_mean_squared_error', 
          'rmse' : 'neg_root_mean_squared_error', 'r2' : 'r2'}
scores = cross_validate(regressor,X_train_scaled,np.array(y),scoring=scoring,cv=cv)
mae.append(np.mean(np.absolute(scores['test_mae'])))
mse.append(np.mean(np.absolute(scores['test_mse'])))
rmse.append(np.mean(np.absolute(scores['test_rmse'])))
r2.append(np.mean(np.absolute(scores['test_r2'])))
algo.append('Ridge Regression KFold')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

clf = GridSearchCV(regressor,{'alpha':[0,0.25,0.5,0.75,1]},scoring = scoring,cv=cv,refit='rmse')
clf.fit(X,y)
idx = clf.best_index_
mae.append(np.absolute(clf.cv_results_['mean_test_mae'][idx]))
mse.append(np.absolute(clf.cv_results_['mean_test_mse'][idx]))
rmse.append(np.absolute(clf.cv_results_['mean_test_rmse'][idx]))
r2.append(np.absolute(clf.cv_results_['mean_test_r2'][idx]))
algo.append('Ridge Regression GridCV')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

Model:  Ridge Regression
Mean Absolute Error:  8.477560312220762
Mean Squared Error:  111.04429068754922
Root Mean Squared Error:  10.537755486228992
R2 Score:  0.6186771031306997 

Model:  Ridge Regression KFold
Mean Absolute Error:  8.456988026460511
Mean Squared Error:  111.52723953029616
Root Mean Squared Error:  10.554527053090538
R2 Score:  0.6180168583589779 

Model:  Ridge Regression GridCV
Mean Absolute Error:  8.345327147900468
Mean Squared Error:  110.55229549484068
Root Mean Squared Error:  10.506424765312483
R2 Score:  0.6214596429631435 



In [9]:
# Lasso Regression
regressor = Lasso()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_val)
mae.append(mean_absolute_error(y_val, y_pred))
mse.append(mean_squared_error(y_val, y_pred))
rmse.append(np.sqrt(mse[-1]))
r2.append(r2_score(y_val, y_pred))
algo.append('Lasso Regression')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

# Use cross-validation for training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {'mae' : 'neg_mean_absolute_error', 'mse' : 'neg_mean_squared_error', 
          'rmse' : 'neg_root_mean_squared_error', 'r2' : 'r2'}
scores = cross_validate(regressor,X_train_scaled,np.array(y),scoring=scoring,cv=cv)
mae.append(np.mean(np.absolute(scores['test_mae'])))
mse.append(np.mean(np.absolute(scores['test_mse'])))
rmse.append(np.mean(np.absolute(scores['test_rmse'])))
r2.append(np.mean(np.absolute(scores['test_r2'])))
algo.append('Lasso Regression KFold')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

clf = GridSearchCV(regressor,{'alpha':[0.01,0.25,0.5,0.75,1]},scoring = scoring,cv=cv,refit='rmse')
clf.fit(X,y)
idx = clf.best_index_
mae.append(np.absolute(clf.cv_results_['mean_test_mae'][idx]))
mse.append(np.absolute(clf.cv_results_['mean_test_mse'][idx]))
rmse.append(np.absolute(clf.cv_results_['mean_test_rmse'][idx]))
r2.append(np.absolute(clf.cv_results_['mean_test_r2'][idx]))
algo.append('Lasso Regression GridCV')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

Model:  Lasso Regression
Mean Absolute Error:  11.99926344590078
Mean Squared Error:  225.89195026656398
Root Mean Squared Error:  15.0297022680612
R2 Score:  0.22429354700033888 

Model:  Lasso Regression KFold
Mean Absolute Error:  12.377029961514552
Mean Squared Error:  231.71619279717842
Root Mean Squared Error:  15.220415312082471
R2 Score:  0.20541146301409166 

Model:  Lasso Regression GridCV
Mean Absolute Error:  8.346731710896508
Mean Squared Error:  110.54834043808162
Root Mean Squared Error:  10.506204099689132
R2 Score:  0.6214776499235091 



In [10]:
# Support Vector Regression
regressor = svm.SVR()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_val)
mae.append(mean_absolute_error(y_val, y_pred))
mse.append(mean_squared_error(y_val, y_pred))
rmse.append(np.sqrt(mse[-1]))
r2.append(r2_score(y_val, y_pred))
algo.append('SVR')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

# Use cross-validation for training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {'mae' : 'neg_mean_absolute_error', 'mse' : 'neg_mean_squared_error', 
          'rmse' : 'neg_root_mean_squared_error', 'r2' : 'r2'}
scores = cross_validate(regressor,X_train_scaled,np.array(y),scoring=scoring,cv=cv)
mae.append(np.mean(np.absolute(scores['test_mae'])))
mse.append(np.mean(np.absolute(scores['test_mse'])))
rmse.append(np.mean(np.absolute(scores['test_rmse'])))
r2.append(np.mean(np.absolute(scores['test_r2'])))
algo.append('SVR KFold')
print('Model: ',algo[-1])
print('Mean Absolute Error: ',mae[-1])
print('Mean Squared Error: ',mse[-1])
print('Root Mean Squared Error: ',rmse[-1])
print('R2 Score: ',r2[-1],'\n')

# param_grid = [{'kernel':['rbf','poly'],'gamma':[1e-2,1e-4,1e-7],'C': [1, 10, 100, 1000]},
#              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# clf = GridSearchCV(regressor,param_grid=param_grid,scoring=scoring,cv=cv,refit='rmse')
# clf.fit(X,y)
# idx = clf.best_index_
# mae.append(np.absolute(clf.cv_results_['mean_test_mae'][idx]))
# mse.append(np.absolute(clf.cv_results_['mean_test_mse'][idx]))
# rmse.append(np.absolute(clf.cv_results_['mean_test_rmse'][idx]))
# r2.append(np.absolute(clf.cv_results_['mean_test_r2'][idx]))
# algo.append('SVR GridCV')
# print('Model: ',algo[-1])
# print('Mean Absolute Error: ',mae[-1])
# print('Mean Squared Error: ',mse[-1])
# print('Root Mean Squared Error: ',rmse[-1])
# print('R2 Score: ',r2[-1],'\n')

Model:  SVR
Mean Absolute Error:  9.353791803474994
Mean Squared Error:  137.79485854441546
Root Mean Squared Error:  11.738605476989823
R2 Score:  0.526816423352205 

Model:  SVR KFold
Mean Absolute Error:  8.808055974637131
Mean Squared Error:  121.91985305523698
Root Mean Squared Error:  11.040612171423465
R2 Score:  0.5817415053712841 

