In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train_data = pd.read_csv('train_concrete.csv')
test_data = pd.read_csv('test_concrete.csv')
train_data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,425.0,106.3,0.0,153.5,16.5,852.1,887.1,28,60.29
1,212.1,0.0,121.6,180.3,5.7,1057.6,779.3,100,39.61
2,165.0,0.0,143.6,163.8,0.0,1005.6,900.9,28,26.2
3,260.9,100.5,78.3,200.6,8.6,864.5,761.5,28,32.4
4,540.0,0.0,0.0,173.0,0.0,1125.0,613.0,7,52.61


In [3]:
test_data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,349.0,0.0,0.0,192.0,0.0,1047.0,806.0,360,42.13
1,276.0,116.0,90.0,180.0,9.0,870.0,768.0,28,44.28
2,139.7,163.9,127.7,236.7,5.8,868.6,655.6,28,35.23
3,296.0,0.0,107.0,221.0,11.0,819.0,778.0,28,31.42
4,385.0,0.0,0.0,186.0,0.0,966.0,763.0,14,27.92


In [4]:
corr = train_data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
cement,1.0,-0.272043,-0.394003,-0.08736,0.089716,-0.105651,-0.226412,0.090677,0.507032
slag,-0.272043,1.0,-0.320389,0.088269,0.080963,-0.282486,-0.277276,-0.056109,0.148958
flyash,-0.394003,-0.320389,1.0,-0.248135,0.364164,-0.017112,0.064488,-0.135302,-0.107144
water,-0.08736,0.088269,-0.248135,1.0,-0.652254,-0.175694,-0.421878,0.284981,-0.300018
superplasticizer,0.089716,0.080963,0.364164,-0.652254,1.0,-0.271644,0.192139,-0.185956,0.372355
coarseaggregate,-0.105651,-0.282486,-0.017112,-0.175694,-0.271644,1.0,-0.19046,-0.015311,-0.165144
fineaggregate,-0.226412,-0.277276,0.064488,-0.421878,0.192139,-0.19046,1.0,-0.171049,-0.187759
age,0.090677,-0.056109,-0.135302,0.284981,-0.185956,-0.015311,-0.171049,1.0,0.329084
csMPa,0.507032,0.148958,-0.107144,-0.300018,0.372355,-0.165144,-0.187759,0.329084,1.0


In [5]:
X = train_data.drop(labels=['csMPa'], axis=1)
y = train_data['csMPa']
X_test = test_data.drop(labels=['csMPa'], axis=1)
y_test = test_data['csMPa']

In [6]:
len(X_test)

206

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=69)

In [8]:
n_estimators = [10,50,100,200,300,500]
max_depths = [1,2,3,4,5,6,7]
mses = {}

for n_estimator in n_estimators:
    for max_depth in max_depths:
        regressor = RandomForestRegressor(n_estimators=n_estimator, max_depth=max_depth)
        regressor.fit(X_train, y_train)
        y_out = regressor.predict(X_val)
        mse = mean_squared_error(y_val,y_out)
        mses[f'estimators {n_estimator}, max_depths {max_depth}'] = mse

In [14]:
min_value = np.min(list(mses.values()))
best_hp = list(mses.keys())[list(mses.values()).index(min_value)]
print("Best HPs", best_hp)
n_estimators = best_hp.split(' ')[1].replace(',','')
max_depth = best_hp.split(' ')[3]

regressor = RandomForestRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth))
regressor.fit(X_train, y_train)
y_out = regressor.predict(X_val)
print(f'Validation error: {np.sqrt(mean_squared_error(y_val,y_out))}')
y_out = regressor.predict(X_test)
print(f'Test error: {np.sqrt(mean_squared_error(y_test,y_out))}')

Best HPs estimators 300, max_depths 7
Validation error: 6.118540214651163
Test error: 5.68091521448381


In [15]:
grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid={'n_estimators': [10,50,99,200,300,500], 'max_depth':[1,2,3,4,5,6,7]}, cv=5)
grid.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7],
                         'n_estimators': [10, 50, 99, 200, 300, 500]})

In [16]:
grid.best_estimator_

RandomForestRegressor(max_depth=7, n_estimators=99)

In [17]:
def evaluate(model, test_features, test_labels):
    y_out = model.predict(test_features)
    mse = mean_squared_error(test_labels,y_out)
    rmse = np.sqrt(mse)
    
    print('Model Performance')
    print(f'Root Mean Squared Error: {rmse}')

In [18]:
evaluate(grid.best_estimator_, X_val, y_val)
evaluate(grid.best_estimator_, X_test, y_test)

Model Performance
Root Mean Squared Error: 6.151391345419112
Model Performance
Root Mean Squared Error: 5.662935554411537
