In [1]:
import pandas as pd

In [2]:
# Load the concrete-mix dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="concrete_data.csv")

In [4]:
# Eyeball five randomly drawn rows (unseeded, so the rows differ between runs).
data.sample(n=5)

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
517,202.0,11.0,141.0,206.0,1.7,942.0,801.0,7,15.07
362,218.2,54.6,123.8,140.8,11.9,1075.7,792.7,56,61.99
563,210.7,316.1,0.0,185.7,0.0,977.0,689.3,7,21.82
746,500.0,0.0,0.0,200.0,0.0,1125.0,613.0,1,12.64
694,236.0,157.0,0.0,192.0,0.0,972.6,749.1,28,32.88


In [5]:
# Print a per-column summary: dtype, non-null count, and total memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   cement                         1030 non-null   float64
 1   blast_furnace_slag             1030 non-null   float64
 2   fly_ash                        1030 non-null   float64
 3   water                          1030 non-null   float64
 4   superplasticizer               1030 non-null   float64
 5   coarse_aggregate               1030 non-null   float64
 6   fine_aggregate                 1030 non-null   float64
 7   age                            1030 non-null   int64  
 8   concrete_compressive_strength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [7]:
# Count the missing values in each column.
data.isna().sum()

cement                           0
blast_furnace_slag               0
fly_ash                          0
water                            0
superplasticizer                 0
coarse_aggregate                 0
fine_aggregate                   0
age                              0
concrete_compressive_strength    0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [10]:
# Feature matrix: every column except the regression target.
X = data.drop('concrete_compressive_strength', axis=1)

In [11]:
# Target vector: the compressive-strength column.
Y = data.loc[:, 'concrete_compressive_strength']

In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Inspect five randomly drawn training rows (unseeded, so they differ between runs).
X_train.sample(n=5)

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
288,181.4,0.0,167.0,169.6,7.6,1055.6,777.8,100
315,251.8,0.0,99.9,146.1,12.4,1006.0,899.8,14
460,178.0,129.8,118.6,179.9,3.6,1007.3,746.8,100
182,362.6,189.0,0.0,164.9,11.6,944.7,755.8,91
144,475.0,118.8,0.0,181.1,8.9,852.1,781.5,56


In [19]:
# Labels of the int64/float64 feature columns; these are the columns routed
# to the scaler in the preprocessing step.
numeric_features = X.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_features.columns

In [20]:
# Preprocessing step: standardise the numeric columns; any column not listed
# in num_cols is passed through unchanged.
scaling_steps = [("trf1", StandardScaler(), num_cols)]
trf1 = ColumnTransformer(transformers=scaling_steps, remainder='passthrough')

In [21]:
# Final pipeline step: a decision-tree regressor.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn randomly permutes the candidate features at every split, even
# with splitter="best". Without a seed, the baseline metrics and the grid
# search built on this estimator (notably the max_features='sqrt'/'log2'
# candidates) change from run to run.
trf2=DecisionTreeRegressor(random_state=42)

In [22]:
# Chain the preprocessing transformer and the regressor so they are fitted
# and applied as a single estimator.
pipe = Pipeline(steps=[("trf1", trf1), ("trf2", trf2)])

In [23]:
# Fit the scaler and the tree on the training split only.
pipe.fit(X=X_train, y=Y_train)

In [24]:
# Baseline predictions for the held-out split.
Y_pred = pipe.predict(X=X_test)

In [25]:
# Score the baseline predictions against the held-out targets:
# R^2, mean squared error and mean absolute error, in that order.
r2, MSE, MAE = (
    score_fn(Y_test, Y_pred)
    for score_fn in (r2_score, mean_squared_error, mean_absolute_error)
)

In [26]:
# Report the baseline test-set metrics, one per line.
for label, value in (("r2 score is", r2), ("MSE is", MSE), ("MAE is", MAE)):
    print(label, value)

r2 score is 0.8212643573593421
MSE is 48.36126537216829
MAE is 4.466343042071197


In [30]:
# Search space for the tree step of the pipeline. The "trf2__" prefix routes
# each setting to the pipeline step named "trf2".
param_grid = dict(
    trf2__max_depth=[3, 5, 7, 10, 15, 20, None],      # None = no depth limit
    trf2__min_samples_split=[2, 5, 10, 20, 50],
    trf2__min_samples_leaf=[1, 2, 5, 10, 20],
    trf2__max_features=[None, 'sqrt', 'log2'],        # None = consider all features
    trf2__criterion=['squared_error', 'absolute_error'],
)


In [48]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by R^2 and running the fits on all available cores.
grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=1)

In [49]:
# Run the search on the training split; the test split stays untouched.
grid.fit(X=X_train, y=Y_train)

Fitting 5 folds for each of 1050 candidates, totalling 5250 fits


In [50]:
# Pull out the winning pipeline, its hyperparameters and its mean
# cross-validated score from the finished search.
best_model, best_params, best_score = (
    grid.best_estimator_,
    grid.best_params_,
    grid.best_score_,
)

In [51]:
# Show the selected hyperparameters and the cross-validated score, one per line.
print(best_params, best_score, sep="\n")

{'trf2__criterion': 'absolute_error', 'trf2__max_depth': 20, 'trf2__max_features': None, 'trf2__min_samples_leaf': 1, 'trf2__min_samples_split': 2}
0.8156067483445797


In [52]:
# Predictions of the tuned pipeline for the held-out split.
Y_pred1 = best_model.predict(X=X_test)

In [53]:
# Score the tuned model on the held-out split.
# NOTE: these names are the ones used for the baseline metrics, so the
# baseline values are overwritten here.
r2 = r2_score(y_true=Y_test, y_pred=Y_pred1)
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_pred1)
MAE = mean_absolute_error(y_true=Y_test, y_pred=Y_pred1)

In [54]:
# Report the tuned model's test-set metrics, one per line.
for label, value in (("r2 score is", r2), ("MSE is", MSE), ("MAE is", MAE)):
    print(label, value)

r2 score is 0.8114913512852042
MSE is 51.005589320388346
MAE is 4.800938511326861
