# Import Libraries

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn. preprocessing import FunctionTransformer, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
import zipfile
from xgboost import XGBRegressor


This notebook is working with percentage of total data to predict compressive strength. Age is not included

In [8]:
#load data 
with zipfile.ZipFile('concrete+compressive+strength.zip') as z:
    with z.open('Concrete_Data.xls') as f:
        data = pd.read_excel(f)

In [9]:
#rename some of the column names
data.columns = ['Cement', 'Blast_Furnace_Slag', 'Fly_Ash', 'Water', 'Superplasticizer', 'Coarse_Aggregate', 'Fine_Aggregate', 'Age (Days)', 'Compressive_Strength (MPa)']

In [11]:
#drop Age (days)
data = data.drop('Age (Days)', axis=1)

In [12]:
#drop duplicates
data = data.drop_duplicates()

In [13]:
#make function to calculate total
def get_percentages(dataframe):
    my_kg = dataframe.iloc[:, :-1].sum(axis=1)
    my_copy = dataframe.copy()
    my_copy.iloc[:, :-1] = dataframe.iloc[:, :-1].div(my_kg, axis=0) * 100
    return my_copy

In [14]:
#get df of percentage of value per total
percentage_df = get_percentages(data)
percentage_df

Unnamed: 0,Cement,Blast_Furnace_Slag,Fly_Ash,Water,Superplasticizer,Coarse_Aggregate,Fine_Aggregate,Compressive_Strength (MPa)
0,22.309440,0.000000,0.000000,6.692832,0.103284,42.966329,27.928114,79.986111
1,22.172039,0.000000,0.000000,6.651612,0.102648,43.317594,27.756108,61.887366
2,14.917003,6.393001,0.000000,10.228802,0.000000,41.812472,26.648721,40.269535
3,14.917003,6.393001,0.000000,10.228802,0.000000,41.812472,26.648721,41.052780
4,8.534961,5.689974,0.000000,8.251322,0.000000,42.047359,35.476385,44.296075
...,...,...,...,...,...,...,...,...
1025,11.967440,5.022515,3.909768,7.776238,0.385348,37.673190,33.265501,44.284354
1026,14.159525,0.000000,5.080202,8.613492,0.457042,35.943749,35.745990,31.178794
1027,6.548485,6.147198,4.788993,8.497597,0.268995,39.352648,34.396084,23.696601
1028,6.883870,8.078055,0.000000,7.597785,0.488924,42.817584,34.133783,32.768036


In [15]:
#declare variables and split data

X_var = data.iloc[:, 0:7]
y_var = data['Compressive_Strength (MPa)']

X_train, X_test, y_train, y_test = train_test_split(X_var, y_var, test_size=0.3, shuffle=True)

In [16]:
#function to eval models
def eval_model(actual_data, predict_data):
    r2 = r2_score(actual_data, predict_data)
    mae = mean_absolute_error(actual_data, predict_data)
    mse = mean_squared_error(actual_data, predict_data)
    explained_variance = explained_variance_score(actual_data, predict_data)
    print(f"R2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"Explained Variance: {explained_variance}")

In [17]:
#winsorization function
def winsorization(dataframe, lower_limit, upper_limit, axis=None):
    clipped = dataframe.clip(dataframe.quantile(lower_limit), dataframe.quantile(upper_limit), axis=axis)
    return clipped 

Apply Winsorization to percentages df

In [18]:
#declare columns to winsorize
columns_to_winsorize = 
columns_to_transform = list(percentage_df.columns)

#declare transformers
winsorizer = FunctionTransformer(winsorization, columns_to_transform, kw_args={'lower_limit': 0, 'upper_limit': 0.99, 'axis': 1})
quartiles = QuantileTransformer(output_distribution='normal')

#declare transformer
transformer = ColumnTransformer(transformers=[
    ('winsorize', winsorizer, columns_to_winsorize),
    ('quartiles', quartiles, columns_to_winsorize)
])

['Cement',
 'Blast_Furnace_Slag',
 'Fly_Ash',
 'Water',
 'Superplasticizer',
 'Coarse_Aggregate',
 'Fine_Aggregate',
 'Compressive_Strength (MPa)']

# Base model
Raw data compare standard scaler and robust scaler

In [20]:
#pipeline
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
])

In [21]:
#fit pipeline
pipeline.fit(X_train, y_train)

In [22]:
#get predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

In [23]:
#eval train model
eval_model(y_train, train_pred)

R2: 0.6828260460867349
MAE: 6.6368754451262655
MSE: 79.84950172543535
Explained Variance: 0.6828388678496158


In [24]:
#eval test model
eval_model(y_test, test_pred)

R2: 0.39913229397416194
MAE: 10.548173606171193
MSE: 177.4385067809451
Explained Variance: 0.4020269802412223


Yikes...

In [25]:
#pipeline with Robust Scaler
#pipeline
pipeline = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('regressor', RandomForestRegressor())
])

In [26]:
#fit pipeline
pipeline.fit(X_train, y_train)

In [27]:
#get predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

In [28]:
#eval train model
eval_model(y_train, train_pred)

R2: 0.6832211142643063
MAE: 6.602212516253485
MSE: 79.75004211742707
Explained Variance: 0.6832211413906625


In [29]:
#eval test model
eval_model(y_test, test_pred)

R2: 0.398851865181419
MAE: 10.546296154363553
MSE: 177.52131846435506
Explained Variance: 0.40215482195299557


Not much of a difference