# Import Libraries

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns
import zipfile

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

# Import Data

In [2]:
#load data 
with zipfile.ZipFile('concrete+compressive+strength.zip') as z:
    with z.open('Concrete_Data.xls') as f:
        data = pd.read_excel(f)

In [3]:
#drop duplicates
data = data.drop_duplicates()

In [6]:
data.columns = ['Cement', 'Blast_Furnace_Slag', 'Fly_Ash',
                'Water', 'Superplasticizer', 'Coarse_Aggregate',
                'Fine_Aggregate', 'Age (Days)', 'Compressive_Strength (MPa)']

In [21]:
#make function to calculate total
def get_percentages(dataframe):
    my_kg = dataframe.iloc[:, :-1].sum(axis=1)
    my_copy = dataframe.copy()
    my_copy.iloc[:, :-2] = dataframe.iloc[:, :-2].div(my_kg, axis=0) * 100
    return my_copy

#get df of percentage of value per total
percentage_df = get_percentages(data)
percentage_df

Unnamed: 0,Cement,Blast_Furnace_Slag,Fly_Ash,Water,Superplasticizer,Coarse_Aggregate,Fine_Aggregate,Age (Days),Compressive_Strength (MPa)
0,22.054319,0.000000,0.000000,6.616296,0.102103,42.474985,27.608740,28,79.986111
1,21.920032,0.000000,0.000000,6.576010,0.101482,42.825249,27.440633,28,61.887366
2,13.305322,5.702281,0.000000,9.123649,0.000000,37.294918,23.769508,270,40.269535
3,12.818042,5.493446,0.000000,8.789514,0.000000,35.929067,22.898998,365,41.052780
4,7.391418,4.927612,0.000000,7.145781,0.000000,36.413711,30.723138,360,44.296075
...,...,...,...,...,...,...,...,...,...
1025,11.824093,4.962355,3.862936,7.683094,0.380732,37.221937,32.867043,28,44.284354
1026,13.987410,0.000000,5.018450,8.508791,0.451487,35.506837,35.311483,28,31.178794
1027,6.468615,6.072222,4.730583,8.393954,0.265714,38.872675,33.976565,28,23.696601
1028,6.801471,7.981361,0.000000,7.506840,0.483071,42.305062,33.725205,28,32.768036


In [20]:
#functions to declare and split data

def declare_var(dataframe, target):
    X = dataframe.drop(target, axis=1)
    y = dataframe[target]
    return X, y

def split_data(X_var, y_var, testing_size):
    X_train, X_test, y_train, y_split = train_test_split(X_var, y_var, test_size=testing_size, shuffle=True)
    return X_train, X_test, y_train, y_split

def print_cross_scores(scores):
    for i in range(len(scores)):
        print(f"Fold {i+1}: ", '{:.4f}'.format(scores[i]))

In [11]:
#declare var and split data for raw data
X, y = declare_var(data, 'Compressive_Strength (MPa)')

X_train, X_test, y_train, y_test = split_data(X, y, 0.3)

In [22]:
#delcar var and split data for precentage_df
X_percent, y_percent = declare_var(percentage_df, 'Compressive_Strength (MPa)')

percent_train, percent_test, target_train, target_test = split_data(X_percent, y_percent, 0.3)

# Linear Regression

## Base model

In [26]:
#pipeline
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression())
])

In [13]:
#fit data
pipeline.fit(X_train, y_train)

In [19]:
#cross val scores for raw data
raw_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

print_cross_scores(raw_scores)

Fold 1:  0.5407
Fold 2:  0.6401
Fold 3:  0.5989
Fold 4:  0.5898
Fold 5:  0.5609


In [27]:
#fit pipeline on percent data
pipeline.fit(percent_train, target_train)

In [28]:
#cross val scores for percentage _df
percent_scores = cross_val_score(pipeline, percent_train, target_train)
print_cross_scores(percent_scores)

Fold 1:  0.6077
Fold 2:  0.5920
Fold 3:  0.6957
Fold 4:  0.6960
Fold 5:  0.7432


Well the base model does not have great cross validation scores for the raw data, but it does perform better on the percentage_df

# Random Forest

In [31]:
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

In [32]:
#fit pipeline on raw data
pipeline.fit(X_train, y_train)

In [33]:
raw_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print_cross_scores(raw_scores)

Fold 1:  0.8674
Fold 2:  0.8612
Fold 3:  0.8729
Fold 4:  0.8673
Fold 5:  0.9062


In [34]:
#fit pipeline on percent data
pipeline.fit(percent_train, target_train)

In [35]:
#cross val scores
percent_scores = cross_val_score(pipeline, percent_train, target_train, cv=5)
print_cross_scores(percent_scores)

Fold 1:  0.8649
Fold 2:  0.8457
Fold 3:  0.8934
Fold 4:  0.8860
Fold 5:  0.8869


Mhmm, the RandomForest appears much better, but the percent data performed worse. RandomForest shall be explored further.