In [1]:
#Fetch the dataset and its description from the UCI repository; -O saves each file under its remote name in the working directory
#NOTE(review): without --fail, curl would save an HTTP error page under the data file's name - confirm the download if a later cell cannot parse it
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls -O
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Readme.txt -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  122k  100  122k    0     0   158k      0 --:--:-- --:--:-- --:--:--  160k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3808  100  3808    0     0   9242      0 --:--:-- --:--:-- --:--:--  9356


In [2]:
#List the working directory to confirm both downloaded files are present
!ls

Concrete-Compressive-Strength.ipynb  Concrete_Readme.txt
Concrete_Data.xls		     README.md


In [3]:
#Print the dataset description that was downloaded alongside the data
!cat Concrete_Readme.txt

Concrete Compressive Strength 

---------------------------------

Data Type: multivariate
 
Abstract: Concrete is the most important material in civil engineering. The 
concrete compressive strength is a highly nonlinear function of age and 
ingredients. These ingredients include cement, blast furnace slag, fly ash, 
water, superplasticizer, coarse aggregate, and fine aggregate.

---------------------------------

Sources: 

  Original Owner and Donor
  Prof. I-Cheng Yeh
  Department of Information Management 
  Chung-Hua University, 
  Hsin Chu, Taiwan 30067, R.O.C.
  e-mail:icyeh@chu.edu.tw
  TEL:886-3-5186511

  Date Donated: August 3, 2007
 
---------------------------------

Data Characteristics:
    
The actual concrete compressive strength (MPa) for a given mixture under a 
specific age (days) was determined from laboratory. Data is in raw form (not scaled). 

Summary Statistics: 

Number of instances (observat

In [4]:
#Mean imputation isn't needed because there are no missing values
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
#Load the spreadsheet, replacing its long header labels with short column names
df = pd.read_excel(
    "Concrete_Data.xls",
    names=[
        "Cement",
        "Blast Furnace Slag",
        "Fly Ash",
        "Water",
        "Superplasticizer",
        "Coarse Aggregate",
        "Fine Aggregate",
        "Age",
        "Concrete compressive strength",
    ],
)

In [7]:
#Preview the first rows to check that the short column names were applied
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [None]:
#Inspect the column dtypes before any casting
df.dtypes

In [1]:
#Cast Age to float so that every column shares one dtype and nothing varies in a way that may mess with the model later
#NOTE(review): in the preview above Age looks like the only integer-valued column - confirm with df.dtypes
df["Age"] = df["Age"].astype(float)

NameError: name 'df' is not defined

In [2]:
#Re-check the dtypes now that Age has been cast
df.dtypes

NameError: name 'df' is not defined

In [3]:
#Separate the answer (target) from the attributes so it cannot interfere with feature selection or the later test/train split
from sklearn.utils.multiclass import type_of_target
y = df["Concrete compressive strength"]
x = df.drop(columns = ["Concrete compressive strength"])

NameError: name 'df' is not defined

In [4]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

In [5]:
#Binning to remove noisy data and squash outliers, done on all attributes
def binning(string):
    """Overwrite column ``string`` of the module-level frame ``x`` with its discretized values.

    The module-level discretizer ``est`` is re-fitted on this single column
    before transforming it, so each call uses bin edges learned from that
    column alone. ``x`` is modified in place and nothing is returned.
    """
    #The discretizer expects 2-D input, so wrap every value in its own row
    column = [[value] for value in x[string]]
    x[string] = est.fit(column).transform(column)

#Quantile strategy: up to 5 bins holding roughly equal numbers of samples; ordinal encoding replaces each value with the integer index of its bin (no averaging takes place)
est = KBinsDiscretizer(n_bins = 5, encode = "ordinal", strategy = "quantile")
names = list(x) #Iterating a DataFrame yields its column labels
for string in names:
    binning(string) #Overwrites column `string` of x in place

x_bin = x.copy() #Independent snapshot of the binned features

NameError: name 'x' is not defined

In [6]:
#Used to check if feature selection is removing the insignificant and correlated data
fig, ax = plt.subplots(figsize = (16, 10)) #plt.subplots returns (figure, axes) in that order
correlation_matrix = x_bin.corr()
sns.heatmap(correlation_matrix, annot = True, cmap = "OrRd", ax = ax) #Draw explicitly on the axes created above
ax.set_title("Correlation between binned features")
plt.show()

NameError: name 'plt' is not defined

In [7]:
#Feature selection using recursive feature elimination with cross-validation (RFECV) to find attributes of the concrete that matter when calculating its strength
def feature_sel(x):
    """Run RFECV on ``x`` and drop the single worst-ranked feature.

    The module-level target ``y`` is label-encoded so that a decision-tree
    classifier can serve as the ranking estimator. A plot of the
    cross-validation score against the number of selected features is shown
    as a side effect.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate feature columns, row-aligned with the module-level ``y``.

    Returns
    -------
    tuple
        ``(frame, count)``. If RFECV ranks any feature worse than 1,
        ``frame`` is ``x`` without the worst-ranked column and ``count`` is
        the number of features whose rank is greater than 1. Otherwise
        ``(x, 0)``.
    """
    min_features = 2

    #NOTE(review): y is continuous, so label-encoding it gives close to one class per distinct value; a regressor with plain KFold may suit this better - confirm intent
    enc = LabelEncoder()
    strength = enc.fit_transform(y)
    clf = DecisionTreeClassifier()
    #Using 3-fold cross validation decision tree
    rfecv = RFECV(estimator = clf, step = 1, cv = StratifiedKFold(3), scoring = "accuracy", min_features_to_select = min_features)
    rfecv.fit(x, strength)

    features = list(x.columns)
    print("Optimal number of features: ", rfecv.n_features_) #We can see if each iteration causes the number of optimal features to change and if this corresponds to the number removed

    #grid_scores_ was removed in scikit-learn 1.2; cv_results_ is its replacement, so prefer it when present
    if hasattr(rfecv, "cv_results_"):
        scores = rfecv.cv_results_["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    #The first score belongs to the smallest subset tried, which has min_features columns (not 1)
    plt.plot(range(min_features, min_features + len(scores)), scores)
    plt.show()

    count = 0
    greatest = 1
    remove = None
    for name, rank in zip(features, rfecv.ranking_):
        if rank > 1:
            count += 1
        if rank > greatest:
            #Assignment, not comparison: remember the worst rank seen so far so that only the least useful feature is dropped
            greatest = rank
            remove = name

    if remove is None:
        return x, 0
    return x.drop(remove, axis = 1), count
    
x_binFeature, check = feature_sel(x_bin)
while check > 0: #Each pass removes one attribute and runs the feature selection again; stop once no remaining feature is ranked worse than 1
    x_binFeature, check = feature_sel(x_binFeature)

#Derive the totals from the frames themselves instead of a hard-coded 8, so the summary stays correct if the input columns change
n_before = x_bin.shape[1]
count = n_before - x_binFeature.shape[1]
print("Features beforehand:", n_before, ", features removed :", count, ", end optimal number of features : ", n_before - count) #EXAMPLE: 8 features with 2 removed leaves 6

NameError: name 'x_bin' is not defined

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#Train on 70% of the data and test on 30%; the seed is fixed so the split (and every score derived from it) is reproducible on re-run
train_x, test_x, train_y, test_y = train_test_split(x_binFeature, y, test_size = 0.3, random_state = 42)

NameError: name 'x_binFeature' is not defined

In [10]:
from sklearn.linear_model import SGDRegressor

In [11]:
#Using a linear model fitted with Stochastic Gradient Descent; the default squared-error loss is convex, so local minima are not a concern
#The seed is fixed because SGD shuffles the training data, which would otherwise change the fitted coefficients on every run
model = SGDRegressor(random_state = 42)

model.fit(train_x, train_y)

NameError: name 'train_x' is not defined

In [12]:
print(model.coef_) #A bare expression is only displayed when it is the last line of a cell, so print it explicitly

predicted = model.predict(test_x)
#Visualisation of model prediction accuracy; a scatter is used because the test rows are unordered, so joining them with a line would be meaningless
plt.figure()
plt.scatter(test_y, predicted)
plt.xlabel("Actual compressive strength (MPa)")
plt.ylabel("Predicted compressive strength (MPa)")
plt.show()

AttributeError: 'SGDRegressor' object has no attribute 'coef_'

In [13]:
#For a regressor, score() returns the coefficient of determination R^2 on the held-out data (1.0 is a perfect fit), not a classification accuracy
model.score(test_x, test_y)

NameError: name 'test_x' is not defined

In [None]:
#Compared to auto-mpg, on which I got an accuracy of 0.9959822026464998, this model is more inconsistent and less precise