In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Set display options to show all columns and rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

#Linear regression models import
from sklearn.linear_model import LinearRegression

# to split the data
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('concrete.csv')
df.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


In [3]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
std_scale

df["cement"] = std_scale.fit_transform(df[["cement"]])
df["slag"] = std_scale.fit_transform(df[["slag"]])
df["ash"] = std_scale.fit_transform(df[["ash"]])
df["water"] = std_scale.fit_transform(df[["water"]])
df["superplastic"] = std_scale.fit_transform(df[["superplastic"]])
df["coarseagg"] = std_scale.fit_transform(df[["coarseagg"]])
df["fineagg"] = std_scale.fit_transform(df[["fineagg"]])
df["age"] = std_scale.fit_transform(df[["age"]])
df["strength"] = std_scale.fit_transform(df[["strength"]])

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assigned X and Y
X = df.drop("strength", axis = 1)
Y = df[["strength"]]

# Split train test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

model = Sequential()

# Units is number of neurons
# Adding input layer
# input_dim is number of independent variables
model.add(Dense(units = 20, activation = "relu", input_dim = 8, kernel_initializer='uniform'))
# Adding hidden layers (Hidden Layers includes input layer but input_dim is passed for input)
model.add(Dense(units = 20, activation = "relu", kernel_initializer='uniform'))
model.add(Dense(units = 20, activation = "relu", kernel_initializer='uniform'))
model.add(Dense(units = 20, activation = "relu", kernel_initializer='uniform'))
# Adding output layer
# For multi class use softmax for binary use sigmoid
model.add(Dense(units = 1))

model.compile(optimizer = "adam", loss = "mean_squared_error")

model.fit(X_train, Y_train, batch_size = 10, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x145c54210>

In [9]:
from sklearn.metrics import r2_score
Y_pred = model.predict(X_test)
r2_score(Y_test, Y_pred)



0.8290412716322604

In [3]:
df.shape

(1030, 9)

In [5]:
df.dtypes

cement          float64
slag            float64
ash             float64
water           float64
superplastic    float64
coarseagg       float64
fineagg         float64
age               int64
strength        float64
dtype: object

In [6]:
df.isnull().sum()

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

In [7]:
df.isna().sum()

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

In [8]:
cleaned_df = df

In [9]:
numeric_columns = df.select_dtypes(include='number')

def remove_outlier(col):
    sorted(col)
    Q1,Q3 = col.quantile([0.25,0.75])
    IQR = Q3-Q1
    lower_range = Q1-1.5*IQR
    higher_range = Q3+1.5*IQR
    return lower_range, higher_range

outliers_columns = numeric_columns

cleaned_df = df
for outlier_col in outliers_columns:
    lower, upper = remove_outlier(df[outlier_col])
    cleaned_df[outlier_col] = np.where(cleaned_df[outlier_col]>upper, upper, cleaned_df[outlier_col])
    cleaned_df[outlier_col] = np.where(cleaned_df[outlier_col]<lower, lower, cleaned_df[outlier_col])
    
cleaned_df.sample(5)

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
988,238.0,0.0,0.0,185.0,0.0,1118.0,789.0,28.0,17.54
386,213.8,98.1,24.5,181.7,6.7,1066.0,785.5,56.0,47.13
777,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,56.0,36.3
337,286.3,200.9,0.0,144.7,11.2,1004.6,803.7,91.0,76.8
238,149.0,153.0,194.0,192.0,8.0,935.0,623.0,28.0,24.58


In [10]:
# KNN Model 
Y = cleaned_df[["strength"]]
X = cleaned_df.drop(["strength"], axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)

In [10]:
from sklearn.neighbors import KNeighborsRegressor
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n)
    model_knn.fit(X_train, Y_train)
    print("\n n_neighbors: usual ", n)
    print("Train:: ",  model_knn.score(X_train,Y_train))
    print("Test:: ", model_knn.score(X_test, Y_test))


 n_neighbors: usual  5
Train::  0.7895227158003622
Test::  0.7044570736421931

 n_neighbors: usual  6
Train::  0.7676299715891652
Test::  0.7016036091533768

 n_neighbors: usual  7
Train::  0.7543294959480524
Test::  0.705946667733443

 n_neighbors: usual  8
Train::  0.7443693590796229
Test::  0.7009947537224389

 n_neighbors: usual  9
Train::  0.7262315028447269
Test::  0.6903974876803508

 n_neighbors: usual  10
Train::  0.7095360143991752
Test::  0.6731700424458986

 n_neighbors: usual  11
Train::  0.6968675999255034
Test::  0.6644165203069112

 n_neighbors: usual  12
Train::  0.6864764078277574
Test::  0.6536088965263288

 n_neighbors: usual  13
Train::  0.6797076881618154
Test::  0.6398126497198666

 n_neighbors: usual  14
Train::  0.6717631013918635
Test::  0.6285571757713768

 n_neighbors: usual  15
Train::  0.6595442468752452
Test::  0.619869410844651

 n_neighbors: usual  16
Train::  0.6509409778821134
Test::  0.6093736787478966

 n_neighbors: usual  17
Train::  0.64388359809

In [11]:
from sklearn.neighbors import KNeighborsRegressor
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n, p=1)
    model_knn.fit(X_train, Y_train)
    print("\n n_neighbors: usual p=1 ", n)
    print("Train:: ",  model_knn.score(X_train,Y_train))
    print("Test:: ", model_knn.score(X_test, Y_test))


 n_neighbors: usual p=1  5
Train::  0.7766348073161424
Test::  0.6970609601410619

 n_neighbors: usual p=1  6
Train::  0.7746279419171644
Test::  0.6981037819080547

 n_neighbors: usual p=1  7
Train::  0.7674182210140682
Test::  0.687794575728133

 n_neighbors: usual p=1  8
Train::  0.7574264020099873
Test::  0.6842314643659091

 n_neighbors: usual p=1  9
Train::  0.7475247441246728
Test::  0.6859961796797565

 n_neighbors: usual p=1  10
Train::  0.7377337211502262
Test::  0.6754776670541989

 n_neighbors: usual p=1  11
Train::  0.7258036043747207
Test::  0.6693895784198445

 n_neighbors: usual p=1  12
Train::  0.7118886689919988
Test::  0.6610987369409523

 n_neighbors: usual p=1  13
Train::  0.6984832115105015
Test::  0.6455272141638014

 n_neighbors: usual p=1  14
Train::  0.6880407591123201
Test::  0.6425429633240549

 n_neighbors: usual p=1  15
Train::  0.6774666240125027
Test::  0.6378863380089933

 n_neighbors: usual p=1  16
Train::  0.6737902266893779
Test::  0.632510982390070

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

Y=df["strength"]
# X=df.drop(["test"])
X = df.drop(["strength"], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)

# Instantiate the model
model_dtr = DecisionTreeRegressor(max_depth=5)
model_dtr.fit(X_train, Y_train)
# Fit the model with the train data
print("DecisionTreeRegression")
print("Test::", model_dtr.score(X_test, Y_test))
print("Train::", model_dtr.score(X_train, Y_train))

# DecisionTreeRegression
# Test:: 0.8292300666858522
# Train:: 0.9434268066005124

# DecisionTreeRegression - No
# Test:: 0.8677230116874993
# Train:: 0.9946689474106676
# df["strength"]
# 
# model_dtr = DecisionTreeClassifier(max_depth=2)
# model_dtr.fit(X_train, Y_train)
# # Fit the model with the train data
# print("DecisionTreeClassifier")
# print("Test::", model_dtr.score(X_test, Y_test))
# print("Train::", model_dtr.score(X_train, Y_train))

DecisionTreeRegression
Test:: 0.7291713850821487
Train:: 0.8015312828740121


In [14]:
from sklearn import tree


train_char_label = df["strength"].unique()
# train_char_label
Credit_Tree_File = open('concrete','w')
dot_data = tree.export_graphviz(model_dtr, out_file=Credit_Tree_File, feature_names = list(X_train), class_names = list(train_char_label))
Credit_Tree_File.close()
# Check in concrete.text copy http://www.webgraphviz.com/?tab=map in generate graph

In [100]:
# fig = plt.figure(figsize=(25,20))
# _ = model_dtr.plot_tree(clf, 
#                    feature_names=iris.feature_names,  
#                    class_names=iris.target_names,
#                    filled=True)


In [15]:
cleaned_Y = cleaned_df[["strength"]]
cleaned_X = cleaned_df.drop(["strength"], axis = 1)
X_cleaned_train, X_cleaned_test, Y_cleaned_train, Y_cleaned_test = train_test_split(cleaned_X, cleaned_Y, test_size = 0.30, random_state = 1)

In [13]:
from sklearn.neighbors import KNeighborsRegressor
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n)
    model_knn.fit(X_train, Y_train)
    print("\n Outlier removed n_neighbors: ", n)
    print("Train:: ",  model_knn.score(X_train,Y_train))
    print("Test:: ", model_knn.score(X_test, Y_test))


 Outlier removed n_neighbors:  5
Train::  0.7895227158003622
Test::  0.7044570736421931

 Outlier removed n_neighbors:  6
Train::  0.7676299715891652
Test::  0.7016036091533768

 Outlier removed n_neighbors:  7
Train::  0.7543294959480524
Test::  0.705946667733443

 Outlier removed n_neighbors:  8
Train::  0.7443693590796229
Test::  0.7009947537224389

 Outlier removed n_neighbors:  9
Train::  0.7262315028447269
Test::  0.6903974876803508

 Outlier removed n_neighbors:  10
Train::  0.7095360143991752
Test::  0.6731700424458986

 Outlier removed n_neighbors:  11
Train::  0.6968675999255034
Test::  0.6644165203069112

 Outlier removed n_neighbors:  12
Train::  0.6864764078277574
Test::  0.6536088965263288

 Outlier removed n_neighbors:  13
Train::  0.6797076881618154
Test::  0.6398126497198666

 Outlier removed n_neighbors:  14
Train::  0.6717631013918635
Test::  0.6285571757713768

 Outlier removed n_neighbors:  15
Train::  0.6595442468752452
Test::  0.619869410844651

 Outlier removed

In [16]:
from sklearn.tree import DecisionTreeRegressor


criterions = ["squared_error", "friedman_mse", "absolute_error", "poisson"]
for cr in criterions:
    model_dtr = DecisionTreeRegressor(max_depth=4, criterion=cr)
    model_dtr.fit(X_train, Y_train)
    # Fit the model with the train data
    print("DecisionTreeClassifier, criterion:", cr)
    print("Test::", model_dtr.score(X_test, Y_test))
    print("Train::", model_dtr.score(X_train, Y_train))

DecisionTreeClassifier, criterion: squared_error
Test:: 0.6569533046319993
Train:: 0.7259007389310271
DecisionTreeClassifier, criterion: friedman_mse
Test:: 0.6569533046319993
Train:: 0.7259007389310271
DecisionTreeClassifier, criterion: absolute_error
Test:: 0.6569350069162025
Train:: 0.7217723108935595
DecisionTreeClassifier, criterion: poisson
Test:: 0.6289348350788969
Train:: 0.7455443612511162


In [17]:
arr = [25,30,46,45,52,23,43,35,38,46,48,52,44,30]
np.std(arr)

9.321086474291743

In [18]:
# arr.to_list.mean()
sunny = [25,30,35,38,48]
np.std(sunny)

7.782030583337487

In [19]:
overcast = [46,43,52,44]
np.std(overcast)

3.491060010942235

In [91]:
rainy = [45,52,23,30,46]
np.std(rainy)

10.870142593360955

In [20]:
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n, p=2, weights="distance")
    model_knn.fit(X_train, Y_train)
    print("\n Outlier removed n_neighbors: usual ", n)
    print("Train:: ",  model_knn.score(X_train,Y_train))
    print("Test:: ", model_knn.score(X_test, Y_test))


 Outlier removed n_neighbors: usual  5
Train::  0.9946689474106676
Test::  0.7775874582234754

 Outlier removed n_neighbors: usual  6
Train::  0.9946689474106676
Test::  0.7807767125134069

 Outlier removed n_neighbors: usual  7
Train::  0.9946689474106676
Test::  0.7829576591024542

 Outlier removed n_neighbors: usual  8
Train::  0.9946689474106676
Test::  0.7821517218739441

 Outlier removed n_neighbors: usual  9
Train::  0.9946689474106676
Test::  0.781949339253146

 Outlier removed n_neighbors: usual  10
Train::  0.9946689474106676
Test::  0.774573970303706

 Outlier removed n_neighbors: usual  11
Train::  0.9946689474106676
Test::  0.7695712659258905

 Outlier removed n_neighbors: usual  12
Train::  0.9946689474106676
Test::  0.762745735401654

 Outlier removed n_neighbors: usual  13
Train::  0.9946689474106676
Test::  0.7546072870950873

 Outlier removed n_neighbors: usual  14
Train::  0.9946689474106676
Test::  0.7492885409547902

 Outlier removed n_neighbors: usual  15
Train::

In [21]:
df.describe()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.891893,54.18835,181.543252,6.159029,972.918932,773.439587,38.070388,35.81267
std,104.506364,86.266363,63.997004,21.225052,5.802457,77.753954,79.815303,35.782271,16.691447
min,102.0,0.0,0.0,124.25,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,357.375,200.1,232.65,25.5,1145.0,963.575,129.5,79.7725


In [22]:
df['cement'].max()

540.0

In [23]:
df.columns

Index(['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',
       'fineagg', 'age', 'strength'],
      dtype='object')

In [11]:
from sklearn.preprocessing import MinMaxScaler
min_max_scale = MinMaxScaler()


std_cleaned_df = cleaned_df

std_cleaned_df["cement"] = min_max_scale.fit_transform(std_cleaned_df[["cement"]])

std_cleaned_df["slag"] = min_max_scale.fit_transform(std_cleaned_df[["slag"]])

std_cleaned_df["water"] = min_max_scale.fit_transform(std_cleaned_df[["water"]])

std_cleaned_df["ash"] = min_max_scale.fit_transform(std_cleaned_df[["ash"]])

std_cleaned_df["superplastic"] = min_max_scale.fit_transform(std_cleaned_df[["superplastic"]])

std_cleaned_df["coarseagg"] = min_max_scale.fit_transform(std_cleaned_df[["coarseagg"]])
std_cleaned_df["fineagg"] = min_max_scale.fit_transform(std_cleaned_df[["fineagg"]])
std_cleaned_df["age"] = min_max_scale.fit_transform(std_cleaned_df[["age"]])


std_cleaned_Y = std_cleaned_df[["strength"]]
std_cleaned_X = std_cleaned_df.drop(["strength"], axis = 1)

In [25]:
X_std_cleaned_train, X_std_cleaned_test, Y_std_cleaned_train, Y_std_cleaned_test = train_test_split(std_cleaned_X, std_cleaned_Y, test_size = 0.30, random_state = 1)

In [26]:
# X_std_cleaned_train, X_std_cleaned_test, Y_std_cleaned_train, Y_std_cleaned_test = train_test_split(std_cleaned_X, std_cleaned_Y, test_size = 0.30, random_state = 1)
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n, p=2, weights="distance")
    model_knn.fit(X_std_cleaned_train, Y_std_cleaned_train)
    print("\n Std + Outlier removed n_neighbors: usual p = 2", n)
    print("Train:: ",  model_knn.score(X_std_cleaned_train,Y_std_cleaned_train))
    print("Test:: ", model_knn.score(X_std_cleaned_test, Y_std_cleaned_test))


 Std + Outlier removed n_neighbors: usual p = 2 5
Train::  0.9946689474106676
Test::  0.8229192174687181

 Std + Outlier removed n_neighbors: usual p = 2 6
Train::  0.9946689474106676
Test::  0.8196802570864168

 Std + Outlier removed n_neighbors: usual p = 2 7
Train::  0.9946689474106676
Test::  0.8171742993961997

 Std + Outlier removed n_neighbors: usual p = 2 8
Train::  0.9946689474106676
Test::  0.8118692665260401

 Std + Outlier removed n_neighbors: usual p = 2 9
Train::  0.9946689474106676
Test::  0.800229095059186

 Std + Outlier removed n_neighbors: usual p = 2 10
Train::  0.9946689474106676
Test::  0.7989277223630249

 Std + Outlier removed n_neighbors: usual p = 2 11
Train::  0.9946689474106676
Test::  0.7980639598138253

 Std + Outlier removed n_neighbors: usual p = 2 12
Train::  0.9946689474106676
Test::  0.7887860822275625

 Std + Outlier removed n_neighbors: usual p = 2 13
Train::  0.9946689474106676
Test::  0.7807092356542273

 Std + Outlier removed n_neighbors: usual 

In [12]:

std_df = df

std_df["cement"] = min_max_scale.fit_transform(std_df[["cement"]])

std_df["slag"] = min_max_scale.fit_transform(std_df[["slag"]])

std_df["water"] = min_max_scale.fit_transform(std_df[["water"]])

std_df["ash"] = min_max_scale.fit_transform(std_df[["ash"]])

std_df["superplastic"] = min_max_scale.fit_transform(std_df[["superplastic"]])

std_df["coarseagg"] = min_max_scale.fit_transform(std_df[["coarseagg"]])
std_df["fineagg"] = min_max_scale.fit_transform(std_df[["fineagg"]])
std_df["age"] = min_max_scale.fit_transform(std_df[["age"]])

std_Y = std_df[["strength"]]
std_X = std_df.drop(["strength"], axis = 1)

In [28]:
X_std_train, X_std_test, Y_std_train, Y_std_test = train_test_split(std_X, std_Y, test_size = 0.30, random_state = 1)
for n in [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]:
    model_knn = KNeighborsRegressor(n_neighbors=n, p=1)
    model_knn.fit(X_std_train, Y_std_train)
    print("\n Standarised + n_neighbors: usual p = 1", n)
    print("Train:: ",  model_knn.score(X_std_train,Y_std_train))
    print("Test:: ", model_knn.score(X_std_test, Y_std_test))


 Standarised + n_neighbors: usual p = 1 5
Train::  0.8325396649549555
Test::  0.7588758784232201

 Standarised + n_neighbors: usual p = 1 6
Train::  0.8202687071154866
Test::  0.7472197804340697

 Standarised + n_neighbors: usual p = 1 7
Train::  0.8074705138968455
Test::  0.7418474956890868

 Standarised + n_neighbors: usual p = 1 8
Train::  0.8025872720639196
Test::  0.7433254679320429

 Standarised + n_neighbors: usual p = 1 9
Train::  0.7981175595626173
Test::  0.7285744633683233

 Standarised + n_neighbors: usual p = 1 10
Train::  0.7892172834986473
Test::  0.7197742483073805

 Standarised + n_neighbors: usual p = 1 11
Train::  0.7828889256143912
Test::  0.7188062886671343

 Standarised + n_neighbors: usual p = 1 12
Train::  0.7733452384744721
Test::  0.7171657450599991

 Standarised + n_neighbors: usual p = 1 13
Train::  0.7634596887908844
Test::  0.710962923115712

 Standarised + n_neighbors: usual p = 1 14
Train::  0.7542203675298917
Test::  0.7037829582513133

 Standarised + 

In [29]:
cleaned_Y = cleaned_df[["strength"]]
cleaned_X = cleaned_df.drop(["strength"], axis = 1)
X_cleaned_train, X_cleaned_test, Y_cleaned_train, Y_cleaned_test = train_test_split(cleaned_X, cleaned_Y, test_size = 0.30, random_state = 1)

In [30]:
Y = df[["strength"]]
X = df.drop(["strength"], axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)

In [31]:
# Instantiate the model
model_lr = LinearRegression()
model_lr.fit(X_train, Y_train)
# Fit the model with the train data
print("Linear:: ")
print("Test::", model_lr.score(X_test, Y_test))
print("Train::", model_lr.score(X_train, Y_train))

Linear:: 
Test:: 0.7320709753755357
Train:: 0.7254766837389761


In [32]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Degree can be varied to play around and check when model is over fitting
# poly = PolynomialFeatures(degree = 1, interaction_only = True)
poly = PolynomialFeatures(degree = 2, interaction_only = True)
# poly = PolynomialFeatures(degree = 3, interaction_only = True)
# poly = PolynomialFeatures(degree = 4, interaction_only = True)

X_train2 = poly.fit_transform(X_train)
X_test2 = poly.fit_transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, Y_train)

y_pred = poly_clf.predict(X_test2)

print("Polynomial")
print("Train: ", poly_clf.score(X_train2, Y_train))
print("Test: ", poly_clf.score(X_test2, Y_test))

Polynomial
Train:  0.8016388489476829
Test:  0.7715736962399415


In [33]:
# e varied to play around and check when model is over fitting
# poly = PolynomialFeatures(degree = 1, interaction_only = True)
poly = PolynomialFeatures(degree = 2, interaction_only = True)
# poly = PolynomialFeatures(degree = 3, interaction_only = True)
# poly = PolynomialFeatures(degree = 4, interaction_only = True)

X_train2 = poly.fit_transform(X_cleaned_train)
X_test2 = poly.fit_transform(X_cleaned_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, Y_cleaned_train)

y_pred = poly_clf.predict(X_test2)

print("Polynomial + removed outliers")
print("Train: ", poly_clf.score(X_train2, Y_train))
print("Test: ", poly_clf.score(X_test2, Y_test))

Polynomial + removed outliers
Train:  0.8016388489476829
Test:  0.7715736962399415


In [34]:
model_lr = LinearRegression()
model_lr.fit(X_cleaned_train, Y_cleaned_train)
# Fit the model with the train data
print("Linear:: Cleaned data(removeoutlier) ")
print("Test::", model_lr.score(X_cleaned_test, Y_cleaned_test))
print("Train::", model_lr.score(X_cleaned_train, Y_cleaned_train))

Linear:: Cleaned data(removeoutlier) 
Test:: 0.7320709753755357
Train:: 0.7254766837389761


In [35]:
cleaned_df.dtypes

cement          float64
slag            float64
ash             float64
water           float64
superplastic    float64
coarseagg       float64
fineagg         float64
age             float64
strength        float64
dtype: object

In [36]:
from sklearn.preprocessing import MinMaxScaler
min_max_scale = MinMaxScaler()


std_cleaned_df = cleaned_df

std_cleaned_df["cement"] = min_max_scale.fit_transform(std_cleaned_df[["cement"]])

std_cleaned_df["slag"] = min_max_scale.fit_transform(std_cleaned_df[["slag"]])

std_cleaned_df["water"] = min_max_scale.fit_transform(std_cleaned_df[["water"]])

std_cleaned_df["ash"] = min_max_scale.fit_transform(std_cleaned_df[["ash"]])

std_cleaned_df["superplastic"] = min_max_scale.fit_transform(std_cleaned_df[["superplastic"]])

std_cleaned_df["coarseagg"] = min_max_scale.fit_transform(std_cleaned_df[["coarseagg"]])
std_cleaned_df["fineagg"] = min_max_scale.fit_transform(std_cleaned_df[["fineagg"]])
std_cleaned_df["age"] = min_max_scale.fit_transform(std_cleaned_df[["age"]])


Y = std_cleaned_df[["strength"]]
X = std_cleaned_df.drop(["strength"], axis = 1)

In [13]:
from sklearn.preprocessing import MinMaxScaler
min_max_scale = MinMaxScaler()


min_max_cleaned_df = cleaned_df

min_max_cleaned_df["cement"] = min_max_scale.fit_transform(min_max_cleaned_df[["cement"]])

min_max_cleaned_df["slag"] = min_max_scale.fit_transform(min_max_cleaned_df[["slag"]])

min_max_cleaned_df["water"] = min_max_scale.fit_transform(min_max_cleaned_df[["water"]])

min_max_cleaned_df["ash"] = min_max_scale.fit_transform(min_max_cleaned_df[["ash"]])

min_max_cleaned_df["superplastic"] = min_max_scale.fit_transform(min_max_cleaned_df[["superplastic"]])

min_max_cleaned_df["coarseagg"] = min_max_scale.fit_transform(min_max_cleaned_df[["coarseagg"]])
min_max_cleaned_df["fineagg"] = min_max_scale.fit_transform(min_max_cleaned_df[["fineagg"]])
min_max_cleaned_df["age"] = min_max_scale.fit_transform(min_max_cleaned_df[["age"]])


Y = min_max_cleaned_df[["strength"]]
X = min_max_cleaned_df.drop(["strength"], axis = 1)


In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)

# Instantiate the model
model_lr = LinearRegression()
model_lr.fit(X_train, Y_train)
# Fit the model with the train data
print("Linear + removed outliers + Normalization")
print("Test::", model_lr.score(X_test, Y_test))
print("Train::", model_lr.score(X_train, Y_train))

Linear + removed outliers + Normalization
Test:: 0.7320709753755357
Train:: 0.7254766837389761


In [39]:
poly = PolynomialFeatures(degree = 2, interaction_only = True)
# poly = PolynomialFeatures(degree = 3, interaction_only = True)
# poly = PolynomialFeatures(degree = 4, interaction_only = True)

X_train2 = poly.fit_transform(X_train)
X_test2 = poly.fit_transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, Y_train)

y_pred = poly_clf.predict(X_test2)

print("Polynomial + removed outliers + Normalization")
print("Train: ", poly_clf.score(X_train2, Y_train))
print("Test: ", poly_clf.score(X_test2, Y_test))

Polynomial + removed outliers + Normalization
Train:  0.8016388489476829
Test:  0.7715736962399415


In [40]:
from sklearn.svm import SVR


# kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable,          default='rbf'
#      Specifies the kernel type to be used in the algorithm.
#      If none is given, 'rbf' will be used. If a callable is given it is
#      used to precompute the kernel matrix.

# degree : int, default=3
#     Degree of the polynomial kernel function ('poly').
#     Must be non-negative. Ignored by all other kernels.

# gamma : {'scale', 'auto'} or float, default='scale'
#     Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

#     - if ``gamma='scale'`` (default) is passed then it uses
#       1 / (n_features * X.var()) as value of gamma,
#     - if 'auto', uses 1 / n_features
#     - if float, must be non-negative.

#     .. versionchanged:: 0.22
#        The default value of ``gamma`` changed from 'auto' to 'scale'.

# coef0 : float, default=0.0
#     Independent term in kernel function.
#     It is only significant in 'poly' and 'sigmoid'.

# tol : float, default=1e-3
#     Tolerance for stopping criterion.

# C : float, default=1.0
#     Regularization parameter. The strength of the regularization is
#     inversely proportional to C. Must be strictly positive.
#     The penalty is a squared l2 penalty.

# epsilon : float, default=0.1
#      Epsilon in the epsilon-SVR model. It specifies the epsilon-tube
#      within which no penalty is associated in the training loss function
#      with points predicted within a distance epsilon from the actual
#      value. Must be non-negative.

# shrinking : bool, default=True
#     Whether to use the shrinking heuristic.
#     See the :ref:`User Guide <shrinking_svm>`.

# cache_size : float, default=200
#     Specify the size of the kernel cache (in MB).

# verbose : bool, default=False
#     Enable verbose output. Note that this setting takes advantage of a
#     per-process runtime setting in libsvm that, if enabled, may not work
#     properly in a multithreaded context.

# max_iter : int, default=-1
#     Hard limit on iterations within solver, or -1 for no limit.

from sklearn.svm import SVR

Y = min_max_cleaned_df[["strength"]]
X = min_max_cleaned_df.drop(["strength"], axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)


kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} 
model_svr = SVR()


for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model_svr = SVR(kernel=kernel)
    model_svr.fit(X_std_train, Y_std_train)
    print("\n Kernel: ", kernel)
    print("train: ",model_svr.score(X_std_train, Y_std_train))
    print("test: ", model_svr.score(X_std_test, Y_std_test))
    
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model_svr = SVR(kernel=kernel, gamma='auto')
    model_svr.fit(X_std_train, Y_std_train)
    print("\ngamma='auto')  Kernel: ", kernel)
    print("train: ",model_svr.score(X_std_train, Y_std_train))
    print("test: ", model_svr.score(X_std_test, Y_std_test))


 Kernel:  linear
train:  0.6698538127524506
test:  0.6855004765312129

 Kernel:  poly
train:  0.8529158603993915
test:  0.8243709276438238

 Kernel:  rbf
train:  0.7306301210396156
test:  0.7234106583659335

 Kernel:  sigmoid
train:  0.28664398738219343
test:  0.2675985638036016

gamma='auto')  Kernel:  linear
train:  0.6698538127524506
test:  0.6855004765312129

gamma='auto')  Kernel:  poly
train:  0.02530403129331882
test:  0.022501165661437006

gamma='auto')  Kernel:  rbf
train:  0.41966849737246925
test:  0.41980028215718235

gamma='auto')  Kernel:  sigmoid
train:  0.28595292730553934
test:  0.283931581703427


In [41]:
Y = std_cleaned_df[["strength"]]
X = std_cleaned_df.drop(["strength"], axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)


kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} 
model_svr = SVR()


for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model_svr = SVR(kernel=kernel)
    model_svr.fit(X_std_train, Y_std_train)
    print("\n Kernel: ", kernel)
    print("train: ",model_svr.score(X_std_train, Y_std_train))
    print("test: ", model_svr.score(X_std_test, Y_std_test))
    
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model_svr = SVR(kernel=kernel, gamma='auto')
    model_svr.fit(X_std_train, Y_std_train)
    print("\ngamma='auto')  Kernel: ", kernel)
    print("train: ",model_svr.score(X_std_train, Y_std_train))
    print("test: ", model_svr.score(X_std_test, Y_std_test))


 Kernel:  linear
train:  0.6698538127524506
test:  0.6855004765312129

 Kernel:  poly
train:  0.8529158603993915
test:  0.8243709276438238

 Kernel:  rbf
train:  0.7306301210396156
test:  0.7234106583659335

 Kernel:  sigmoid
train:  0.28664398738219343
test:  0.2675985638036016

gamma='auto')  Kernel:  linear
train:  0.6698538127524506
test:  0.6855004765312129

gamma='auto')  Kernel:  poly
train:  0.02530403129331882
test:  0.022501165661437006

gamma='auto')  Kernel:  rbf
train:  0.41966849737246925
test:  0.41980028215718235

gamma='auto')  Kernel:  sigmoid
train:  0.28595292730553934
test:  0.283931581703427


In [None]:
# std_cleaned_df, min_max_cleaned_df


# cleaned_df, df

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Y=cleaned_df["strength"]
X=cleaned_df.drop(columns=["strength"], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.30)

model=Sequential()
model.add(Dense(units=6,activation="relu",input_dim=8,kernel_initializer='uniform'))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=1))

model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model.fit(X_train, Y_train, batch_size=10, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13ffc0d10>

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Y=std_df["strength"]
X=std_df.drop(columns=["strength"], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.30)

model=Sequential()
model.add(Dense(units=6,activation="relu",input_dim=8,kernel_initializer='uniform'))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=1))

model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model.fit(X_train, Y_train, batch_size=10, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1419ca050>

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# std_cleaned_df, min_max_cleaned_df
Y=std_cleaned_df["strength"]
X=std_cleaned_df.drop(columns=["strength"], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=1)


model=Sequential()
model.add(Dense(units=6,kernel_initializer='uniform',activation="relu",input_dim=8))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=1))

model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model.fit(X_train, Y_train, batch_size=100, epochs=30, validation_split=0.20)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x148a07050>

In [74]:
from sklearn.metrics import r2_score
Y_pred = model.predict(X_train)
r2_score(Y_train, Y_pred)



0.09768937191782123

In [75]:
from sklearn.metrics import r2_score
Y_pred = model.predict(X_test)
r2_score(Y_test, Y_pred)



0.12099636988812912

In [67]:
Y=min_max_cleaned_df["strength"]
X=min_max_cleaned_df.drop(columns=["strength"], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.30)

model=Sequential()
model.add(Dense(units=6,activation="relu",input_dim=8,kernel_initializer='uniform'))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=20,kernel_initializer='uniform',activation="relu"))
model.add(Dense(units=1))

model.compile(optimizer="adam", loss="mean_squared_error")

model.fit(X_train, Y_train, batch_size=15, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x147e23390>

In [68]:
from sklearn.metrics import r2_score
Y_pred = model.predict(X_test)
r2_score(Y_test, Y_pred)



0.47719593570569063

In [69]:
from sklearn.metrics import r2_score
Y_pred = model.predict(X_train)
r2_score(Y_train, Y_pred)



0.48768416135090675