In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv("concrete.csv")

In [3]:
df.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


In [4]:
df.isnull().sum()

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

In [5]:
df.dtypes

cement          float64
slag            float64
ash             float64
water           float64
superplastic    float64
coarseagg       float64
fineagg         float64
age               int64
strength        float64
dtype: object

In [6]:
x=df.drop(["strength"],axis=1)
y=df["strength"]

In [35]:
m=RandomForestRegressor()
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,test_size=0.3,random_state=1)
m.fit(x_train,y_train)
m.score(x_train,y_train)


0.9826888181721526

In [36]:
m.score(x_test,y_test)

0.9049872683926213

In [37]:
m1=DecisionTreeRegressor(min_samples_split=3)
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,test_size=0.3,random_state=1)
m1.fit(x_train,y_train)
m1.score(x_train,y_train)


0.9916327644250145

In [38]:
m1.score(x_test,y_test)

0.8572360461460773

In [11]:
from sklearn.model_selection import GridSearchCV


In [12]:
param_grid={
    'bootstrap':[True],
    'max_depth':[5,6],
    'max_features':[2,3],
    'min_samples_leaf':[3,4],
    'min_samples_split':[5,10],
    'n_estimators':[5,6,7]
}

In [13]:
rf=RandomForestRegressor(random_state=1)

In [14]:
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,return_train_score=True)

In [15]:
grid_search.fit(x_train,y_train)

In [16]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 6,
 'max_features': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 7}

In [17]:
best_grid=grid_search.best_estimator_
best_grid.score(x_test,y_test)

0.8095950168104953

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10 , stop = 15, num = 2)]   # returns evenly spaced 10 numbers
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 2)]  # returns evenly spaced numbers can be changed to any
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


In [19]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)


{'n_estimators': [10, 15], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [20]:
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 5, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(x_train, y_train);


Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [21]:
rf_random.best_params_

{'n_estimators': 15,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [22]:
best_random=rf_random.best_estimator_
best_random.score(x_test,y_test)


0.8874523985744011

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from  sklearn.pipeline import Pipeline

In [24]:
pipe_lr=Pipeline([('scl',StandardScaler()),('pca',PCA(n_components=2)),('rf',RandomForestRegressor(random_state=1)),])
pipe_lr.fit(x_train,y_train)
print('test accuracy:' ,pipe_lr.score(x_test,y_test))

test accuracy: 0.2346754664642483


or

In [25]:
pipe_lr=Pipeline([('scl',StandardScaler()),('rf',RandomForestRegressor(random_state=1)),])
pipe_lr.fit(x_train,y_train)
print('test accuracy:' ,pipe_lr.score(x_test,y_test))

test accuracy: 0.9074778898107488


In [26]:
pipe_lr.score(x_train,y_train)

0.9828342564295965

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

pipe_svc=Pipeline([('scl',StandardScaler()),('svc',SVR())])
param_grid={'svc__C':[0.001,0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100],
           }

In [28]:
grid=GridSearchCV(pipe_svc,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)

In [29]:
print(" Best cross-validation accuracy: {:.2f}". format( grid.best_score_)) 
print(" Best parameters: ", grid.best_params_) 
print(" Test set accuracy: {:.2f}". format( grid.score( x_test, y_test)))


 Best cross-validation accuracy: 0.85
 Best parameters:  {'svc__C': 100, 'svc__gamma': 0.1}
 Test set accuracy: 0.86


In [30]:
grid.score(x_train,y_train)

0.9180840124572544

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

pipe_sv=Pipeline([('scl',StandardScaler()),('sv',DecisionTreeRegressor())])
param_grid={'sv__max_depth':[1,2,3,4,5,6,7,8,10,11],
           'sv__max_features':[1,2,3,4,5,6,7,8],
           }

In [32]:
grid=GridSearchCV(pipe_sv,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)

In [33]:
print(" Best cross-validation accuracy: {:.2f}". format( grid.best_score_)) 
print(" Best parameters: ", grid.best_params_) 
print(" Test set accuracy: {:.2f}". format( grid.score( x_test, y_test)))


 Best cross-validation accuracy: 0.81
 Best parameters:  {'sv__max_depth': 8, 'sv__max_features': 7}
 Test set accuracy: 0.80


In [34]:
grid.score(x_train,y_train)

0.9531346992781919