## Predictive Modeling on Processed Data

Please refer the project summary document for better understanding of the code below

In [1]:
# Importing needed libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Processed_CementData.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 53 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Material Quantity (gm)                             6040 non-null   float64
 1   Additive Catalyst (gm)                             6040 non-null   float64
 2   Ash Component (gm)                                 6040 non-null   float64
 3   Water Mix (ml)                                     6040 non-null   float64
 4   Plasticizer (gm)                                   6040 non-null   float64
 5   Moderate Aggregator                                6040 non-null   float64
 6   Refined Aggregator                                 6040 non-null   float64
 7   Formulation Duration (hrs)                         6040 non-null   float64
 8   Compression Strength MPa                           6040 non-null   float64
 9   Material

In [3]:
X = df.drop('Compression Strength MPa', axis=1)
y = df[['Compression Strength MPa']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [4]:
# In the EDA phase we didn't find any pattern which means may be tha data is overfitting or has skewness
#because of adding 10 rows by data augmentation technique. Need to identify it first. 

model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [5]:
# Evaluate the model on the test set

test_score = model.score(X_test,y_test)
print(f"Test R-squared score : {test_score}")

Test R-squared score : 0.280222173125403


In [6]:
# Using cross validation to get a more robust estimate of model performance

cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-Validation R-squared scores : {cv_scores} \nAverage cross-validation R-squared score : {cv_scores.mean()}")

Cross-Validation R-squared scores : [0.27453938 0.24019837 0.19213068 0.27077684 0.27492728] 
Average cross-validation R-squared score : 0.25051451086395515


In [7]:
# As the R2 Score is very low i.e. 25.05% avg, we need to apply regularization 

Lasso_model = Lasso(alpha=0.3)
Lasso_model.fit(X_train,y_train)

Lasso(alpha=0.3)

In [8]:
Lasso_R2 = Lasso_model.score(X_test,y_test)
print(f"Lasso R2 Score : {Lasso_R2}")

Lasso R2 Score : 0.2781084194046476


In [9]:
cv_scores = cross_val_score(Lasso_model, X_train, y_train, cv=5)
print(f"Lasso CV Scores : {cv_scores}\nAverage CV Score : {cv_scores.mean()}")

Lasso CV Scores : [0.24181576 0.21336275 0.25387504 0.24821232 0.25168171]
Average CV Score : 0.24178951791137662


In [10]:
# Applying Ridge regularization

Ridge_model = Ridge(alpha=0.4)
Ridge_model.fit(X_train, y_train)

Ridge(alpha=0.4)

In [11]:
Ridge_R2 = Ridge_model.score(X_test, y_test)
print(f"Ridge R2 Score : {Ridge_R2}")

Ridge R2 Score : 0.2801753627256778


In [12]:
cv_scores = cross_val_score(Ridge_model, X_train, y_train, cv=5)
print(f"Ridge CV Score : {cv_scores}\nAverage CV score : {cv_scores.mean()}")

Ridge CV Score : [0.24458255 0.21144036 0.25564342 0.24747709 0.25255024]
Average CV score : 0.24233873272556875


In [13]:
#Still the cv score is very low then we can apply Feature scaling technique on the data first and then start modeling.
#Normalizing data

scaler = MinMaxScaler()
IV_normalized = scaler.fit_transform(X)
IV_normalized

array([[0.67740849, 0.41188679, 0.08708832, ..., 0.92341729, 0.89150948,
        0.82014303],
       [0.01661801, 0.5932903 , 0.76028183, ..., 0.73050234, 0.04021267,
        0.00394133],
       [0.81504978, 0.00647707, 0.45780764, ..., 0.53331019, 0.53493921,
        0.38958829],
       ...,
       [0.21265364, 0.4376084 , 0.27677314, ..., 0.56087563, 0.28963643,
        0.1222591 ],
       [0.44687572, 0.62738082, 0.88699361, ..., 0.59619893, 0.60557985,
        0.47296076],
       [0.27325606, 0.80178338, 0.00953846, ..., 0.38394733, 0.51792918,
        0.40748354]])

In [14]:
y.shape

(6040, 1)

In [15]:
dv_normalized = scaler.fit_transform(y)
dv_normalized

array([[0.8596218 ],
       [0.63615128],
       [0.83704116],
       ...,
       [0.62507122],
       [0.40795873],
       [0.56518272]])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(IV_normalized, dv_normalized, test_size = 0.2, random_state = 48)

In [17]:
# Creating and using SVR model

svr_model = SVR(kernel="linear")
svr_model.fit(X_train, y_train)

SVR(kernel='linear')

In [18]:
y_pred_lsvm = svr_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_lsvm)
r2 = r2_score(y_test, y_pred_lsvm)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.025228843287229914
R2 Score : 0.2833690978278728


In [19]:
svr_model = SVR(kernel="rbf", degree=2, C=0.1)
svr_model.fit(X_train, y_train)

SVR(C=0.1, degree=2)

In [20]:
y_pred_rsvm = svr_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_rsvm)
r2 = r2_score(y_test, y_pred_rsvm)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.023377870516411988
R2 Score : 0.33594639087082057


In [21]:
abs_error = np.abs(y_test - y_pred_rsvm)
print(abs_error.mean())

0.16999609317968745


In [22]:
svr_model = SVR(kernel="poly", degree=4, C=0.1)
svr_model.fit(X_train, y_train)

SVR(C=0.1, degree=4, kernel='poly')

In [23]:
y_pred_psvm = svr_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_psvm)
r2 = r2_score(y_test, y_pred_psvm)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.027737271821694613
R2 Score : 0.21211662766021977


In [24]:
# Applying different Boosting regression Model.

# Gradient Boosting Regression

gb_model = GradientBoostingRegressor()

gb_model.fit(X_train, y_train)

GradientBoostingRegressor()

In [25]:
y_pred_gb = gb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_gb)
r2 = r2_score(y_test, y_pred_gb)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.021055022062925314
R2 Score : 0.40192741758217476


In [26]:
# Hyperparameter tuning in the gradient boosting regression model as we have recieved highest R2 score up till now i.e. 0.40.

param_grid = {
    'n_estimators' : [50, 100, 200],
    'learning_rate' : [0.01, 0.1, 0.5],
    'max_depth' : [3, 4, 5],
    'min_samples_split' : [2, 5, 10]
}

# Perform grid search with cross validation
GS = GridSearchCV(estimator = gb_model, param_grid = param_grid, cv = 5)
GS.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.01, 0.1, 0.5],
                         'max_depth': [3, 4, 5],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]})

In [27]:
# Getting best hyperparameters and model
best_params = GS.best_params_
best_model = GS.best_estimator_

print(f"Best Hperparameters : {best_params}\nBest Model : {best_model}")

Best Hperparameters : {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}
Best Model : GradientBoostingRegressor(max_depth=5, min_samples_split=10, n_estimators=50)


In [28]:
y_pred_gb2 = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_gb2)
r2 = r2_score(y_test, y_pred_gb2)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.02055949672668214
R2 Score : 0.41600292491789304


In [29]:
# Applying Bagging technique using Random Forest Model.

from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

RandomForestRegressor()

In [30]:
y_pred_rf = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.021200001515490702
R2 Score : 0.39780924400182094


In [32]:
# Applying Hyperparameter tuning on random Forest model

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],        # Number of trees in the forest
    'max_features': ['auto', 'sqrt'],      # Number of features to consider when looking for the best split
    'max_depth': [None, 10, 20],           # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]          # Minimum number of samples required to be at a leaf node
}

# Perform the grid search with cross validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 10, 20],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='r2')

In [33]:
# Get the best hyperparameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Hperparameters : {best_params}\nBest Model : {best_model}")

Best Hperparameters : {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Model : RandomForestRegressor(max_depth=10, max_features='sqrt', n_estimators=200)


In [34]:
y_pred_rf2 = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_rf2)
r2 = r2_score(y_test, y_pred_rf2)
print(f"Mean Squared Error (MSE) : {mse}\nR2 Score : {r2}")

Mean Squared Error (MSE) : 0.020158684593549348
R2 Score : 0.4273880826636657
