In [69]:
# Import Panda libaries
import pandas as pd
import numpy as np
from src.exception import CustomException
from src.logger import logging
import warnings
warnings.filterwarnings("ignore")

Model Training

In [70]:

#loading the dateset
df = pd.read_csv('./data/cement_EDA_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,76.885537
1,1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.887366
2,2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,129.5,40.269535
3,3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,129.5,41.05278
4,4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,129.5,44.296075


In [71]:
X=df.drop(columns="Concrete compressive strength(MPa, megapascals) ", axis= 1)
Y=df["Concrete compressive strength(MPa, megapascals) "]

In [72]:
## Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=42)

In [73]:
## Train multiple models
## Model Evaluation
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [80]:

# Define the preprocessing pipeline
preprocessor = make_pipeline(KNNImputer(n_neighbors=3), StandardScaler())

In [81]:
# Define the models with their respective hyperparameters to tune
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1),
    'Lasso Regression': Lasso(alpha=1),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100,min_samples_split=2), # Reference form EDA file
    'Gradient Boosting Regression': GradientBoostingRegressor(max_depth=5,learning_rate= 0.1, n_estimators= 200)
}

In [76]:
#Evaluating the  Model creating function
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [93]:
# Fit and evaluate each model

for model_name, model in models.items():
    # Create the pipeline
    pipeline = make_pipeline(preprocessor, model)
    
    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)
    
    #Make Predictions
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the pipeline on the test data
    mae, rmse, r2_square=evaluate_model(y_test,y_pred)
    
    # Print the mean squared error
    print('Model Training Performance :',model_name)
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)
   
    print('='*50)
    print('\n')

Model Training Performance : Linear Regression
RMSE: 9.40892479455184
MAE: 7.2045670248177585
R2 score 68.61612415750815


Model Training Performance : Ridge Regression
RMSE: 9.40382647339022
MAE: 7.20118325988456
R2 score 68.6501262845627


Model Training Performance : Lasso Regression
RMSE: 10.00385955058522
MAE: 7.9228625071081
R2 score 64.52178557718757


Model Training Performance : Random Forest Regression
RMSE: 5.278037412525408
MAE: 3.720820513152517
R2 score 90.12421762413828


Model Training Performance : Gradient Boosting Regression
RMSE: 4.477803466572008
MAE: 3.0651224338345755
R2 score 92.8918490991003


