<a href="https://colab.research.google.com/github/Umerfarooq122/Using-Predictive-analytics-to-predict-PH-of-beverages/blob/main/Using_Predictive_analytics_to_predict_PH_of_beverages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing the required libraries:**

In [None]:
! pip install tensorflow keras torch torchvision
! pip install scikit-learn xgboost lightgbm pyearth

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score , mean_squared_error, mean_absolute_error, make_scorer, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# **Loading the data set:**

In [3]:
X_train = pd.read_csv("https://raw.githubusercontent.com/NickAMC/NickAMC.github.io/main/DATA_624_S24/project_2/X_train.csv")
X_test = pd.read_csv("https://raw.githubusercontent.com/NickAMC/NickAMC.github.io/main/DATA_624_S24/project_2/X_test.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/NickAMC/NickAMC.github.io/main/DATA_624_S24/project_2/y_train.csv")
y_test = pd.read_csv("https://raw.githubusercontent.com/NickAMC/NickAMC.github.io/main/DATA_624_S24/project_2/y_test.csv")

X_train = X_train.values
y_train = y_train['PH'].values
y_test = y_test['PH'].values
X_test = X_test.values

## **Setting up the dataframe for model metrics:**

In [4]:
metrics_df = pd.DataFrame(columns=['RMSE', 'MAE', 'R_squared'])

# **Modeling:**

## **Linear Regression:**

In [5]:
param_gridl = {
    'fit_intercept': [True, False],
    'positive': [True, False],
    'copy_X': [True, False]
}
linear_model = LinearRegression()
grid_search = GridSearchCV(estimator=linear_model, param_grid=param_gridl, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [6]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

train_mae = -np.mean(grid_search.cv_results_['mean_test_score'])
train_rmse = np.sqrt(-np.mean(grid_search.cv_results_['mean_test_score']))

print("Training RMSE:", train_rmse)
print("Training MAE:", train_mae)

Training RMSE: 0.14221387536832125
Training MAE: 0.02022478634727641


In [7]:
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'copy_X': True, 'fit_intercept': False, 'positive': False}


In [8]:
best_model.fit(X_train, y_train)
y_predl = best_model.predict(X_test)
l_rmse = np.sqrt(mean_squared_error(y_test, y_predl))
l_mae = mean_absolute_error(y_test, y_predl)
lr_squared = r2_score(y_test, y_predl)
metrics_df.loc['Linear Regression'] = [l_rmse,l_mae,lr_squared]



## **Partial Least Sqaures:**


In [9]:
param_gridp = {
    'n_components': list(range(1, 26))
}
pls_model = PLSRegression()
grid_searchp = GridSearchCV(estimator=pls_model, param_grid=param_gridp, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_searchp.fit(X_train, y_train)
best_modelp = grid_searchp.best_estimator_
best_params = grid_searchp.best_params_
print("Best Hyperparameters:", best_params)
best_modelp.fit(X_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Hyperparameters: {'n_components': 7}


In [10]:
y_predp = best_modelp.predict(X_test)
p_rmse = np.sqrt(mean_squared_error(y_test, y_predp))
p_mae = mean_absolute_error(y_test, y_predp)
pr_squared = r2_score(y_test, y_predp)


metrics_df.loc['Partial least Squares'] = [p_rmse,p_mae,pr_squared]




## **Support Vector Machines (SVM):**

In [11]:
param_grids = {
    'kernel': ['linear', 'rbf'],  # Kernel type
    'C': [0.1, 1, 10],             # Regularization parameter
    'epsilon': [0.1, 0.2, 0.5]     # Epsilon parameter for epsilon-SVR
}

# Initialize SVM model for regression
svm_regressor = SVR()

# Define custom scorer for GridSearchCV (negative mean squared error)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Perform grid search with cross-validation
grid_searchs = GridSearchCV(estimator=svm_regressor, param_grid=param_grids, cv=5, scoring=scorer, verbose=1)
grid_searchs.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_modelss = grid_searchs.best_estimator_
best_paramss = grid_searchs.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'n_components': 7}


In [12]:
best_modelss.fit(X_train, y_train)
y_preds = best_modelss.predict(X_test)
s_rmse = np.sqrt(mean_squared_error(y_test, y_preds))
s_mae = mean_absolute_error(y_test, y_preds)
sr_squared = r2_score(y_test, y_preds)

In [13]:
metrics_df.loc['Support Vector Machines'] = [s_rmse,s_mae,sr_squared]


In [14]:
print(p_rmse)
print(s_rmse)
print(l_rmse)

0.13683418110117496
0.12386534300134637
0.13685602100680846


## **Random Forest:**

In [15]:
param_gridr = {
    'n_estimators': [50, 100, 200],      # Number of trees in the forest
    'max_depth': [None, 10, 20],         # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],     # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]        # Minimum number of samples required to be at a leaf node
}

In [16]:
rf_regressor = RandomForestRegressor(random_state=42)
scorer = make_scorer(mean_squared_error, greater_is_better=False)


grid_searchr = GridSearchCV(estimator=rf_regressor, param_grid=param_gridr, cv=5, scoring=scorer, verbose=1)
grid_searchr.fit(X_train, y_train)


best_modelr = grid_searchr.best_estimator_
best_paramsr = grid_searchr.best_params_

print("Best Hyperparameters:", best_params)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Hyperparameters: {'n_components': 7}


In [17]:
best_modelr.fit(X_train, y_train)
y_predr = best_modelr.predict(X_test)
r_rmse = np.sqrt(mean_squared_error(y_test, y_predr))
r_mae = mean_absolute_error(y_test, y_predr)
rr_squared = r2_score(y_test, y_predr)

In [18]:
metrics_df.loc['Random Forest'] = [r_rmse,r_mae,rr_squared]

## **XGBoost:**

In [20]:
param_gridx = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

In [21]:
xgb_regressor = XGBRegressor()
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_searchx = GridSearchCV(estimator=xgb_regressor, param_grid=param_gridx, cv=5, scoring=scorer, verbose=1)
grid_searchx.fit(X_train, y_train)
best_modelx = grid_searchx.best_estimator_
best_paramsx = grid_searchx.best_params_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [22]:
best_modelx.fit(X_train, y_train)
y_predx = best_modelx.predict(X_test)
x_rmse = np.sqrt(mean_squared_error(y_test, y_predx))
x_mae = mean_absolute_error(y_test, y_predx)
xr_squared = r2_score(y_test, y_predx)

In [23]:
metrics_df.loc['XGBoost'] = [x_rmse,x_mae,xr_squared]

## **Neural Network:**

In [25]:
param_gridn = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Size of hidden layers
    'activation': ['relu', 'tanh'],                   # Activation function
    'solver': ['adam', 'sgd'],                        # Solver for weight optimization
    'learning_rate': ['constant', 'adaptive']         # Learning rate schedule
}

In [26]:

mlp_regressor = MLPRegressor(max_iter=1000, tol=1e-4, solver='adam')
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Perform grid search with cross-validation
grid_searchn = GridSearchCV(estimator=mlp_regressor, param_grid=param_gridn, cv=5, scoring=scorer, verbose=1)
grid_searchn.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_modeln = grid_searchn.best_estimator_
best_paramsn = grid_searchn.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [28]:
print("Best Hyperparameters:", best_paramsn)

# Make predictions on the test set
y_predn = best_modeln.predict(X_test)

# Calculate RMSE on the test set
n_rmse = np.sqrt(mean_squared_error(y_test, y_predn))
n_mae = mean_absolute_error(y_test, y_predn)
nr_squared = r2_score(y_test, y_predn)

Best Hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}


In [29]:
metrics_df.loc['Neural Network'] = [n_rmse,n_mae,nr_squared]

# **Evaluation:**

In [30]:
metrics_df

Unnamed: 0,RMSE,MAE,R_squared
Linear Regression,0.136856,0.105798,0.376012
Partial least Squares,0.136834,0.105861,0.376211
Support Vector Machines,0.123865,0.09663,0.488851
Random Forest,0.104344,0.075822,0.637268
XGBoost,0.103316,0.078006,0.644382
Neural Network,0.176813,0.135353,-0.041539
