# SHOE BRANDS CAPSTONE PROJECT

By Subashree Rajkumar

# Phase 4

# MACHINE LEARNING MODELS FOR PREDICTION ANALYSIS

#Importing the Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR, SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load the dataset

In [2]:
data = pd.read_csv(f"table_all.csv", index_col=None)

# Catagorize the Datas

In [3]:
numerical_features = ['number_of_colors', 'price', 'number_of_sizes', 'reviews', 'comfort', 'quantification']
categorical_features = ['category', 'color_1', 'color_2', 'color_3', 'color_4', 'color_5', 'size', 'style_or_product_code']


# Separate features and labels for price and star rating

In [4]:
X = data[numerical_features + categorical_features]
y_price = data['price']
y_star_rating = data['stars']

# Split the data into train and test sets

In [5]:
X_train, X_test, y_price_train, y_price_test = train_test_split(X, y_price, test_size=0.2, random_state=42)
X_train, X_test, y_star_rating_train, y_star_rating_test = train_test_split(X, y_star_rating, test_size=0.2, random_state=42)


# Preprocessing pipeline

In [6]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 1. Prediction Model for PRICE

# Applying Various Machine learning Models

In [7]:
price_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor()
}

In [8]:
best_price_model = None
best_price_rmse = float('inf')


for name, model in price_models.items():
    price_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', model)])
    price_model.fit(X_train, y_price_train)
    predictions = price_model.predict(X_test)
    rmse = mean_squared_error(y_price_test, predictions, squared=False)
    
    if rmse < best_price_rmse:
        best_price_rmse = rmse
        best_price_model = price_model
        best_price_model_name = name
    
    print(f"{name} has the RMSE of {rmse}")

Linear Regression has the RMSE of 45.83657523273772
Ridge Regression has the RMSE of 72.67207973134293
Random Forest Regressor has the RMSE of 159.6038770969396
Gradient Boosting Regressor has the RMSE of 74.08400970446675


# Best Price Prediction Model and Perform hyperparameter tuning

In [9]:
print("Best Price Prediction Model:", best_price_model_name)
print("Best Price Prediction RMSE:", best_price_rmse)

Best Price Prediction Model: Linear Regression
Best Price Prediction RMSE: 45.83657523273772


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt

# Define the models and their parameters
price_models = {
    'Linear Regression': [LinearRegression(), {'fit_intercept': [True, False]}],
    'Ridge Regression': [Ridge(), {'alpha': [0.1, 0.5, 1, 5, 10]}],
    'Random Forest Regressor': [RandomForestRegressor(), {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}],
    'Gradient Boosting Regressor': [GradientBoostingRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 5, 8]}]
}

# Perform hyperparameter tuning and calculate RMSE for each model
for model_name, [model, params] in price_models.items():
    # Create a pipeline that combines the preprocessor with the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    
    # Create a new param grid for the pipeline
    param_grid = {f'model__{key}': value for key, value in params.items()}
    
    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid.fit(X_train, y_price_train)
    
    # Use the best model to predict on the test set
    best_preds = grid.predict(X_test)
    
    # Calculate RMSE for the best model
    best_rmse = sqrt(mean_squared_error(y_price_test, best_preds))
    
    # Print the best parameters and RMSE for each model
    print(f"Best parameters for {model_name}: ", grid.best_params_)
    print(f"Best {model_name} model has the RMSE of {best_rmse}")


Best parameters for Linear Regression:  {'model__fit_intercept': True}
Best Linear Regression model has the RMSE of 45.83657523273772
Best parameters for Ridge Regression:  {'model__alpha': 0.1}
Best Ridge Regression model has the RMSE of 48.77537829334586
Best parameters for Random Forest Regressor:  {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best Random Forest Regressor model has the RMSE of 159.06485468787565
Best parameters for Gradient Boosting Regressor:  {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
Best Gradient Boosting Regressor model has the RMSE of 47.65333844470699


# Save the predicted model

In [11]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])
param_grid = {f'model__fit_intercept': True}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
joblib.dump(grid, 'best_price_model.pkl')

['best_price_model.pkl']

INfERANCE
=======
These RMSE values represent the average deviation of predicted values from the actual values. Lower RMSE values indicate better performance.

For the price prediction, an RMSE of 45.84 suggests that, on average, the predicted prices differ from the actual prices by approximately $45.84.

The best performing model is the Linear regression model with a RMSE  of approximately 46%. The scores for both the training and testing data were similar, reducing concerns of the model being overfit.

# 2. Prediction Model for  Star Rating 

# Training the Dataset

In [12]:
df = pd.read_csv("table_all.csv", index_col=None)
import numpy as np
for column in df.columns:
    if df[column].dtype == object:
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        df[column].fillna(df[column].mean(), inplace=True)

le = LabelEncoder()
for column in df.columns:
    if df[column].dtype == object:
        df[column] = le.fit_transform(df[column])

scaler = StandardScaler()
for column in df.columns:
    if df[column].dtype == np.int64 or df[column].dtype == np.float64:
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1))

X = df.drop('stars', axis=1)
y = df['stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Applying Machine Learning Models

In [13]:
# Train models
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Gradient Boosting Tree
gbt = GradientBoostingRegressor(random_state=42)
gbt.fit(X_train, y_train)

# Predict on test set
lr_preds = lr.predict(X_test)
rf_preds = rf.predict(X_test)
gbt_preds = gbt.predict(X_test)

lr_rmse = mean_squared_error(y_test, lr_preds, squared=False)
rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)
gbt_rmse = mean_squared_error(y_test, gbt_preds, squared=False)

print(f"Linear Regression has the RMSE of {lr_rmse}")
print(f"Random Forest has the RMSE of {rf_rmse}")
print(f"Gradient Boosting Tree has the RMSE of {gbt_rmse}")


Linear Regression has the RMSE of 0.6366255102881898
Random Forest has the RMSE of 0.19705565455960955
Gradient Boosting Tree has the RMSE of 0.1820685375959569


# Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their values
lr_params = {'fit_intercept': [True, False]}
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
gbt_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 5, 8]}

# Perform hyperparameter tuning on Linear Regression
lr_grid = GridSearchCV(lr, lr_params, cv=5, scoring='neg_mean_squared_error')
lr_grid.fit(X_train, y_train)

# Perform hyperparameter tuning on Random Forest
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='neg_mean_squared_error')
rf_grid.fit(X_train, y_train)

# Perform hyperparameter tuning on Gradient Boosting Tree
gbt_grid = GridSearchCV(gbt, gbt_params, cv=5, scoring='neg_mean_squared_error')
gbt_grid.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for Linear Regression: ", lr_grid.best_params_)
print("Best parameters for Random Forest: ", rf_grid.best_params_)
print("Best parameters for Gradient Boosting Tree: ", gbt_grid.best_params_)


Best parameters for Linear Regression:  {'fit_intercept': True}
Best parameters for Random Forest:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best parameters for Gradient Boosting Tree:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


# Predicting the best model

In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Use the best models to predict on the test set
lr_best_preds = lr_grid.predict(X_test)
rf_best_preds = rf_grid.predict(X_test)
gbt_best_preds = gbt_grid.predict(X_test)

# Calculate RMSE for the best models
lr_best_rmse = sqrt(mean_squared_error(y_test, lr_best_preds))
rf_best_rmse = sqrt(mean_squared_error(y_test, rf_best_preds))
gbt_best_rmse = sqrt(mean_squared_error(y_test, gbt_best_preds))

# Print RMSE for the best models
print(f"Best Linear Regression model has the RMSE of {lr_best_rmse}")
print(f"Best Random Forest model has the RMSE of {rf_best_rmse}")
print(f"Best Gradient Boosting Tree model has the RMSE of {gbt_best_rmse}")


Best Linear Regression model has the RMSE of 0.6366255102881898
Best Random Forest model has the RMSE of 0.19582583194183356
Best Gradient Boosting Tree model has the RMSE of 0.18044048049973138


In [16]:
print("Best Price Prediction Model: Gradient Boosting Tree")
print("Best Price Prediction RMSE:", gbt_best_rmse)

Best Price Prediction Model: Gradient Boosting Tree
Best Price Prediction RMSE: 0.18044048049973138


# Save the model

In [17]:
joblib.dump(gbt_best_rmse, 'best_star_rating_model.pkl')

['best_star_rating_model.pkl']

Inferance
=========

Regarding star rating prediction 

The best performing model is the Gradient boosting Tree with an RMSE of 0.180 suggests that, on average, the predicted star ratings differ from the actual star ratings by approximately 0.180 stars


