# E-commerce Product Recommendation System

- Name: Nyein Chan Aung
- Student ID: st125553

In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive; the relative
# 'data/...' and 'model/...' paths used later in this notebook resolve against it.
# NOTE(review): this absolute path is tied to one Drive layout - adjust it when
# running the notebook from a different account or machine.
import os
os.chdir('/content/drive/MyDrive/CP4DSAI/_Project/_FinalProject/')

## Import required libraries

In [None]:
#Library for edit dataset
import pandas as pd
import numpy as np
import datetime as dp

#Library for visualization
import seaborn as sns
import matplotlib.pyplot as plt

!pip install plotly

import plotly.graph_objects as go
from plotly.subplots import make_subplots


!pip install folium
import folium
from folium.plugins import StripePattern
import branca.colormap
from collections import defaultdict
from folium.plugins import HeatMap

#Calculate distance on latitude and longitude
from math import radians, cos, sin, asin, sqrt

#Library to find correlation in categorical data
from pandas import factorize

# Modeling
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.utils import resample

import matplotlib.pyplot as plt
# !pip install squarify
import squarify
warnings.filterwarnings('ignore')

### Load Dataset

In [None]:
# Load the cleaned dataset from CSV (path is relative to the working directory
# set at the top of the notebook) and preview the first rows.
cleaned_df = pd.read_csv('data/_cleaned_df.csv')
cleaned_df.head()

In [None]:
cleaned_df.info()

In [None]:
len(cleaned_df.cust_id.unique())

In [None]:
# Read the user segmentation table and drop its 'Unnamed: 0' column in one step.
seg_user_df = (
    pd.read_csv('data/_user_segmentation.csv')
    .drop(columns=['Unnamed: 0'])
)
# Non-null count per remaining column.
seg_user_df.count()

## Feature Selection

In [None]:
# generate pivot table for category with total number of sale


In [None]:
# Scatter plot of total amount against discount percent for every row of cleaned_df

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(cleaned_df.total, cleaned_df.discount_percent, alpha=0.5)
ax.set_title('Total Amount and Discount Percent')
ax.set_xlabel('Total Amount (usd$)')
ax.set_ylabel('Discount Percent (%)')
ax.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# One violin per category showing the density and spread of discount percentages
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=cleaned_df, x='category', y='discount_percent', palette="Set3", ax=ax)
ax.set_title('Violin Plot of Discount Percentages by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Discount Percent (%)')
# Tilt the category labels so they stay readable
plt.xticks(rotation=45)
fig.tight_layout()

plt.show()


In [None]:
# Row count for every (payment_method, category) pair
payment_category_count = cleaned_df.groupby('payment_method')['category'].value_counts()

# Pivot the categories into columns, then draw them as stacked lines over payment method
payment_category_table = payment_category_count.unstack()
payment_category_table.plot.line(stacked=True, figsize=(10, 6))


In [None]:
# Row count for every (state, category) pair
state_category_count = cleaned_df.groupby('state')['category'].value_counts()

# Pivot the categories into columns, then draw them as stacked lines over state
state_category_table = state_category_count.unstack()
state_category_table.plot.line(stacked=True, figsize=(12, 6))

In [None]:
# Row count for every (gender, category) pair
gender_category_count = cleaned_df.groupby('gender')['category'].value_counts()

# Pivot the categories into columns, then draw one stacked bar per gender
gender_category_table = gender_category_count.unstack()
gender_category_table.plot.bar(stacked=True, figsize=(10, 6))

In [None]:
# Row count for every (age, category) pair
age_category_count = cleaned_df.groupby('age')['category'].value_counts()

# Pivot the categories into columns, then draw them as stacked lines over age
age_category_table = age_category_count.unstack()
age_category_table.plot.line(stacked=True, figsize=(10, 6))


According to the analysis above, the following features influence which category is sold:
- Age
- Payment Method
- State
- Discount Percent
- Note: there is no significant variation by Gender, so it is not used as a feature.

In [None]:
cleaned_df.category.unique()

In [None]:
# Attach each customer's segment to the order rows; the left join keeps every
# row of cleaned_df even when no segmentation entry matches its cust_id.
merged_df = cleaned_df.merge(seg_user_df, on='cust_id', how='left')
merged_df.head()

In [None]:
merged_df.k_means_segment.unique()

In [None]:
# Keep only the identifier, the selected features and the target column
selected_columns = [
    'cust_id',
    'age',
    'payment_method',
    'state',
    'discount_percent',
    'k_means_segment',
    'category',
]
merged_df = merged_df.filter(items=selected_columns)
merged_df.head()


## Model Training

In [None]:
merged_df.isna().sum()

In [None]:
merged_df.cust_id.value_counts()

In [None]:
merged_df.payment_method.value_counts()

In [None]:
merged_df.state.value_counts()

In [None]:
len(merged_df.state.unique())

In [None]:
merged_df.discount_percent.value_counts()

In [None]:
merged_df.k_means_segment.unique()

In [None]:
merged_df.age.value_counts()

In [None]:
merged_df.info()

#### Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# One encoder per categorical column; they are kept as separate objects because
# each one is pickled individually later in the notebook.
Payment_encoder = LabelEncoder()
State_encoder = LabelEncoder()
Category_encoder = LabelEncoder()
K_Mean_encoder = LabelEncoder()

Percent_scaler = MinMaxScaler()

# Replace every categorical column by its integer codes
label_encoders = {
    'payment_method': Payment_encoder,
    'state': State_encoder,
    'k_means_segment': K_Mean_encoder,
    'category': Category_encoder,
}
for column, encoder in label_encoders.items():
    merged_df[column] = encoder.fit_transform(merged_df[column])

# Rescale the discount percentage with the min-max scaler (expects 2-D input)
merged_df['discount_percent'] = Percent_scaler.fit_transform(merged_df[['discount_percent']])

merged_df


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column of merged_df except the customer identifier
# ('cust_id') and the prediction target ('category').
X = merged_df.drop(columns=['cust_id', 'category'])
X

In [None]:
# Target vector: the label-encoded product category.
y = merged_df['category']
y

### Adjust class imbalance by SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print("Class distribution before oversampling:", y.value_counts())

In [None]:
# Oversample with SMOTE. With sampling_strategy='auto' imbalanced-learn resamples
# every class except the majority one, so this is not limited to a single class.
# NOTE(review): resampling runs on the full X / y before the train/test split that
# follows, so synthetic rows can land in the test set - consider splitting first
# and resampling the training portion only.
# NOTE(review): payment_method, state and k_means_segment are label-encoded codes;
# SMOTE presumably interpolates between neighbours and may produce in-between
# values for those columns - confirm this is acceptable (SMOTENC may fit better).
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Count the class distribution after oversampling
print("Class distribution after oversampling:",  y_resampled.value_counts())

#### Split and train

In [None]:
# 80/20 split of the resampled data with a fixed seed so the partition is repeatable.
# NOTE(review): no stratify= argument is passed, so class proportions in the two
# parts are left to the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, train_size=0.8, random_state=42)
X_train

In [None]:
y_train

In [None]:
y_train.value_counts()

In [None]:
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# import pandas as pd


# # Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# # Define models and hyperparameters
# param_grid = {
#     # 'LogisticRegression': {
#     #     'model': [LogisticRegression(max_iter=500)],
#     #     'model__C': [0.1, 1, 10],
#     #     'model__penalty': ['l2']
#     # },
#     # 'RandomForest': {
#     #     'model': [RandomForestClassifier(random_state=42)],
#     #     'model__n_estimators': [100, 200, 300],
#     #     'model__max_depth': [None, 10, 20],
#     #     'model__min_samples_split': [2, 5]
#     # },
#     'SVC': {
#         'model': [SVC()],
#         'model__C': [0.1, 1, 10],
#         'model__kernel': ['linear', 'rbf']
#     },
#     'XGBClassifier': {
#         'model': [XGBClassifier(random_state=42)],
#         'model__learning_rate': [0.01, 0.1, 0.2],
#         'model__n_estimators': [100, 200],
#         'model__max_depth': [3, 5, 7]
#     },

#     'LightGBM': {
#         'pipeline__model': [LGBMClassifier(random_state=42)],
#         'pipeline__model__learning_rate': [0.01, 0.1],
#         'pipeline__model__n_estimators': [50, 100],
#         'pipeline__model__max_depth': [5, 10],
#     },
# }

# # Loop through the parameter grid and find the best model
# best_model = None
# best_params = None
# best_score = 0
# results = []

# for model_name, grid in param_grid.items():
#     print(f"Running GridSearchCV for {model_name}...")
#     pipeline = Pipeline([
#         ('scaler', StandardScaler()),  # Add scaling for algorithms sensitive to feature magnitude
#         ('model', grid['model'][0])   # Placeholder for the model
#     ])

#     # Update grid keys to match pipeline parameters
#     grid_search = GridSearchCV(pipeline, param_grid={
#         f"model__{k.split('__')[1]}": v for k, v in grid.items() if k.startswith('model__')
#     }, cv=5, scoring='accuracy', n_jobs=-1)

#     # Fit the grid search
#     grid_search.fit(X_train, y_train)

#     # Record the results
#     if grid_search.best_score_ > best_score:
#         best_model = model_name
#         best_params = grid_search.best_params_
#         best_score = grid_search.best_score_

#     results.append({
#         'Model': model_name,
#         'Best Params': grid_search.best_params_,
#         'Best CV Score': grid_search.best_score_
#     })

# # Display results
# results_df = pd.DataFrame(results)
# print("\nGrid Search Results:")
# print(results_df)

# # Best model and parameters
# print(f"\nBest Model: {best_model}")
# print(f"Best Parameters: {best_params}")
# print(f"Best Cross-Validation Score: {best_score}")

# # Evaluate the best model on the test set
# final_model = pipeline.set_params(**best_params)
# final_model.fit(X_train, y_train)
# y_pred = final_model.predict(X_test)

# print("\nTest Set Evaluation:")
# print(classification_report(y_test, y_pred))


In [None]:
# from sklearn.ensemble import RandomForestClassifier

In [None]:
# model = RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion="entropy")
# model.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import joblib

# Feature selection.
# X is built from only a handful of columns, so a hard-coded k=20 asks for more
# features than exist. Depending on the scikit-learn version that either raises
# or silently keeps everything. Cap k at the number of available columns so the
# cell runs everywhere and the intent is explicit.
n_selected_features = min(20, X_train.shape[1])
selector = SelectKBest(score_func=f_classif, k=n_selected_features)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Train a deliberately small Random Forest so the saved model stays compact
model = RandomForestClassifier(
    n_estimators=50,           # Few trees -> smaller file, faster training
    max_depth=10,              # Limit depth
    min_samples_split=5,       # Prevent overfitting
    min_samples_leaf=3,
    random_state=42
)
model.fit(X_train_selected, y_train)

# Save model with compression.
# The selector itself is not persisted: because k is capped at the column count
# every feature is kept in its original order, so inference can feed the raw
# feature vector straight to the model.
joblib.dump(model, 'model/rf_model_compressed.pkl', compress=('gzip', 3))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report
import joblib

# Split the dataset into training and testing sets.
# NOTE(review): this repeats the earlier split on the same resampled data with the
# same seed (test_size=0.2 here, train_size=0.8 there) and overwrites
# X_train / X_test / y_train / y_test; it should reproduce the same partition - confirm.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the pipeline: scale -> keep the 3 best-scoring features -> Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),                  # Optional: Scale features (may not affect Random Forest)
    ('selector', SelectKBest(score_func=f_classif, k=3)),  # Feature selection: keeps 3 of the input columns
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))  # Random Forest model
])

# Hyperparameter grid: 3 * 3 * 2 * 2 * 2 = 72 combinations.
# NOTE(review): only 3 features reach the forest (k=3 above), so 'sqrt' and 'log2'
# both appear to resolve to a single feature per split; the two max_features
# options may therefore be duplicates that double the search time - confirm.
param_grid = {
    'rf__n_estimators': [50, 100, 200],  # Number of trees to try
    'rf__max_depth': [10, 20, None],     # Limit tree depth to control memory usage and overfitting
    'rf__min_samples_split': [5, 10],    # Avoid deep splits by increasing minimum samples to split a node
    'rf__min_samples_leaf': [2, 4],      # Larger leaf size reduces tree complexity
    'rf__max_features': ['sqrt', 'log2']  # Consider fewer features per split for efficiency
}

# Perform Grid Search CV to tune hyperparameters (72 combinations x 3 folds = 216 fits)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,                    # Reduce CV folds to save computation time
    scoring='accuracy',      # Use accuracy as the scoring metric
    n_jobs=-1                # Utilize all available CPU cores
)

# Fit the pipeline with GridSearchCV
grid_search.fit(X_train, y_train)

# Output the best model and parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Evaluate the final model on the test set.
# NOTE(review): X_test comes from the SMOTE-resampled data, so this report is
# computed partly on synthetic rows.
y_pred = grid_search.best_estimator_.predict(X_test)
print("\nTest Set Evaluation:")
print(classification_report(y_test, y_pred))

# Save the final pipeline to disk.
# This file is written to the working directory (not 'model/'); the evaluation
# section of this notebook loads 'model/rf_model_compressed.pkl' instead.
joblib.dump(grid_search.best_estimator_, 'optimized_random_forest_pipeline.pkl')


In [None]:
import pickle

In [None]:
# with open("model/recommender_model.pkl", "wb") as file:
#     pickle.dump(model, file)

In [None]:
# Persist the fitted payment-method LabelEncoder; the evaluation section reloads it from this path.
with open("model/payment_encoder.pkl", "wb") as file:
    pickle.dump(Payment_encoder, file)

In [None]:
# Persist the fitted state LabelEncoder; the evaluation section reloads it from this path.
with open("model/state_encoder.pkl", "wb") as file:
    pickle.dump(State_encoder, file)

In [None]:
# Persist the fitted category (target) LabelEncoder; the evaluation section reloads it from this path.
with open("model/category_encoder.pkl", "wb") as file:
    pickle.dump(Category_encoder, file)

In [None]:
# Persist the fitted customer-segment LabelEncoder; the evaluation section reloads it from this path.
with open("model/k_mean_encoder.pkl", "wb") as file:
    pickle.dump(K_Mean_encoder, file)

In [None]:
# Persist the fitted discount-percent MinMaxScaler; the evaluation section reloads
# it from this path (under the name Percent_encoder).
with open("model/percent_scaler.pkl", "wb") as file:
    pickle.dump(Percent_scaler, file)

In [None]:
# Save the held-out feature matrix from the most recent train/test split.
# NOTE(review): written to the working directory, unlike the encoders under 'model/'.
with open("X_test.pkl", "wb") as file:
    pickle.dump(X_test, file)

In [None]:
# Save the held-out labels that pair with X_test.pkl.
# NOTE(review): written to the working directory, unlike the encoders under 'model/'.
with open("y_test.pkl", "wb") as file:
    pickle.dump(y_test, file)

## Evaluation

In [None]:
import pickle
import numpy as np

In [None]:
# with open("model/recommender_model.pkl", "rb") as file:
#     model = pickle.load(file)
# Reload the artefacts written by the training section so this part of the
# notebook works from the saved files.
# NOTE: joblib.load / pickle.load can execute arbitrary code embedded in a file -
# only load files produced by this notebook.
# The model loaded here is the compressed Random Forest, not the grid-search pipeline.
model = joblib.load('model/rf_model_compressed.pkl')
with open("model/k_mean_encoder.pkl", "rb") as file:
    K_Mean_encoder = pickle.load(file)
with open("model/payment_encoder.pkl", "rb") as file:
    Payment_encoder = pickle.load(file)
with open("model/state_encoder.pkl", "rb") as file:
    State_encoder = pickle.load(file)
with open("model/category_encoder.pkl", "rb") as file:
    Category_encoder = pickle.load(file)
with open("model/percent_scaler.pkl", "rb") as file:
    # Saved from Percent_scaler but bound to Percent_encoder here;
    # get_recommended_products relies on this name.
    Percent_encoder = pickle.load(file)

In [None]:
# age	payment_method	state	discount_percent	k_means_segment
def get_recommended_products(num = 3):
    """Interactively collect a customer profile and rank product categories.

    Prompts on stdin for the five model features in training order
    (age, payment method, state, discount percent, customer segment), encodes
    them with the fitted encoders / scaler available in the notebook namespace
    (``Payment_encoder``, ``State_encoder``, ``Percent_encoder``,
    ``K_Mean_encoder``) and ranks every category known to ``model`` by its
    predicted class probability.

    Parameters
    ----------
    num : int, default 3
        Maximum number of categories to return.

    Returns
    -------
    list of (category, probability) tuples
        Sorted by descending probability and truncated to ``num`` entries.
        An empty list is returned when an input cannot be parsed or encoded.
    """
    X = list()
    try:
        # input() always returns text: numeric fields must be converted
        # explicitly, otherwise the feature vector becomes a string array.
        age = float(input("Age: "))
        X.append(age)

        payment_method = input("Payment Method: ")
        X.append(Payment_encoder.transform([payment_method])[0])

        state = input("State: ")
        X.append(State_encoder.transform([state])[0])

        discount_percent = float(input("Discount Percent: "))
        X.append(Percent_encoder.transform([[discount_percent]])[0][0])

        customer_type = input("Customer Type: ")
        X.append(K_Mean_encoder.transform([customer_type])[0])

        print(f"X Value: {X}")

    except ValueError:
        # Raised by float() on non-numeric text and by the encoders on labels
        # they were not fitted on. Stop here instead of predicting on a
        # partially filled feature vector.
        print("Please enter the correct inputs!")
        return []

    # Single sample as a 2-D float array: one row, one column per feature
    features = np.array(X, dtype=float).reshape(1, -1)

    # One probability per class; model.classes_ holds the encoded labels in the
    # same order, so map them back to the original category names.
    probabilities = model.predict_proba(features)[0]
    categories = Category_encoder.inverse_transform(model.classes_)

    Category_list = {
        category: float(probability)
        for category, probability in zip(categories, probabilities)
    }

    return sorted(Category_list.items(), key=lambda x:x[1], reverse=True)[:num]

- Age: 34 (int)
- PaymentMethod: cod/ Payaxis/ Easypay (string)
- State: OK/ FL/ ND (string/ Code)
- DiscountPercent: 10 (float)
- CustomerType: best/ regular/ loyal (string/Kmean)



In [None]:
get_recommended_products(num=15)

## Conclusion