In [1]:
#IMPORT NECESSARY PACKAGES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
#LOADING THE DATASET

project= pd.read_csv("/Users/sumaiyairshad/Desktop/MS in Data Science/CAPSTONE PROJECT/E Commerce/E-commerce Customer Segmentation Dataset.csv")
project

Unnamed: 0,Customer_ID,Customer_Name,Age,Gender,Annual_Income,Spending_Score,Marital_Status,Product_Category,Years_as_Customer,Number_of_Orders,Average_Order_Value,Loyalty_Membership,Discount_Usage,Preferred_Payment_Method,Preferred_Delivery_Option,Device_Used,Last_Activity,Customer_Region,Review_Score,Customer_Segment
0,1,Michelle Charles,69,0,192704,60,1,Fashion,8,78,182.40,1,0,PayPal,Standard,Tablet,33,Asia,2,0
1,2,Kaylee Medina,62,0,165578,36,1,Sports,7,25,342.85,1,1,Credit Card,Standard,Desktop,307,Europe,4,1
2,3,Ronald Hoffman,52,0,66523,46,0,Home,4,68,275.57,1,0,Debit Card,Same-Day,Tablet,131,Asia,2,0
3,4,Sandra Mcguire,56,1,193559,83,0,Sports,5,88,97.62,1,1,Net Banking,Standard,Mobile,185,North America,1,1
4,5,Andrew Mcdonald,56,0,57461,60,0,Sports,5,86,438.02,0,1,Net Banking,Same-Day,Desktop,29,South America,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,21996,Tonya Kramer,22,1,60778,25,1,Home,2,41,642.65,1,1,Credit Card,Same-Day,Mobile,27,South America,2,1
21996,21997,Melvin Valencia,38,0,129255,14,0,Home,12,23,598.19,0,1,Debit Card,Same-Day,Desktop,135,Asia,2,0
21997,21998,Shannon Mitchell,58,0,194525,81,1,Sports,8,11,204.23,1,0,Credit Card,Standard,Desktop,301,Asia,1,2
21998,21999,Allison West,23,0,28331,82,1,Home,2,59,247.71,1,0,Credit Card,Same-Day,Mobile,157,North America,5,2


In [3]:
#CONVERTING THE CATEGORICAL COLUMNS INTO NUMERICAL COLUMNS USING ENCODING

one_hot_columns=['Product_Category','Preferred_Payment_Method','Device_Used','Customer_Region']
project=pd.get_dummies(project, columns=one_hot_columns)

In [4]:
#REMOVING IRRELEVANT FEATURES(Customer_ID and Customer_Name)

project_cleaned=project.drop(columns=['Customer_ID','Customer_Name'])

#TO MAKE SURE BOTH THE COLUMNS ARE DROPPED
project_cleaned.head()

Unnamed: 0,Age,Gender,Annual_Income,Spending_Score,Marital_Status,Years_as_Customer,Number_of_Orders,Average_Order_Value,Loyalty_Membership,Discount_Usage,...,Preferred_Payment_Method_Debit Card,Preferred_Payment_Method_Net Banking,Preferred_Payment_Method_PayPal,Device_Used_Desktop,Device_Used_Mobile,Device_Used_Tablet,Customer_Region_Asia,Customer_Region_Europe,Customer_Region_North America,Customer_Region_South America
0,69,0,192704,60,1,8,78,182.4,1,0,...,0,0,1,0,0,1,1,0,0,0
1,62,0,165578,36,1,7,25,342.85,1,1,...,0,0,0,1,0,0,0,1,0,0
2,52,0,66523,46,0,4,68,275.57,1,0,...,1,0,0,0,0,1,1,0,0,0
3,56,1,193559,83,0,5,88,97.62,1,1,...,0,1,0,0,1,0,0,0,1,0
4,56,0,57461,60,0,5,86,438.02,0,1,...,0,1,0,1,0,0,0,0,0,1


In [5]:
#FEATURE ENGINEERING
#LET ME ADD SOME FEATURES AND THEN DO THE EDA AGAIN TO GAIN SOME MORE UNDERSTANDING

#Wealthier customers might spend differently compared to others.
project_cleaned['Income_Spending_Interaction'] = project_cleaned['Annual_Income'] * project_cleaned['Spending_Score']

#Long-term customers with loyalty membership might exhibit unique behaviors.
project_cleaned['Loyalty_Customer_Years'] = project_cleaned['Years_as_Customer'] * project_cleaned['Loyalty_Membership']

#High spenders who frequently order may indicate a premium customer segment.
project_cleaned['Order_Value_Frequency'] = project_cleaned['Average_Order_Value'] * project_cleaned['Number_of_Orders']

project_cleaned.columns

Index(['Age', 'Gender', 'Annual_Income', 'Spending_Score', 'Marital_Status',
       'Years_as_Customer', 'Number_of_Orders', 'Average_Order_Value',
       'Loyalty_Membership', 'Discount_Usage', 'Preferred_Delivery_Option',
       'Last_Activity', 'Review_Score', 'Customer_Segment',
       'Product_Category_Beauty', 'Product_Category_Electronics',
       'Product_Category_Fashion', 'Product_Category_Home',
       'Product_Category_Sports', 'Preferred_Payment_Method_Credit Card',
       'Preferred_Payment_Method_Debit Card',
       'Preferred_Payment_Method_Net Banking',
       'Preferred_Payment_Method_PayPal', 'Device_Used_Desktop',
       'Device_Used_Mobile', 'Device_Used_Tablet', 'Customer_Region_Asia',
       'Customer_Region_Europe', 'Customer_Region_North America',
       'Customer_Region_South America', 'Income_Spending_Interaction',
       'Loyalty_Customer_Years', 'Order_Value_Frequency'],
      dtype='object')

In [6]:
#Calculating orders per year to standardize customer behavior
project_cleaned['Orders_per_Year'] = project_cleaned['Number_of_Orders'] / project_cleaned['Years_as_Customer']

#Ratio of Spending_Score to Average_Order_Value.
project_cleaned['Spending_Efficiency'] = project_cleaned['Spending_Score'] / project_cleaned['Average_Order_Value']


In [7]:
#Discount Propensity: Creating a binary feature for frequent discount users.

threshold=1 
project_cleaned['Frequent_Discount_User'] = (project_cleaned['Discount_Usage'] > threshold).astype(int)
project_cleaned.tail(56)

#This creates a binary feature where users who have used at least one discount 
#(i.e., Discount_Usage = 1) will be marked as 1 in the Frequent_Discount_User column,
#and others will be marked as 0.

Unnamed: 0,Age,Gender,Annual_Income,Spending_Score,Marital_Status,Years_as_Customer,Number_of_Orders,Average_Order_Value,Loyalty_Membership,Discount_Usage,...,Customer_Region_Asia,Customer_Region_Europe,Customer_Region_North America,Customer_Region_South America,Income_Spending_Interaction,Loyalty_Customer_Years,Order_Value_Frequency,Orders_per_Year,Spending_Efficiency,Frequent_Discount_User
21944,32,0,103468,92,1,4,87,774.47,0,0,...,0,0,0,1,9519056,0,67378.89,21.75,0.118791,0
21945,65,1,128109,3,0,2,86,448.52,1,1,...,0,0,1,0,384327,2,38572.72,43.0,0.006689,0
21946,34,0,59336,30,1,4,95,948.97,0,1,...,0,0,1,0,1780080,0,90152.15,23.75,0.031613,0
21947,49,1,132646,23,1,1,78,220.07,0,0,...,0,1,0,0,3050858,0,17165.46,78.0,0.104512,0
21948,23,0,191161,98,0,12,89,287.84,1,1,...,1,0,0,0,18733778,12,25617.76,7.416667,0.340467,0
21949,53,1,32880,26,1,7,94,355.14,1,1,...,1,0,0,0,854880,7,33383.16,13.428571,0.073211,0
21950,69,0,160329,4,1,14,71,829.87,1,0,...,0,0,0,1,641316,14,58920.77,5.071429,0.00482,0
21951,19,1,136906,71,0,6,87,222.43,0,0,...,1,0,0,0,9720326,0,19351.41,14.5,0.319202,0
21952,62,0,93445,61,1,11,88,157.07,0,0,...,0,0,1,0,5700145,0,13822.16,8.0,0.388362,0
21953,65,0,196661,12,1,13,7,766.2,1,1,...,0,0,1,0,2359932,13,5363.4,0.538462,0.015662,0


In [8]:
#CHECKING WHICH COLUMN IS STILL AN OBJECT
#THIS SHOWS PREFERRED_DELIVERY_OPTION IS STILL AN OBJECT
project_cleaned.dtypes

Age                                       int64
Gender                                    int64
Annual_Income                             int64
Spending_Score                            int64
Marital_Status                            int64
Years_as_Customer                         int64
Number_of_Orders                          int64
Average_Order_Value                     float64
Loyalty_Membership                        int64
Discount_Usage                            int64
Preferred_Delivery_Option                object
Last_Activity                             int64
Review_Score                              int64
Customer_Segment                          int64
Product_Category_Beauty                   uint8
Product_Category_Electronics              uint8
Product_Category_Fashion                  uint8
Product_Category_Home                     uint8
Product_Category_Sports                   uint8
Preferred_Payment_Method_Credit Card      uint8
Preferred_Payment_Method_Debit Card     

In [9]:
#HERE, I'LL CONVERT PREFERRED DELIVERY OPTION INTO NUMERICAL FEATURE USING LABEL ENCODING

# Define a custom order for categories
preferred_order = {'Standard': 0, 'Same-Day': 1, 'Express': 2}  # Example categories

# Map the categories to the custom order and overwrite the original column
project_cleaned['Preferred_Delivery_Option'] = project_cleaned['Preferred_Delivery_Option'].map(preferred_order)

# Now check the data type
print(project_cleaned.dtypes)  # This will show that 'Preferred_Delivery_Option' is now int64, not object


Age                                       int64
Gender                                    int64
Annual_Income                             int64
Spending_Score                            int64
Marital_Status                            int64
Years_as_Customer                         int64
Number_of_Orders                          int64
Average_Order_Value                     float64
Loyalty_Membership                        int64
Discount_Usage                            int64
Preferred_Delivery_Option                 int64
Last_Activity                             int64
Review_Score                              int64
Customer_Segment                          int64
Product_Category_Beauty                   uint8
Product_Category_Electronics              uint8
Product_Category_Fashion                  uint8
Product_Category_Home                     uint8
Product_Category_Sports                   uint8
Preferred_Payment_Method_Credit Card      uint8
Preferred_Payment_Method_Debit Card     

In [10]:
#FEATURE SELECTION COMBINING ALL THE THREE METHODS

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import numpy as np


X = project_cleaned.drop(columns=['Customer_Segment'])  # Features (excluding the target)
y = project_cleaned['Customer_Segment']  # Target variable

#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#Univariate Feature Selection (SelectKBest with f_classif)
univariate_selector = SelectKBest(f_classif, k=10)  # Select top 10 features (change as needed)
X_train_univariate = univariate_selector.fit_transform(X_train, y_train)
selected_univariate_features = X.columns[univariate_selector.get_support()]

#Recursive Feature Elimination (RFE) using Logistic Regression
rfe_model = LogisticRegression(max_iter=1000, random_state=42)
rfe_selector = RFE(rfe_model, n_features_to_select=10)  # Select top 10 features (change as needed)
X_train_rfe = rfe_selector.fit_transform(X_train, y_train)
selected_rfe_features = X.columns[rfe_selector.get_support()]

#Feature Importance using Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
importances = rf_model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
selected_rf_features = importance_df.head(10)['Feature'].values  # Top 10 features (change as needed)

#Combine features selected by all methods
# We can take the intersection of features selected by the three methods
combined_selected_features = list(set(selected_univariate_features) & set(selected_rfe_features) & set(selected_rf_features))

#Display the final selected features
print("Final Selected Features:", combined_selected_features)

#Transform the dataset to use only the selected features
X_train_final = X_train[combined_selected_features]
X_test_final = X_test[combined_selected_features]

  f = msb / msw


Final Selected Features: ['Spending_Score', 'Average_Order_Value', 'Income_Spending_Interaction', 'Orders_per_Year']


In [11]:
#MODEL BUILDING


#USING XGBoost Classifier 

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Fit the XGBoost model on the selected features of the training data
xgb_model.fit(X_train_final, y_train)

# Make predictions on the test data
xgb_predictions = xgb_model.predict(X_test_final)

# Evaluate the XGBoost model
print("XGBoost Model Accuracy:", accuracy_score(y_test, xgb_predictions))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))



Parameters: { "use_label_encoder" } are not used.



XGBoost Model Accuracy: 0.32727272727272727
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.33      0.33      1467
           1       0.32      0.32      0.32      1458
           2       0.33      0.33      0.33      1475

    accuracy                           0.33      4400
   macro avg       0.33      0.33      0.33      4400
weighted avg       0.33      0.33      0.33      4400



In [12]:
#USING GRADIENTBOOSTING

# Gradient Boosting Classifier using selected features

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Fit the Gradient Boosting model on the selected features of the training data
gb_model.fit(X_train_final, y_train)

# Make predictions on the test data
gb_predictions = gb_model.predict(X_test_final)

# Evaluate the Gradient Boosting model
print("Gradient Boosting Model Accuracy:", accuracy_score(y_test, gb_predictions))
print("Gradient Boosting Classification Report:\n", classification_report(y_test, gb_predictions))


Gradient Boosting Model Accuracy: 0.32704545454545453
Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.32      0.32      1467
           1       0.32      0.29      0.31      1458
           2       0.33      0.37      0.35      1475

    accuracy                           0.33      4400
   macro avg       0.33      0.33      0.33      4400
weighted avg       0.33      0.33      0.33      4400



In [None]:
#USING SVM

# SVM Classifier using selected features

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize the SVM model
svm_model = SVC(kernel='linear')

# Fit the SVM model on the selected features of the training data
svm_model.fit(X_train_final, y_train)

# Make predictions on the test data
svm_predictions = svm_model.predict(X_test_final)

# Evaluate the SVM model
print("SVM Model Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM Classification Report:\n", classification_report(y_test, svm_predictions))
