# Shopping Trends Data Modeling

### Import Libraries

In [23]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualization
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

### Workflow

To fit and train this model, we’ll be following [The Machine Learning Workflow infographic](https://www.datacamp.com/blog/a-beginner-s-guide-to-the-machine-learning-workflow); however, as our data is pretty clean, we won’t be carrying out every step. We will do the following:

* Feature engineering
* Split the data
* Train the model
* Hyperparameter tuning
* Assess model performance


In [24]:
df = pd.read_csv("data/shopping_trends_final.csv")
# df = df.iloc[:, 1:]
df.head()

Unnamed: 0.1,Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              3900 non-null   int64  
 1   Customer ID             3900 non-null   int64  
 2   Age                     3900 non-null   int64  
 3   Gender                  3900 non-null   object 
 4   Item Purchased          3900 non-null   object 
 5   Category                3900 non-null   object 
 6   Purchase Amount (USD)   3900 non-null   int64  
 7   Location                3900 non-null   object 
 8   Size                    3900 non-null   object 
 9   Color                   3900 non-null   object 
 10  Season                  3900 non-null   object 
 11  Review Rating           3900 non-null   float64
 12  Subscription Status     3900 non-null   object 
 13  Shipping Type           3900 non-null   object 
 14  Discount Applied        3900 non-null   

In [31]:
df.isna().sum()

Unnamed: 0                0
Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [32]:
df.duplicated().sum()

0

### Preprocessing Data for Random Forests

Tree-based models are much more robust to outliers than linear models, and they do not need variables to be normalized to work. As such, we need to do very little preprocessing on our data.

In [26]:
object_df = df.select_dtypes(["object"])
object_df.head()

Unnamed: 0,Gender,Item Purchased,Category,Location,Size,Color,Season,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Payment Method,Frequency of Purchases
0,Male,Blouse,Clothing,Kentucky,L,Gray,Winter,Yes,Express,Yes,Yes,Venmo,Fortnightly
1,Male,Sweater,Clothing,Maine,L,Maroon,Winter,Yes,Express,Yes,Yes,Cash,Fortnightly
2,Male,Jeans,Clothing,Massachusetts,S,Maroon,Spring,Yes,Free Shipping,Yes,Yes,Credit Card,Weekly
3,Male,Sandals,Footwear,Rhode Island,M,Maroon,Spring,Yes,Next Day Air,Yes,Yes,PayPal,Weekly
4,Male,Blouse,Clothing,Oregon,M,Turquoise,Spring,Yes,Free Shipping,Yes,Yes,PayPal,Annually


In [27]:
int_var = df.select_dtypes(["int64", "float64"])
int_var.head()

Unnamed: 0.1,Unnamed: 0,Customer ID,Age,Purchase Amount (USD),Review Rating,Previous Purchases
0,0,1,55,53,3.1,14
1,1,2,19,64,3.1,2
2,2,3,50,73,3.1,23
3,3,4,21,90,3.5,49
4,4,5,45,49,2.7,31


In [33]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

def encoder(data):
    data = enc.fit_transform(data)
    
for col in object_df.columns:
    object_df[col] = encoder(object_df[col])
    
# Print the encoded data
object_df.head()


Unnamed: 0,Gender_Female,Gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,Item Purchased_Gloves,Item Purchased_Handbag,...,Payment Method_Debit Card,Payment Method_PayPal,Payment Method_Venmo,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [36]:
import pandas as pd

def limited_one_hot_encode_dataframe(df, columns, limit=5):
  """
  Encodes categorical variables in a pandas DataFrame using get_dummies with a limit on the number of new columns.

  Args:
    df: pandas.DataFrame containing the data.
    columns: A list of column names to be encoded.
    limit: Maximum number of new columns to create for each encoded feature (default: 5).

  Returns:
    pandas.DataFrame with limited encoded columns.
  """

  encoded_columns = []
  for col in columns:
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
    if len(dummies.columns) > limit:
      # Select and keep only the `limit` most frequent categories
      dummies = dummies.mode(axis=0, dropna=True)
    encoded_columns.append(dummies)
  encoded_df = pd.concat([df, *encoded_columns], axis=1)
  return encoded_df


encoded_df = limited_one_hot_encode_dataframe(object_df, object_df.columns, limit=5)

print("Encoded DataFrame with limited columns:")
print(encoded_df)


Encoded DataFrame with limited columns:
      Gender_Female  Gender_Male  Item Purchased_Backpack  \
0                 0            0                        0   
1                 0            0                        0   
2                 0            0                        0   
3                 0            0                        0   
4                 0            0                        0   
...             ...          ...                      ...   
3895              0            0                        0   
3896              0            0                        0   
3897              0            0                        0   
3898              0            0                        0   
3899              0            0                        0   

      Item Purchased_Belt  Item Purchased_Blouse  Item Purchased_Boots  \
0                       0                      0                     0   
1                       0                      0                     0   
2    

In [29]:
for column in object_df.columns:
    print("\n", object_df[column].unique())


 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]

 [1 0]

 [0 1]

 [1 0]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [0 1]

 [1 0]

 [0 1]
