In [24]:
# loading in and merging data 

import pandas as pd

customers_df = pd.read_csv("/Users/salehbhatti/Downloads/customers.csv")
articles_df = pd.read_csv("/Users/salehbhatti/Downloads/articles.csv")
transactions_iter = pd.read_csv("/Users/salehbhatti/Downloads/transactions_train.csv", chunksize=1000000)
transactions_df = next(transactions_iter)

merged_df = transactions_df.merge(articles_df, on="article_id", how="left")
merged_df = merged_df.merge(customers_df, on="customer_id", how="left")
print(merged_df.columns)  

# identifying major categories
print(merged_df["product_group_name"].value_counts())
top_categories = merged_df["product_group_name"].value_counts().index[:9]


merged_df = merged_df[merged_df["product_group_name"].isin(top_categories)]

# Label encoding majore categories
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
merged_df["target"] = label_encoder.fit_transform(merged_df["product_group_name"])

category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(category_mapping)


Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
product_group_name
Garment Upper body     499296
Garment Lower body     231284
Underwear               63964
Garment Full body       61497
Accessories             53055
Socks & Tights          31011
Shoes                   24995
Swimwear                2

In [27]:
# data processing and feature engineering 

from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["club_member_status", "fashion_news_frequency", "sales_channel_id"]
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = encoder.fit_transform(merged_df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

#total customer purchases feature
customer_total_purchases = merged_df.groupby("customer_id")["article_id"].count().reset_index()
customer_total_purchases.rename(columns={"article_id": "total_purchases"}, inplace=True)

merged_df = merged_df.merge(customer_total_purchases, on="customer_id", how="left")

# days since last purchase feature
merged_df["t_dat"] = pd.to_datetime(merged_df["t_dat"])  
last_purchase = merged_df.groupby("customer_id")["t_dat"].max().reset_index()
last_purchase["days_since_last_purchase"] = (merged_df["t_dat"].max() - last_purchase["t_dat"]).dt.days

merged_df = merged_df.merge(last_purchase[["customer_id", "days_since_last_purchase"]], on="customer_id", how="left")

# customer purchases by category
customer_category_counts = merged_df.groupby(["customer_id", "product_group_name"])["article_id"].count().reset_index()
customer_category_counts.rename(columns={"article_id": "purchase_count"}, inplace=True)

customer_purchase_history = customer_category_counts.pivot(index="customer_id", columns="product_group_name", values="purchase_count").fillna(0)

merged_df = merged_df.merge(customer_purchase_history, on="customer_id", how="left")

# Fill missing numerical values
merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
merged_df["total_purchases"].fillna(0, inplace=True)
merged_df["days_since_last_purchase"].fillna(merged_df["days_since_last_purchase"].median(), inplace=True)

# Concatenate encoded categorical features with merged_df
merged_df = pd.concat([merged_df, encoded_df], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [28]:
# Finalizing training data
from sklearn.preprocessing import StandardScaler

selected_features = ["price", "age", "total_purchases", "days_since_last_purchase"]
selected_features += encoded_df.columns.tolist()
selected_features += customer_purchase_history.columns.tolist()

X = merged_df[selected_features]
y = merged_df["target"]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [21]:
# Split training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


In [22]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=300,  # More trees
    max_depth=30,  # Allow deeper trees
    min_samples_split=5,  # Reduce overfitting
    min_samples_leaf=3,  # Smaller leaves
    max_features="sqrt",  # Random subset of features
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)


In [23]:
# Make predictions and output results

from sklearn.metrics import accuracy_score, classification_report

y_pred_rf = rf_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))


Accuracy: 0.6419
                    precision    recall  f1-score   support

       Accessories       0.48      0.70      0.57     10611
 Garment Full body       0.42      0.69      0.53     12299
Garment Lower body       0.64      0.63      0.63     46257
Garment Upper body       0.87      0.59      0.70     99860
         Nightwear       0.27      0.70      0.39      2228
             Shoes       0.39      0.72      0.51      4999
    Socks & Tights       0.49      0.84      0.62      6202
          Swimwear       0.53      0.88      0.66      4544
         Underwear       0.55      0.76      0.64     12793

          accuracy                           0.64    199793
         macro avg       0.52      0.72      0.58    199793
      weighted avg       0.71      0.64      0.65    199793



In [11]:
# feature importance analysis 
feature_names = selected_features 
sorted_indices = np.argsort(feature_importances)[::-1]

print("Feature Importances:")
for i in sorted_indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

Feature Importances:
price: 0.1463
Swimwear: 0.1144
Nightwear: 0.1135
Shoes: 0.0900
Socks & Tights: 0.0888
Underwear: 0.0817
Garment Full body: 0.0784
Accessories: 0.0748
Garment Lower body: 0.0521
Garment Upper body: 0.0447
total_purchases: 0.0436
age: 0.0391
days_since_last_purchase: 0.0327
