In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np

In [9]:
# loading in and merging data 
customers_df = pd.read_csv("/Users/swetaasuresh/Desktop/customers.csv")
articles_df = pd.read_csv("/Users/swetaasuresh/Desktop/articles.csv")
transactions_iter = pd.read_csv("/Users/swetaasuresh/Desktop/transactions_train.csv", chunksize=1000000)
transactions_df = next(transactions_iter)

merged_df = transactions_df.merge(articles_df, on="article_id", how="left")
merged_df = merged_df.merge(customers_df, on="customer_id", how="left")
print(merged_df.columns)  

# identifying major categories
print(merged_df["product_group_name"].value_counts())
top_categories = merged_df["product_group_name"].value_counts().index[:9]


merged_df = merged_df[merged_df["product_group_name"].isin(top_categories)]

# Label encoding majore categories
label_encoder = LabelEncoder()
merged_df["target"] = label_encoder.fit_transform(merged_df["product_group_name"])

category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(category_mapping)


Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
product_group_name
Garment Upper body     499296
Garment Lower body     231284
Underwear               63964
Garment Full body       61497
Accessories             53055
Socks & Tights          31011
Shoes                   24995
Swimwear                2

In [10]:
# data processing and feature engineering 


categorical_cols = ["club_member_status", "fashion_news_frequency", "sales_channel_id"]
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = encoder.fit_transform(merged_df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

#total customer purchases feature
customer_total_purchases = merged_df.groupby("customer_id")["article_id"].count().reset_index()
customer_total_purchases.rename(columns={"article_id": "total_purchases"}, inplace=True)

merged_df = merged_df.merge(customer_total_purchases, on="customer_id", how="left")

# days since last purchase feature
merged_df["t_dat"] = pd.to_datetime(merged_df["t_dat"])  
last_purchase = merged_df.groupby("customer_id")["t_dat"].max().reset_index()
last_purchase["days_since_last_purchase"] = (merged_df["t_dat"].max() - last_purchase["t_dat"]).dt.days

merged_df = merged_df.merge(last_purchase[["customer_id", "days_since_last_purchase"]], on="customer_id", how="left")

# customer purchases by category
customer_category_counts = merged_df.groupby(["customer_id", "product_group_name"])["article_id"].count().reset_index()
customer_category_counts.rename(columns={"article_id": "purchase_count"}, inplace=True)

customer_purchase_history = customer_category_counts.pivot(index="customer_id", columns="product_group_name", values="purchase_count").fillna(0)

merged_df = merged_df.merge(customer_purchase_history, on="customer_id", how="left")

# Fill missing numerical values
merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
merged_df["total_purchases"].fillna(0, inplace=True)
merged_df["days_since_last_purchase"].fillna(merged_df["days_since_last_purchase"].median(), inplace=True)

# Concatenate encoded categorical features with merged_df
merged_df = pd.concat([merged_df, encoded_df], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [11]:
# Finalizing training data
selected_features = ["price", "age", "total_purchases", "days_since_last_purchase"]
selected_features += encoded_df.columns.tolist()
selected_features += customer_purchase_history.columns.tolist()

X = merged_df[selected_features]
y = merged_df["target"]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [12]:
# Split training data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


In [16]:
# Logistic Regression Model
logreg = LogisticRegression(max_iter=500, class_weight="balanced", random_state=42)
logreg.fit(X_train, y_train)

In [17]:
# Make predictions
y_pred_logreg = logreg.predict(X_test)

In [18]:
# Output results
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg):.4f}")
print(classification_report(y_test, y_pred_logreg, target_names=label_encoder.classes_))

Accuracy: 0.5887
                    precision    recall  f1-score   support

       Accessories       0.39      0.72      0.50     10611
 Garment Full body       0.46      0.61      0.52     12299
Garment Lower body       0.64      0.56      0.59     46257
Garment Upper body       0.83      0.53      0.65     99860
         Nightwear       0.24      0.86      0.38      2228
             Shoes       0.28      0.75      0.41      4999
    Socks & Tights       0.35      0.83      0.49      6202
          Swimwear       0.50      0.82      0.62      4544
         Underwear       0.55      0.72      0.62     12793

          accuracy                           0.59    199793
         macro avg       0.47      0.71      0.53    199793
      weighted avg       0.68      0.59      0.60    199793



In [21]:
feature_importance = np.abs(logreg.coef_)[0]  # Take absolute values of coefficients
sorted_indices = np.argsort(feature_importance)[::-1]

print("Feature Importances (Logistic Regression Coefficients):")
for i in sorted_indices:
    print(f"{selected_features[i]}: {feature_importance[i]:.4f}")


Feature Importances (Logistic Regression Coefficients):
Accessories: 1.2513
price: 0.8796
Underwear: 0.3143
Swimwear: 0.2513
Socks & Tights: 0.2474
Shoes: 0.2444
Nightwear: 0.2243
Garment Full body: 0.2018
Garment Lower body: 0.1179
total_purchases: 0.1072
sales_channel_id_2: 0.0877
sales_channel_id_1: 0.0877
age: 0.0494
days_since_last_purchase: 0.0373
Garment Upper body: 0.0364
club_member_status_nan: 0.0149
club_member_status_LEFT CLUB: 0.0143
club_member_status_ACTIVE: 0.0104
fashion_news_frequency_NONE: 0.0086
fashion_news_frequency_Regularly: 0.0085
club_member_status_PRE-CREATE: 0.0044
fashion_news_frequency_Monthly: 0.0038
fashion_news_frequency_nan: 0.0008
