<a href="https://colab.research.google.com/github/Swetaa23/546_project/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive in this Colab runtime; the CSV files read in the next
# cell live under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Folder on the mounted Drive that holds the three input CSV files.
DATA_DIR = Path("/content/drive/My Drive/MGTE/4B/546")
# Only this many leading rows of the transactions file are loaded.
N_TRANSACTION_ROWS = 1_000_000
# Number of most frequent product groups kept as prediction classes.
N_TOP_CATEGORIES = 9

customers_df = pd.read_csv(DATA_DIR / "customers.csv")
articles_df = pd.read_csv(DATA_DIR / "articles.csv")
# nrows loads the same leading rows as taking the first chunk of a chunked
# reader, but does not leave an open file reader lying around in the kernel.
transactions_df = pd.read_csv(DATA_DIR / "Transactions Train.csv", nrows=N_TRANSACTION_ROWS)

# Attach article and customer attributes to every transaction row.
merged_df = (
    transactions_df
    .merge(articles_df, on="article_id", how="left")
    .merge(customers_df, on="customer_id", how="left")
)
print(merged_df.columns)

# Identify the major categories (computed once, reused for display and filtering).
category_counts = merged_df["product_group_name"].value_counts()
print(category_counts)
top_categories = category_counts.index[:N_TOP_CATEGORIES]

# Keep only rows in the major categories. The explicit copy makes the result an
# independent frame, and the index reset gives it contiguous row labels so
# later index-based operations (e.g. axis=1 concat) line up row by row.
merged_df = (
    merged_df[merged_df["product_group_name"].isin(top_categories)]
    .copy()
    .reset_index(drop=True)
)

# Label-encode the major categories into the integer target column.
label_encoder = LabelEncoder()
merged_df["target"] = label_encoder.fit_transform(merged_df["product_group_name"])

# LabelEncoder assigns each class its position in classes_, so enumerate gives
# the same name -> code mapping, with plain Python ints for a clean printout.
category_mapping = {name: code for code, name in enumerate(label_encoder.classes_)}
print(category_mapping)

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
product_group_name
Garment Upper body     499296
Garment Lower body     231284
Underwear               63964
Garment Full body       61497
Accessories             53055
Socks & Tights          31011
Shoes                   24995
Swimwear                2

In [None]:
# Data processing and feature engineering.
#
# Adds per-customer aggregate features and one-hot encoded categoricals to
# merged_df. Names used by later cells (encoded_df, customer_purchase_history)
# are kept.
#
# NOTE(review): the aggregates below are computed over all rows, including the
# row being predicted. In particular the per-category purchase counts include
# the row's own product_group_name, which is exactly what "target" encodes, so
# these features leak the label. Consider computing them from strictly earlier
# transactions (or leave-one-out) before trusting the reported accuracy.

from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["club_member_status", "fashion_news_frequency", "sales_channel_id"]

# Total customer purchases feature: number of transaction rows per customer.
customer_total_purchases = (
    merged_df.groupby("customer_id")["article_id"]
    .count()
    .reset_index()
    .rename(columns={"article_id": "total_purchases"})
)
merged_df = merged_df.merge(customer_total_purchases, on="customer_id", how="left")

# Days since last purchase feature, measured against the latest date in the data.
merged_df["t_dat"] = pd.to_datetime(merged_df["t_dat"])
last_purchase = merged_df.groupby("customer_id")["t_dat"].max().reset_index()
last_purchase["days_since_last_purchase"] = (merged_df["t_dat"].max() - last_purchase["t_dat"]).dt.days
merged_df = merged_df.merge(
    last_purchase[["customer_id", "days_since_last_purchase"]],
    on="customer_id",
    how="left",
)

# Customer purchases by category: one column per product group, 0 when the
# customer never bought from that group.
customer_category_counts = (
    merged_df.groupby(["customer_id", "product_group_name"])["article_id"]
    .count()
    .reset_index()
    .rename(columns={"article_id": "purchase_count"})
)
customer_purchase_history = customer_category_counts.pivot(
    index="customer_id", columns="product_group_name", values="purchase_count"
).fillna(0)
merged_df = merged_df.merge(customer_purchase_history, on="customer_id", how="left")

# Fill missing numerical values. A single non-inplace fillna replaces the
# chained `df[col].fillna(..., inplace=True)` calls, which raise a
# FutureWarning and stop modifying the frame in pandas 3.0.
merged_df = merged_df.fillna(
    {
        "age": merged_df["age"].median(),
        "price": merged_df["price"].median(),
        "total_purchases": 0,
        "days_since_last_purchase": merged_df["days_since_last_purchase"].median(),
    }
)

# One-hot encode the categorical columns. This is done after the merges and
# with merged_df's own index so the axis=1 concat below aligns row-for-row by
# construction instead of relying on both frames happening to share a RangeIndex.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = encoder.fit_transform(merged_df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=merged_df.index,
)

# Concatenate encoded categorical features with merged_df
merged_df = pd.concat([merged_df, encoded_df], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [None]:
# Finalizing training data: pick the model inputs and standardize them.
from sklearn.preprocessing import StandardScaler

# Numeric base features, then the one-hot columns, then the per-category
# purchase-count columns.
selected_features = [
    "price",
    "age",
    "total_purchases",
    "days_since_last_purchase",
    *encoded_df.columns,
    *customer_purchase_history.columns,
]

X = merged_df.loc[:, selected_features]
y = merged_df.loc[:, "target"]

# NOTE(review): the scaler is fit on all rows, before the train/test split in
# the next cell, so test-set statistics influence the scaling. Fit on the
# training portion only if the scaled values ever feed a scale-sensitive model.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Convert training and test sets into DMatrix, XGBoost's native data container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters for multi-class classification. 'multi:softprob' makes predict()
# return one probability per class for every row.
num_classes = y.nunique()
params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'eta': 0.1,
    'max_depth': 6,
    'eval_metric': 'mlogloss',
    'seed': 42
}

num_rounds = 100
# Early stopping monitors the LAST entry of the watchlist, i.e. 'eval'.
# NOTE(review): 'eval' is the test set, so the stopping round is chosen using
# the same data the accuracy is later reported on; a separate validation split
# would give an unbiased test estimate.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=10,
)

# xgb.train returns the model from the final round, and the native
# Booster.predict uses every tree by default. Restrict prediction to the trees
# up to the best early-stopping round so that stopping actually takes effect.
y_pred_proba = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
# Predicted class = index of the highest class probability in each row.
y_pred = np.argmax(y_pred_proba, axis=1)




[0]	train-mlogloss:1.96587	eval-mlogloss:1.96608
[1]	train-mlogloss:1.79765	eval-mlogloss:1.79805
[2]	train-mlogloss:1.66408	eval-mlogloss:1.66456
[3]	train-mlogloss:1.55414	eval-mlogloss:1.55470
[4]	train-mlogloss:1.46124	eval-mlogloss:1.46188
[5]	train-mlogloss:1.38122	eval-mlogloss:1.38191
[6]	train-mlogloss:1.31175	eval-mlogloss:1.31255
[7]	train-mlogloss:1.25046	eval-mlogloss:1.25134
[8]	train-mlogloss:1.19619	eval-mlogloss:1.19712
[9]	train-mlogloss:1.14782	eval-mlogloss:1.14879
[10]	train-mlogloss:1.10463	eval-mlogloss:1.10571
[11]	train-mlogloss:1.06559	eval-mlogloss:1.06676
[12]	train-mlogloss:1.03050	eval-mlogloss:1.03174
[13]	train-mlogloss:0.99884	eval-mlogloss:1.00013
[14]	train-mlogloss:0.97012	eval-mlogloss:0.97146
[15]	train-mlogloss:0.94399	eval-mlogloss:0.94538
[16]	train-mlogloss:0.92018	eval-mlogloss:0.92163
[17]	train-mlogloss:0.89842	eval-mlogloss:0.89992
[18]	train-mlogloss:0.87850	eval-mlogloss:0.88003
[19]	train-mlogloss:0.86027	eval-mlogloss:0.86188
[20]	train

In [17]:
# Score the held-out predictions: overall accuracy plus per-class
# precision / recall / F1 (classes appear as their integer target codes).
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Test Accuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(report)

Test Accuracy: 72.05%

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.55      0.62     10611
           1       0.65      0.49      0.56     12299
           2       0.68      0.62      0.65     46257
           3       0.75      0.84      0.79     99860
           4       0.64      0.36      0.46      2228
           5       0.67      0.48      0.56      4999
           6       0.66      0.70      0.68      6202
           7       0.72      0.74      0.73      4544
           8       0.73      0.67      0.70     12793

    accuracy                           0.72    199793
   macro avg       0.69      0.60      0.64    199793
weighted avg       0.72      0.72      0.71    199793

