In [None]:
# Colab-only setup: mount Google Drive into the runtime's filesystem so the
# CSV files under /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Article and customer metadata are read eagerly into DataFrames.
articles, customers = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/articles.csv",
        "/content/drive/MyDrive/customers.csv",
    )
)

# The transaction log is opened as a chunked reader instead of a DataFrame:
# each chunk pulled from `transactions` holds up to one million rows.
transactions = pd.read_csv(
    "/content/drive/MyDrive/transactions_train.csv",
    chunksize=1_000_000,
)

In [None]:
# Loading in and merging data.
# NOTE: `transactions` is a chunked reader, so each call to next() consumes a
# new chunk. Re-running this cell without re-creating the reader loads a
# *different* slice of the transaction log (and eventually raises
# StopIteration once the file is exhausted).
transactions_df = next(transactions)

# Left joins keep every transaction row and attach article / customer
# attributes where a matching key exists.
merged_df = transactions_df.merge(articles, on="article_id", how="left")
merged_df = merged_df.merge(customers, on="customer_id", how="left")
print(merged_df.columns)

# Identifying major categories: keep the (up to) 9 most frequent product groups.
category_counts = merged_df["product_group_name"].value_counts()
print(category_counts)
top_categories = category_counts.index[:9]

# .copy() makes the filtered frame an independent object, so adding the
# "target" column below no longer triggers pandas' SettingWithCopyWarning.
merged_df = merged_df[merged_df["product_group_name"].isin(top_categories)].copy()

# Label encoding major categories: "target" is the integer code of the
# product group, in the sorted order stored in label_encoder.classes_.
label_encoder = LabelEncoder()
merged_df["target"] = label_encoder.fit_transform(merged_df["product_group_name"])

# Human-readable mapping from product group name to its integer code.
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(category_mapping)


Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'product_code', 'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
product_group_name
Garment Upper body     499296
Garment Lower body     231284
Underwear               63964
Garment Full body       61497
Accessories             53055
Socks & Tights          31011
Shoes                   24995
Swimwear                2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["target"] = label_encoder.fit_transform(merged_df["product_group_name"])


In [None]:
# Feature engineering: adds per-customer aggregate features and one-hot
# encoded categoricals to merged_df, then lists the model inputs in
# selected_features.
# NOTE: this cell rebinds merged_df; re-running it without first re-running
# the cell that builds merged_df will merge the aggregate columns a second
# time (pandas then adds _x/_y suffixes and the column lookups below fail).

# Total customer purchases feature: number of transaction rows per customer.
customer_total_purchases = merged_df.groupby("customer_id")["article_id"].count().reset_index()
customer_total_purchases = customer_total_purchases.rename(columns={"article_id": "total_purchases"})
merged_df = merged_df.merge(customer_total_purchases, on="customer_id", how="left")

# Days since last purchase feature: gap between each customer's most recent
# transaction date and the latest transaction date present in merged_df.
merged_df["t_dat"] = pd.to_datetime(merged_df["t_dat"])
last_purchase = merged_df.groupby("customer_id")["t_dat"].max().reset_index()
last_purchase["days_since_last_purchase"] = (merged_df["t_dat"].max() - last_purchase["t_dat"]).dt.days
merged_df = merged_df.merge(last_purchase[["customer_id", "days_since_last_purchase"]], on="customer_id", how="left")

# Customer purchases by category: one column per product group holding the
# customer's purchase count in that group (0 where the customer has none).
# NOTE(review): these counts include the transaction in the current row, and
# the "target" column is derived from that row's product_group_name, so the
# features carry information about the label (target leakage). Consider
# leave-one-out counts or counts from strictly earlier transactions.
customer_category_counts = merged_df.groupby(["customer_id", "product_group_name"])["article_id"].count().reset_index()
customer_category_counts = customer_category_counts.rename(columns={"article_id": "purchase_count"})
customer_purchase_history = customer_category_counts.pivot(index="customer_id", columns="product_group_name", values="purchase_count").fillna(0)
merged_df = merged_df.merge(customer_purchase_history, on="customer_id", how="left")

# Fill missing numerical values.
# A single DataFrame.fillna with a per-column dict replaces the chained
# `merged_df[col].fillna(..., inplace=True)` calls, which operate on a
# temporary Series and stop having any effect on merged_df in pandas 3.0.
merged_df = merged_df.fillna({
    "age": merged_df["age"].median(),
    "price": merged_df["price"].median(),
    "total_purchases": 0,
    "days_since_last_purchase": merged_df["days_since_last_purchase"].median(),
})

# One-hot encoding categorical variables.
# The encoded frame is built right before the concat and given merged_df's
# own index, so the column-wise concat is aligned row by row explicitly
# instead of relying on both frames happening to share a 0..n-1 index.
categorical_cols = ["club_member_status", "fashion_news_frequency", "sales_channel_id"]
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = encoder.fit_transform(merged_df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=merged_df.index,
)

# Concatenate encoded categorical features with merged_df
merged_df = pd.concat([merged_df, encoded_df], axis=1)

# Feature selection: numeric features, one-hot columns, per-category counts.
selected_features = ["price", "age", "total_purchases", "days_since_last_purchase"]
selected_features += encoded_df.columns.tolist()
selected_features += customer_purchase_history.columns.tolist()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["age"].fillna(merged_df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price"].fillna(merged_df["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [None]:
# Train and evaluate a KNN classifier on a 10% random sample of merged_df.
sampled_df = merged_df.sample(frac=0.1, random_state=42)  # Use 10% of the data
X = sampled_df[selected_features]
y = sampled_df["target"]

# Splitting data BEFORE scaling, so the held-out rows never influence the
# preprocessing statistics. stratify=y keeps class proportions equal in both
# parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardizing features: mean/std are learned from the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled sample, kept under its previous name in case a later cell
# still references X_scaled.
X_scaled = scaler.transform(X)

# Train KNN model (5 neighbours, unweighted vote, default Minkowski metric).
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski')
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy: {accuracy:.4f}")

# Passing `labels` explicitly keeps target_names matched to the encoded
# classes even if some class does not occur in y_test / y_pred.
report = classification_report(
    y_test,
    y_pred,
    labels=label_encoder.transform(label_encoder.classes_),
    target_names=label_encoder.classes_,
)
print("Classification Report:\n", report)

KNN Model Accuracy: 0.6327
Classification Report:
                     precision    recall  f1-score   support

       Accessories       0.50      0.52      0.51      1066
 Garment Full body       0.47      0.42      0.45      1223
Garment Lower body       0.56      0.53      0.54      4594
Garment Upper body       0.70      0.76      0.73     10012
         Nightwear       0.43      0.32      0.37       221
             Shoes       0.57      0.39      0.46       499
    Socks & Tights       0.58      0.53      0.55       622
          Swimwear       0.66      0.63      0.65       465
         Underwear       0.65      0.52      0.58      1278

          accuracy                           0.63     19980
         macro avg       0.57      0.51      0.54     19980
      weighted avg       0.63      0.63      0.63     19980

