In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

import plotly.express as px

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Read the three raw tables (articles, customers, transactions) from the working directory.
df_article, df_customer, df_transaction = (
    pd.read_csv(csv_name)
    for csv_name in ('articles.csv', 'customers.csv', 'transactions_train.csv')
)

In [None]:
def describe_df(df):
    """Summarise every column of ``df`` in a one-row-per-column overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``feature`` (column
        name), ``type`` (dtype), ``# null`` / ``% null`` (count and percentage
        of missing values), ``# unique`` / ``% unique`` (count and percentage
        of distinct non-null values) and ``sample`` (the first up-to-five
        distinct values, in order of appearance). Percentages are rounded to
        two decimals and are ``0.0`` when ``df`` has no rows.
    """
    n_rows = len(df)

    def _pct(count):
        # Guard against 0/0 on an empty frame, which would otherwise yield NaN.
        return round(count / n_rows * 100, 2) if n_rows else 0.0

    list_item = []
    for col in df.columns:
        series = df[col]
        # Each of these is a full pass over the column, so compute them only once.
        n_null = series.isna().sum()
        n_unique = series.nunique()
        list_item.append([
            col,
            series.dtype,
            n_null,
            _pct(n_null),
            n_unique,
            _pct(n_unique),
            list(series.unique()[:5]),
        ])
    return pd.DataFrame(
        columns=['feature', 'type', '# null', '% null', '# unique', '% unique', 'sample'],
        data=list_item
    )




In [None]:
# Every customer_id must occur exactly once; then profile the customer table.
assert len(df_customer) == df_customer['customer_id'].nunique()
describe_df(df_customer)

In [None]:
# Replacement values for missing customer attributes: 0 for FN, Active and age,
# and the placeholder string "N.C" for the two status/frequency columns.
mapping = {"FN": 0, "Active": 0, "club_member_status": "N.C", "fashion_news_frequency": "N.C", "age": 0}

# Build the cleaned frame as one chain instead of mutating in place, so the cell can be
# re-run: errors="ignore" stops the drop from raising once postal_code is already gone.
df_customer = (
    df_customer
    .fillna(value=mapping)
    .drop(columns="postal_code", errors="ignore")
    # The integer cast is only possible now that the NaNs in these columns are filled.
    .astype({col: np.int8 for col in ["FN", "age", "Active"]})
)

In [None]:
# Re-check that customer_id is still unique, then profile the customer table again.
assert len(df_customer) == df_customer['customer_id'].nunique()
describe_df(df_customer)

In [None]:
# Display the (rows, columns) shape of the customer table.
df_customer.shape

In [None]:
# Convert t_dat to datetime64 so that date comparisons and offsets work on it.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is the
# default there) and never changed the parsed values, so it is omitted.
# Bracket assignment is used because attribute assignment only works for existing columns.
df_transaction['t_dat'] = pd.to_datetime(df_transaction['t_dat'])
describe_df(df_transaction)

In [None]:
# Every article_id must occur exactly once; then profile the article table.
assert len(df_article) == df_article['article_id'].nunique()
describe_df(df_article)

In [None]:
# Remove the detail_desc column from the article table.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
df_article = df_article.drop(columns='detail_desc', errors='ignore')

In [None]:
# Re-check that article_id is still unique, then profile the article table again.
assert len(df_article) == df_article['article_id'].nunique()
describe_df(df_article)

In [None]:
# Keep only the article key and the descriptive attribute columns.
df_art_modified = df_article.loc[:, [
    'article_id',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'index_name',
    'section_name',
    'garment_group_name',
]]

In [None]:
# Preview the first rows of the reduced article table.
df_art_modified.head()

In [None]:
# Ensure t_dat is datetime64 before profiling the transactions; the conversion
# leaves a column that is already datetime64 unchanged, so re-running is harmless.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is the
# default there) and never changed the parsed values, so it is omitted.
df_transaction['t_dat'] = pd.to_datetime(df_transaction['t_dat'])
describe_df(df_transaction)

### Most popular items in the last 3 months

In [None]:
# Attach the article attributes to every transaction; the inner join keeps only
# transactions whose article_id is present in df_art_modified.
df = pd.merge(df_transaction, df_art_modified, on='article_id', how='inner')
df.head()

In [None]:
# List the columns of the merged transaction/article frame.
df.columns

In [None]:
# Latest transaction date in the merged frame (displayed as the cell output).
max_date = df['t_dat'].max()
max_date

In [None]:
# Keep the transactions dated on or after (latest date - 3 calendar months).
window_start = max_date - pd.DateOffset(months=3)
last_3_months = df.loc[df['t_dat'] >= window_start]
last_3_months.head()

In [None]:
# Count purchases per article in the 3-month window and keep the 10 most bought.
# rename_axis + reset_index(name=...) pins the output columns to
# ('article_id', 'count') on every pandas version. A bare reset_index() yields
# ('index', 'article_id') on pandas < 2.0, i.e. the counts end up in the column
# named 'article_id' and any later merge on that column joins on the wrong values.
top_articles = (
    last_3_months['article_id']
    .value_counts()
    .head(10)
    .rename_axis('article_id')
    .reset_index(name='count')
)

In [None]:
# Display the top-articles table.
top_articles

In [None]:
# One attribute row per article: the first row seen for each article_id in the merged frame.
article_metadata = (
    df
    .drop_duplicates(subset='article_id')
    .loc[:, [
        'article_id',
        'product_type_name',
        'product_group_name',
        'graphical_appearance_name',
        'colour_group_name',
        'index_name',
        'section_name',
        'garment_group_name',
    ]]
)

In [None]:
# Enrich the top articles with their attributes (left join keeps every top article).
top_articles = pd.merge(top_articles, article_metadata, how='left', on='article_id')

top_articles.head()

### Top 5 articles in each metadata category in the last 3 months

In [None]:
# Article attributes by which popularity is broken down.
metadata_columns = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'index_name',
    'section_name',
    'garment_group_name',
]

# Number of purchases of each article within the 3-month window.
article_counts = (
    last_3_months
    .groupby('article_id')
    .size()
    .reset_index(name='purchase_count')
)

# One attribute row per article, taken from the merged transaction frame.
article_metadata = df.drop_duplicates('article_id')[['article_id', *metadata_columns]]

# Purchase counts enriched with the article attributes.
article_popularity = pd.merge(article_counts, article_metadata, on='article_id', how='left')

In [None]:
# Top 5 most purchased articles within each value of every metadata column.
# Result: {metadata column name -> DataFrame with columns [that column, 'article_id']},
# ordered by metadata value and, within a value, by descending purchase count.
top_5_by_metadata = {}

for col in metadata_columns:
    # Sort once and take the first 5 rows of every group rather than using
    # groupby(col).apply(...): passing the grouping column into apply() is deprecated
    # since pandas 2.2, after which selecting `col` from the apply() result fails.
    # This form also avoids one Python-level call per group.
    top_items = (
        article_popularity
        .dropna(subset=[col])  # groupby() skips rows with a missing key; keep that explicit
        .sort_values([col, 'purchase_count'], ascending=[True, False])
        .groupby(col, sort=False)
        .head(5)
        .reset_index(drop=True)[[col, 'article_id']]
    )
    top_5_by_metadata[col] = top_items

# Example output: top 5 articles per 'product_type_name'
print("Top 5 by product_type_name:")
print(top_5_by_metadata['product_type_name'])



In [None]:
# Print a short text summary of the top-5 table of every metadata column.
for col in metadata_columns:
    df_top = top_5_by_metadata[col]

    print(f"\n🔹 Insights for {col.replace('_', ' ').title()}:")
    print(f"  Number of unique article_ids in top 5 per group: {df_top['article_id'].nunique()}")

    # Rows per metadata value = how many articles that value contributes to its own
    # top-5 list (at most 5). It is a count of articles, not of groups, so the
    # message says so.
    top_values = df_top[col].value_counts().head(3)
    for val, count in top_values.items():
        print(f"   - '{val}' has {count} article(s) in its top 5 list")

    # How often each article occurs in this table. article_popularity holds one row
    # per article_id, so every frequency here is expected to be 1.
    most_frequent_articles = df_top['article_id'].value_counts().head(3)
    for art_id, freq in most_frequent_articles.items():
        print(f"   - Article ID {art_id} appeared in top 5 of {freq} different {col} groups")


In [None]:
# Categories to analyze
selected_categories = ['product_type_name', 'product_group_name', 'section_name', 'garment_group_name']

# Attach the colour group of every top-5 article once per category. The same frames
# feed both the shared palette and the per-category plots, so the merge is not repeated.
colour_by_category = {
    cat: top_5_by_metadata[cat].merge(
        article_metadata[['article_id', 'colour_group_name']],
        on='article_id', how='left'
    )
    for cat in selected_categories
}

# Stack all categories in a single concat call (growing a DataFrame with concat
# inside a loop re-copies the accumulated rows on every iteration).
all_colour_data = pd.concat(
    [frame.assign(category=cat) for cat, frame in colour_by_category.items()],
    ignore_index=True
)

# Get unique colour groups (sorted alphabetically for consistent color mapping)
unique_colours = sorted(all_colour_data['colour_group_name'].dropna().unique())

# Fixed colour-group -> colour mapping shared by all plots.
# Set2 has only 8 distinct colours and repeats beyond that, which would make different
# colour groups indistinguishable; switch to evenly spaced husl hues in that case.
palette_name = "Set2" if len(unique_colours) <= 8 else "husl"
palette = sns.color_palette(palette_name, n_colors=len(unique_colours))
colour_mapping = dict(zip(unique_colours, palette))

# Set seaborn style
sns.set(style="whitegrid")

# One count plot per category: bars are the colour groups of the top-5 articles.
for cat in selected_categories:
    df_plot = colour_by_category[cat]

    # Restrict the x axis to the 5 category values with the most rows.
    top5_vals = df_plot[cat].value_counts().nlargest(5).index
    df_plot = df_plot[df_plot[cat].isin(top5_vals)]

    # Plot
    plt.figure(figsize=(10, 6))
    sns.countplot(
        data=df_plot,
        x=cat,
        hue='colour_group_name',
        palette=colour_mapping,
        order=top5_vals
    )

    plt.title(f"🎨 Colour Popularity in Top 5 {cat.replace('_', ' ').title()}s")
    plt.xlabel(cat.replace('_', ' ').title())
    plt.ylabel("Count (Top Articles)")
    plt.xticks(rotation=30, ha='right')
    # Legend is placed outside the axes so it does not cover the bars.
    plt.legend(title='Colour Group', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()



In [None]:
# Attach customer attributes to the recent transactions (inner join on customer_id).
# The merge is skipped when the customer columns are already present, so re-running
# this cell does not create suffixed duplicates (e.g. age_x / age_y) that would break
# later cells expecting a plain 'age' column.
customer_cols = df_customer.columns.difference(['customer_id'])
if not customer_cols.isin(last_3_months.columns).all():
    last_3_months = last_3_months.merge(df_customer, on='customer_id', how='inner')

In [None]:
# Preview the recent transactions after the customer merge.
last_3_months.head()

In [None]:
# List the columns of the recent-transactions frame.
last_3_months.columns

In [None]:
# Age buckets: left-closed intervals [0, 10), [10, 20), ... and [50, 200) labelled '50+'.
age_bins = [0, 10, 20, 30, 40, 50, 200]
age_labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50+']

# Missing ages were filled with 0 during customer cleaning. Mask that sentinel so
# customers of unknown age are not counted in the '0-10' bucket; pd.cut maps NaN to
# NaN and groupby then leaves those rows out.
known_age = last_3_months['age'].where(last_3_months['age'] > 0)
last_3_months['age_group'] = pd.cut(known_age, bins=age_bins, labels=age_labels, right=False)

# Count article purchases per age group.
# observed=True restricts the result to (age group, article) pairs that actually
# occur; the categorical default would emit a zero-count row for every category x
# article combination, so empty buckets would get a "top 5" of never-bought articles.
article_counts = (
    last_3_months.groupby(['age_group', 'article_id'], observed=True)
    .size()
    .reset_index(name='purchase_count')
)

# Top 5 articles in each age group: sort by bucket, then by descending count, and
# keep the first 5 rows of every bucket.
top5_per_age_group = (
    article_counts.sort_values(['age_group', 'purchase_count'], ascending=[True, False])
    .groupby('age_group', observed=True)
    .head(5)
    .reset_index(drop=True)
)

print(top5_per_age_group)

In [None]:
# Add product type and colour to each top article, using one row per article_id
# from the recent transactions as the lookup table.
top5_with_meta = pd.merge(
    top5_per_age_group,
    last_3_months.drop_duplicates('article_id').loc[
        :, ['article_id', 'product_type_name', 'colour_group_name']
    ],
    how='left',
    on='article_id',
)

In [None]:
# Preview the per-age-group top articles with their product type and colour.
top5_with_meta.head()

In [None]:
# Use seaborn's whitegrid theme for the figures below.
sns.set(style="whitegrid")

# One horizontal bar chart per age group present in the top-5 table.
age_groups = top5_with_meta['age_group'].unique()

for age in age_groups:
    # Rows of the top-5 table that belong to this age group.
    df_age = top5_with_meta.loc[top5_with_meta['age_group'] == age]

    # Total purchases per product type, largest first, capped at five types.
    purchases_by_type = df_age.groupby('product_type_name')['purchase_count'].sum()
    top_types = purchases_by_type.sort_values(ascending=False).iloc[:5].reset_index()

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(
        data=top_types,
        x='purchase_count',
        y='product_type_name',
        palette='coolwarm',
        ax=ax,
    )

    ax.set_title(f"Top Product Types in Age Group {age}")
    ax.set_xlabel("Total Purchases")
    ax.set_ylabel("Product Type")
    fig.tight_layout()
    plt.show()


In [None]:
# Use seaborn's whitegrid theme for the figures below.
sns.set(style="whitegrid")

# One horizontal bar chart per age group present in the top-5 table.
age_groups = top5_with_meta['age_group'].unique()

for age in age_groups:
    # Rows of the top-5 table that belong to this age group.
    df_age = top5_with_meta.loc[top5_with_meta['age_group'] == age]

    # Total purchases per colour group, largest first, capped at five colours.
    purchases_by_colour = df_age.groupby('colour_group_name')['purchase_count'].sum()
    top_colours = purchases_by_colour.sort_values(ascending=False).iloc[:5].reset_index()

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(
        data=top_colours,
        x='purchase_count',
        y='colour_group_name',
        palette='Set2',  # generic palette; bar colours do not correspond to the colour names
        ax=ax,
    )

    ax.set_title(f"Top Colours in Age Group {age}")
    ax.set_xlabel("Total Purchases")
    ax.set_ylabel("Colour Group")
    fig.tight_layout()
    plt.show()


In [None]:
# Function to assign season based on month
def get_season(date):
    """Return the season name for ``date``.

    ``date`` is any object exposing a ``month`` attribute. Months 3-5 map to
    'Spring', 6-8 to 'Summer', 9-11 to 'Autumn', and every other month value
    (12, 1, 2) to 'Winter'.
    """
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
    }
    # Anything not listed above falls through to Winter.
    return season_by_month.get(date.month, 'Winter')

# Label every transaction with its season.
# get_season is evaluated once per calendar month and the resulting lookup is mapped
# over the month numbers, instead of calling the Python function once per row.
# Requires t_dat to be a datetime64 column (for the .dt accessor).
month_to_season = {
    month: get_season(pd.Timestamp(year=2000, month=month, day=1))
    for month in range(1, 13)
}
df['season'] = df['t_dat'].dt.month.map(month_to_season)

In [None]:
# List the columns of the merged frame.
df.columns

In [None]:
# Number of purchases (rows) for every (season, product type) pair.
season_product_counts = (
    df
    .groupby(['season', 'product_type_name'])
    .size()
    .reset_index(name='purchase_count')
)

# Best-selling product type of each season: order by season, then by descending
# count, and keep the first row of every season.
ranked_by_season = season_product_counts.sort_values(
    by=['season', 'purchase_count'], ascending=[True, False]
)
top_product_per_season = ranked_by_season.groupby('season').head(1).reset_index(drop=True)

print(top_product_per_season)


In [None]:
# Number of purchases (rows) for every (season, product type) pair.
season_product_counts = (
    df
    .groupby(['season', 'product_type_name'])
    .size()
    .reset_index(name='purchase_count')
)

# Five best-selling product types of each season: order by season, then by
# descending count, and keep the first five rows of every season.
ranked_by_season = season_product_counts.sort_values(
    by=['season', 'purchase_count'], ascending=[True, False]
)
top5_products_per_season = ranked_by_season.groupby('season').head(5).reset_index(drop=True)

# One facet per season, two facets per row; each facet gets its own x scale.
g = sns.catplot(
    data=top5_products_per_season,
    x="purchase_count",
    y="product_type_name",
    col="season",
    col_wrap=2,
    kind="bar",
    palette="Set2",
    height=4,
    sharex=False,
)

# Leave headroom above the facets for the figure-level title.
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Top 5 Product Types per Season")
g.set_axis_labels("Purchases", "Product Type")
plt.show()


In [None]:
# Order the transactions chronologically.
df = df.sort_values(by='t_dat')

# Time-based split: the 0.8 quantile of t_dat is the cutoff. Rows dated on or before
# it form the training set, strictly later rows the test set.
cutoff_date = df['t_dat'].quantile(0.8)
train_df = df.loc[df['t_dat'].le(cutoff_date)]
test_df = df.loc[df['t_dat'].gt(cutoff_date)]

In [None]:
# Rank the articles by the number of training-set purchases, most bought first.
article_popularity = train_df.groupby('article_id').size().reset_index(name='purchase_count')
article_popularity = article_popularity.sort_values('purchase_count', ascending=False)

# Length of the recommendation list and the article ids that fill it.
top_n = 10
top_articles = article_popularity['article_id'].iloc[:top_n].tolist()


In [None]:
# Unique test users
test_users = test_df['customer_id'].unique()

# Every test user receives the same popularity-based list.
# Each user is repeated len(top_articles) times rather than top_n times, so both
# columns have equal length even when the training data contains fewer than top_n
# distinct articles (the DataFrame constructor would otherwise raise ValueError).
n_recs_per_user = len(top_articles)
recommendations = pd.DataFrame({
    'customer_id': test_users.repeat(n_recs_per_user),
    'predicted_article_id': list(top_articles) * len(test_users)
})


In [None]:
# Display the recommendation table (pandas truncates long frames in the output).
recommendations

In [None]:
# Actual purchases in the test set, one row per distinct (customer, article) pair.
actual_purchases = test_df[['customer_id', 'article_id']].drop_duplicates()

# A recommendation is a hit when that customer bought that article in the test period.
hits = pd.merge(
    recommendations, actual_purchases,
    left_on=['customer_id', 'predicted_article_id'],
    right_on=['customer_id', 'article_id'],
    how='inner'
)

# Precision@k averaged over ALL test users = total hits / total recommendations made.
# Averaging the per-customer hit counts of `hits` alone would leave out every user
# with zero hits (they have no row there), overstating the score, and would give NaN
# when there are no hits at all.
n_recommendations = len(recommendations)
precision = len(hits) / n_recommendations if n_recommendations else 0.0
print(f"Precision@{top_n}: {precision:.4f}")
