## BASELINE MODELLING

In [2]:
!pip install tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
from tqdm import tqdm
from collections import Counter
tqdm.pandas()

/kaggle/input/santander-recsys/sample_submission.csv
/kaggle/input/santander-recsys/first_acquisition_df.csv
/kaggle/input/santander-recsys/test_ver2.csv
/kaggle/input/santander-recsys/train_ver2.csv


## Load the datasets

In [3]:
# Locations of the competition CSV files read by the loading cells below.
test_path = "/kaggle/input/santander-recsys/test_ver2.csv"
train_path = "/kaggle/input/santander-recsys/train_ver2.csv"
sample_path = "/kaggle/input/santander-recsys/sample_submission.csv"

In [5]:
# Read the full training history into memory; every baseline below derives its frames from `train_data`.
train_data = pd.read_csv(train_path)
print(f"Training dataset loaded. Shape : {train_data.shape}")

Training dataset loaded. Shape : (13647309, 48)


In [None]:
# Read the test file. NOTE(review): `test_data` is not referenced again in the visible cells.
test_data = pd.read_csv(test_path)
print(f"Testing dataset loaded. Shape : {test_data.shape}")

In [6]:
# Read the sample submission; its `ncodpers` column drives the test-set recommendations
# and its `added_products` column is overwritten with each baseline's predictions.
sample_data = pd.read_csv(sample_path)
print(f"Sample dataset loaded. Shape : {sample_data.shape}")

Sample dataset loaded. Shape : (929615, 2)


In [7]:
def get_all_products(df=None):
    """Return the names of the product-ownership columns.

    A column is treated as a product column when its name starts with
    ``ind_`` and ends with ``_ult1``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame whose columns are inspected. When omitted, the notebook-level
        ``train_data`` frame is used, so zero-argument calls keep working.

    Returns
    -------
    list of str
        Matching column names, in the frame's column order.
    """
    if df is None:
        df = train_data
    # The original had a print placed after the return, so it never ran;
    # callers print the product count themselves.
    return [col for col in df.columns if col.startswith('ind_') and col.endswith('_ult1')]

## Evaluation function - MAP@7

In [None]:
def apk(actual, predicted, k=7):
    """Average precision at ``k`` for one customer.

    Parameters
    ----------
    actual : sequence
        Relevant items for the customer.
    predicted : sequence
        Ranked recommendations; only the first ``k`` are scored.
    k : int, default 7
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision values at each rank holding a first-time hit,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated recommendation only counts the first time it appears.
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual_list, predicted_list, k=7):
    """Mean of ``apk`` over paired ground-truth and prediction lists.

    The two lists are paired positionally with ``zip``, so entry ``i`` of
    each must refer to the same customer.
    """
    per_customer_scores = []
    for truth, recommendations in zip(actual_list, predicted_list):
        per_customer_scores.append(apk(truth, recommendations, k))
    return np.mean(per_customer_scores)

## Baseline 1 - Popularity based recommendation

- Recommending the most popular products (7) to the users

### Splitting dataset into train and eval


1. The cutoff date (`training_cutoff`) is the second-to-last snapshot date in the data (2016-04-28)
2. Rows dated on or before `training_cutoff` form the training data; rows dated after it form the eval data

In [None]:
# Snapshot dates in ascending order; the second-to-last one closes the training window.
snapshot_dates = train_data['fecha_dato'].sort_values().unique()
training_cutoff = snapshot_dates[-2]

# Rows on or before the cutoff train the model; later rows are held out for evaluation.
is_train_row = train_data['fecha_dato'] <= training_cutoff
is_eval_row = train_data['fecha_dato'] > training_cutoff
train_df = train_data[is_train_row]
eval_df = train_data[is_eval_row]

print(f"Cutoff date : {training_cutoff}")
print(f"Train data shape : {train_df.shape}")
print(f"Eval data shape : {eval_df.shape}")

### Finding most popular products

1. For every customer, flag each product they have ever owned (1 if owned at least once, else 0)
2. Sum these flags over all customers for each product; the totals rank the products by popularity

In [None]:
product_cols = get_all_products()
print(f"Total products : {len(product_cols)}\n")

# Keep only the customer id and the product flags.
columns_to_use = ['ncodpers', *product_cols]
train_df_subset = train_df[columns_to_use]
print(f"Training data shape after selecting product columns and customer id : {train_df_subset.shape}\n")

# Step 1: per customer, the max over the history is 1 when the product was ever held.
ever_owned_products = train_df_subset.groupby('ncodpers')[product_cols].max()

# Step 2: column totals count the customers who ever held each product.
owner_counts = ever_owned_products.sum(axis=0)
product_popularity = owner_counts.sort_values(ascending=False)

top_7_products = product_popularity.nlargest(7).index.tolist()
print(f"Top 7 Product: \n{', '.join(top_7_products)}")

### `get_owned_and_popular_products()`

- This function returns the per-customer table of products owned at least once (`ever_owned_products`) and the list of all products ranked from most to least popular (`top_products`)

In [None]:
def get_owned_and_popular_products():
    """Compute ever-owned flags and the popularity ranking from ``train_data``.

    Reads the notebook-level ``train_data`` and ``product_cols``.

    Returns
    -------
    tuple
        ``(ever_owned_products, top_products)``: a frame indexed by
        ``ncodpers`` holding the per-product maximum over each customer's
        rows, and the product names sorted by descending column total.
    """
    ownership = train_data[['ncodpers'] + product_cols]
    ever_owned_products = ownership.groupby('ncodpers')[product_cols].max()
    owner_counts = ever_owned_products.sum(axis=0)
    ranked = owner_counts.sort_values(ascending=False)
    return ever_owned_products, ranked.index.tolist()

### `recommend_for_user(user_id, top_n=7)`

- Takes a customer id as input and return the top products never owned by the customer as the recommendation (default `top_n` = 7)

In [None]:
def recommend_for_user(user_id, top_n=7):
    """Return the most popular products the customer has never owned.

    Reads the notebook-level ``user_owned`` (customer id -> set of owned
    products) and ``top_products`` (products in popularity order).

    Parameters
    ----------
    user_id : hashable
        Customer id to look up in ``user_owned``.
    top_n : int, default 7
        Maximum number of products to recommend.

    Returns
    -------
    str
        Space-separated product names.
    """
    # A customer with no recorded history owns nothing, so they receive the
    # global top products instead of raising KeyError.
    owned_products = user_owned.get(user_id, ())
    recs = [p for p in top_products if p not in owned_products][:top_n]
    return ' '.join(recs)

### Building a dictionary of products each user has ever owned

- Loops over all users in `ever_owned_products` and creates a dictionary where each `uid` maps to a **set of products** they have ever owned (i.e., products marked with `1`).

In [None]:
# Use the ranking returned by the helper as-is. The original immediately
# overwrote it with `product_popularity`, a leftover from an earlier cell.
ever_owned_products, top_products = get_owned_and_popular_products()

# Build the customer -> owned-product-set mapping from one boolean matrix
# instead of two `.loc` lookups per customer.
_owned_flags = ever_owned_products.to_numpy() == 1
_product_names = ever_owned_products.columns
user_owned = {
    uid: set(_product_names[flags])
    for uid, flags in zip(ever_owned_products.index, _owned_flags)
}

### Baseline predictions on evaluation set

- Blindly predicting the top 7 recommendation from most popular products

In [None]:
# Distinct customers present in the held-out rows, in order of first appearance.
eval_users = eval_df['ncodpers'].unique()
print(f"total users in eval set: {len(eval_users)}")

# Every customer receives the same global top-7 list.
baseline_eval_preds = pd.DataFrame({'ncodpers': eval_users})
baseline_eval_preds['added_products'] = [top_7_products for _ in range(len(eval_users))]

### Eval Ground Truth

* Constructing the ground truth product ownership for evaluation users.
* `eval_truth` is a list of lists, where each inner list contains the products owned by a customer as per `eval_df`.

In [None]:
# Ground truth from eval_df.
# `mapk` pairs truth and predictions by position, so the truth rows are
# ordered by the `ncodpers` column of `baseline_eval_preds`. Iterating a
# groupby yields customers in sorted order, which need not match that column.
# NOTE(review): this records products *held* in the eval rows, not products
# newly added since the previous month - confirm that is the intended target.
_first_rows = (
    eval_df[['ncodpers'] + product_cols]
    .drop_duplicates(subset='ncodpers', keep='first')  # first row per customer, as before
    .set_index('ncodpers')
)
_truth_flags = _first_rows.loc[baseline_eval_preds['ncodpers'].to_numpy(), product_cols].to_numpy() == 1
eval_truth = [
    [prod for prod, held in zip(product_cols, flags) if held]
    for flags in _truth_flags
]

### Computing MAP@7 for baseline evaluation data

In [None]:
# Score the constant top-7 predictions against the eval ground truth (lists are paired by position).
map_score_eval = mapk(eval_truth, baseline_eval_preds['added_products'].tolist())
print(f"Baseline 1 MAP@7 (Eval data): {map_score_eval:.4f}")

### Test Set Recommendations
- For each user in the test set, generating recommendation using `recommend_for_user()`

In [None]:
# Fill the submission column by calling `recommend_for_user` on every customer id (default top_n).
sample_data['added_products'] = sample_data['ncodpers'].map(recommend_for_user)

### Error Analysis

- We have evaluation ground truth data `eval_truth` and evaluation predictions `eval_preds`
- Using these data, we will compute the **False Positives** and **False Negatives**

In [None]:
fp_counter, fn_counter = Counter(), Counter()

eval_preds = baseline_eval_preds['added_products'].tolist()

for actual_products, predicted_products in zip(eval_truth, eval_preds):
    actual_set, predicted_set = set(actual_products), set(predicted_products)

    # False negatives: products in the ground truth that were not recommended.
    fn_counter.update(actual_set - predicted_set)

    # False positives: recommended products absent from the ground truth.
    fp_counter.update(predicted_set - actual_set)

### Count of Top 10 False Negatives

In [None]:
# Ten products most often present in the ground truth but missing from the recommendations.
print("Top 10 false negatives (bought but not predicted):")
for product, count in fn_counter.most_common(10):
    print(f"{product} : {count}")

### Count of Top 7 False Positives

In [None]:
# Baseline 1 recommends the same 7 products to everyone, so the counter holds
# at most 7 keys and `most_common()` without a limit lists them all.
print("Top 7 false positives (predicted but not bought):")
for product, count in fp_counter.most_common():
    print(f"{product} : {count}")

In [None]:
# Write the Baseline 1 submission file (no index column).
sample_data.to_csv("Baseline1_test_preds.csv", index=False)

## Baseline 2 - Popularity Based on Segments

* Customer attributes such as `age`, `nomprov`, `segmento`, and `renta` are binned into discrete buckets.
* These buckets define customer segments, and recommendations are made by suggesting the most popular products within the customer’s segment.

### Basic data preprocessing steps:
1. Replace ' NA' in `age` with np.nan
2. Convert `age` to numeric type
3. Find the customer ids of rows where all 3 fields - `renta`, `nomprov` and `age` - are NaN
4. Drop the rows where all 3 fields are NaN

In [None]:
# Generic preprocessing steps:

# 1. Replace ' NA' in age with np.nan
train_data['age'] = train_data['age'].replace(' NA', np.nan)

# 2. Convert age column to numeric type (anything unparseable becomes NaN)
train_data['age'] = pd.to_numeric(train_data['age'], errors='coerce')

# 3. Flag rows with all 3 key fields missing (renta, nomprov, age).
#    The mask is computed once and reused so the selection and the drop cannot drift apart.
_all_missing = train_data['renta'].isna() & train_data['nomprov'].isna() & train_data['age'].isna()
ids_with_nan = train_data.loc[_all_missing, 'ncodpers']
print(f"Total customers with no age, renta and nomprov record : {len(set(ids_with_nan))}")

# 4. Drop those rows. `.copy()` makes df_train an independent frame, so the
#    bucket columns added later do not write into a slice of train_data.
df_train = train_data[~_all_missing].copy()
print(f"Shape of data after removing customers with no age, nomprov and renta : {df_train.shape}")


# Sanity check - rows kept must equal total rows minus flagged rows
# (`ids_with_nan` holds one entry per flagged row).
assert df_train.shape[0] == (train_data.shape[0] - len(ids_with_nan))

### Calculating 33rd and 67th percentile of renta for bucketing

In [None]:
# Tercile boundaries of income.
# NOTE(review): `income_bucket` uses the literals 80000 and 133700 rather than these values.
q_33, q_67 = (df_train['renta'].quantile(level) for level in (0.33, 0.67))
print(f"33rd percentile of renta : {q_33}")
print(f"67th percentile of renta : {q_67}")

### Functions to bucketize the values

In [None]:
# Function to bucketize age into ranges
def age_bucket(age):
    """Map an age to one of "0-17", "18-29", "30-44", "45-59", "60+", or "UNK" when missing."""
    if pd.isna(age):
        return "UNK"
    # Each pair is (exclusive upper bound, label); the first bound above the age wins.
    for upper_bound, label in ((18, "0-17"), (30, "18-29"), (45, "30-44"), (60, "45-59")):
        if age < upper_bound:
            return label
    return "60+"

# Function to bucketize income into categories
def income_bucket(income, low=80000, high=133700):
    """Map an income to "L", "M" or "H", or "UNK" when missing.

    Parameters
    ----------
    income : number
        Income value (NaN/None allowed).
    low : number, default 80000
        Inclusive upper bound of the low bucket.
    high : number, default 133700
        Inclusive upper bound of the medium bucket.

    The defaults reproduce the previously hard-coded thresholds, so
    ``Series.apply(income_bucket)`` behaves as before; callers may pass
    other cut-offs such as computed percentiles.
    """
    if pd.isna(income):       # If income is missing
        return "UNK"
    if income <= low:         # Low income
        return "L"
    if income <= high:        # Medium income
        return "M"
    return "H"                # High income

# Function to bucketize location (province)
def nomprov_bucket(loc):
    """Return the province unchanged, or "UNK" when it is missing."""
    return "UNK" if pd.isna(loc) else loc

# Function to bucketize gender
def sexo_bucket(gdr):
    """Return the gender code unchanged, or "UNK" when it is missing."""
    return "UNK" if pd.isna(gdr) else gdr

# Function to bucketize customer segment
def segmento_bucket(seg):
    """Map a raw `segmento` label to a short name.

    Returns "TOP", "PARTICULARES" or "UNIVERSITARIO" for the three known
    labels, and "UNK" for a missing value or any unrecognized label. The
    original fell off the end of the function and returned None for
    unrecognized labels.
    """
    if pd.isna(seg):          # If segment is missing
        return "UNK"

    short_names = {
        "01 - TOP": "TOP",                      # Top segment
        "02 - PARTICULARES": "PARTICULARES",    # Individual customers
        "03 - UNIVERSITARIO": "UNIVERSITARIO",  # University students
    }
    return short_names.get(seg, "UNK")

### Performing bucketization and creating segments
1. `bucket_functions` is a dict which contain features to consider for bucketing and the corresponding function to implement
2. Apply bucketization for all features and store it in `<feature>_bucket` column
3. Create segments in format `<age_bucket>_<nomprov_bucket>_<renta_bucket>_<segmento_bucket>`

In [None]:
# Mapping of columns to their respective bucketization functions
bucket_functions = {
    'age': age_bucket,
    'renta': income_bucket,
    'nomprov': nomprov_bucket,
    'segmento': segmento_bucket,
    'sexo': sexo_bucket
}

# Add one `<column>_bucket` column per mapped feature
for source_col in bucket_functions:
    bucketizer = bucket_functions[source_col]
    df_train[f'{source_col}_bucket'] = df_train[source_col].apply(bucketizer)

# Segment key: age, province, income and customer-segment buckets joined with "_"
# (the gender bucket is computed above but is not part of this key).
_segment_parts = [
    df_train['age_bucket'],
    df_train['nomprov_bucket'].astype(str),
    df_train['renta_bucket'],
    df_train['segmento_bucket'].astype(str),
]
_segment_key = _segment_parts[0]
for _part in _segment_parts[1:]:
    _segment_key = _segment_key + "_" + _part
df_train['segment'] = _segment_key

### Splitting dataset into train and eval


1. The cutoff date (`training_cutoff`) is the second-to-last snapshot date in the data (2016-04-28)
2. Rows dated on or before `training_cutoff` form the training data; rows dated after it form the eval data

In [None]:
product_cols = get_all_products()

# Hold out the last month: the second-to-last snapshot date closes the training window.
snapshot_dates = df_train['fecha_dato'].sort_values().unique()
training_cutoff = snapshot_dates[-2]

is_train_row = df_train['fecha_dato'] <= training_cutoff
is_eval_row = df_train['fecha_dato'] > training_cutoff
train_df = df_train[is_train_row]
eval_df = df_train[is_eval_row]

print(f"Cutoff date : {training_cutoff}")
print(f"Data before {training_cutoff} is for training and data after is for evaluation")
print(f"Train data shape : {train_df.shape}")
print(f"Eval data shape : {eval_df.shape}")

### Finding most popular products

1. For every customer, flag each product they have ever owned (1 if owned at least once, else 0) (`ever_owned`)
2. Get the corresponding segment from `df_train` and merge both dataframes
3. Group the merged dataframe `ever_owned` based on `segment` and find top products for each segment

In [None]:
# Per-customer ever-owned flags from the training window.
ever_owned = train_df.groupby('ncodpers')[product_cols].max()

# One segment per customer: the segment on the customer's first row in df_train.
segment_info = (
    df_train[['ncodpers', 'segment']]
    .drop_duplicates(subset='ncodpers')
    .set_index('ncodpers')
)
ever_owned = ever_owned.merge(segment_info, left_index=True, right_index=True)

# Number of customers in each segment who ever owned each product.
segment_popularity = ever_owned.groupby('segment')[product_cols].sum()

# Storing the top products for each segment in segment_top_products
segment_top_products = {}
for segment in segment_popularity.index:
    ranked = segment_popularity.loc[segment].sort_values(ascending=False)
    segment_top_products[segment] = ranked.index.tolist()

### Top Products Across All Customers

* `top_products` stores the most frequently owned products across the entire data
* Acts as a fallback recommendation list for cases where a customer ID is missing or not found during evaluation

In [None]:
# Global popularity ranking over the training window, used as the fallback
# when a customer id is not present in `ever_owned`.
columns_to_use = ['ncodpers', *product_cols]
ever_owned_products = train_df[columns_to_use].groupby('ncodpers')[product_cols].max()
owner_counts = ever_owned_products.sum(axis=0)
product_popularity = owner_counts.sort_values(ascending=False)
top_products = product_popularity.index.tolist()

### Recommending on evaluation data
- `baseline2_recommend_for_user(..)` - Takes the user id and returns top product recommendations which have not been owned by user.

In [None]:
def baseline2_recommend_for_user(user_id, top_n=7):
    """Recommend up to ``top_n`` products for a customer from their segment's ranking.

    Reads the notebook-level ``ever_owned``, ``product_cols``,
    ``segment_top_products`` and ``top_products``.

    Returns
    -------
    list of str
        For a customer present in ``ever_owned``: the segment's products in
        popularity order, minus those the customer has owned. Otherwise the
        first ``top_n`` entries of the global ranking.
    """
    if user_id in ever_owned.index:
        profile = ever_owned.loc[user_id]

        # Products flagged 1 in the customer's ever-owned row.
        ownership = profile[product_cols]
        already_owned = set(ownership[ownership == 1].index)

        # A segment with no ranking yields no candidates.
        candidates = segment_top_products.get(profile['segment'], [])
        return [p for p in candidates if p not in already_owned][:top_n]

    # Unknown customer: global top products.
    return top_products[:top_n]

### Evaluation Predictions

In [None]:
# Eval predictions

# Distinct customers present in the held-out rows, in order of first appearance.
eval_users = eval_df['ncodpers'].unique()
print(f"total users in eval set: {len(eval_users)}")

baseline_eval_preds = pd.DataFrame({"ncodpers": eval_users})

# One segment-based top-7 list per customer.
_recommended = baseline_eval_preds["ncodpers"].map(
    lambda customer_id: baseline2_recommend_for_user(customer_id, top_n=7)
)
baseline_eval_preds["added_products"] = _recommended

### Ground truth for evaluation predictions

In [None]:
# Ground truth from eval_df.
# `mapk` pairs truth and predictions by position. Predictions follow the
# `ncodpers` column of `baseline_eval_preds` (order of first appearance),
# whereas iterating a groupby yields customers in sorted order, so the
# original loop could score one customer's predictions against another
# customer's truth. The truth rows are therefore ordered by that column.
# NOTE(review): this records products *held* in the eval rows, not products
# newly added since the previous month - confirm that is the intended target.
_first_rows = (
    eval_df[['ncodpers'] + product_cols]
    .drop_duplicates(subset='ncodpers', keep='first')  # first row per customer, as before
    .set_index('ncodpers')
)
_truth_flags = _first_rows.loc[baseline_eval_preds['ncodpers'].to_numpy(), product_cols].to_numpy() == 1
eval_actual = [
    [prod for prod, held in zip(product_cols, flags) if held]
    for flags in _truth_flags
]

### Computing MAP@7 for baseline evaluation data

In [None]:
# Score the segment-based predictions against the eval ground truth (lists are paired by position).
map_score_eval = mapk(eval_actual, baseline_eval_preds['added_products'].tolist())
print(f"Baseline 2 MAP@7 (Eval data): {map_score_eval:.4f}")

### Error Analysis

- We have evaluation ground truth data `eval_actual` and evaluation predictions `eval_preds`
- Using these data, we will compute the **False Positives** and **False Negatives**

In [None]:
fp_counter, fn_counter = Counter(), Counter()

eval_preds = baseline_eval_preds['added_products'].tolist()

for actual_products, predicted_products in zip(eval_actual, eval_preds):
    actual_set, predicted_set = set(actual_products), set(predicted_products)

    # False negatives: products in the ground truth that were not recommended.
    fn_counter.update(actual_set - predicted_set)

    # False positives: recommended products absent from the ground truth.
    fp_counter.update(predicted_set - actual_set)

### Top 10 False Negatives

In [None]:
# Ten products most often present in the ground truth but missing from the recommendations.
print("Top 10 false negatives (bought but not predicted):")
for product, count in fn_counter.most_common(10):
    print(f"{product} : {count}")

### Top 10 False Positives

In [None]:
# Ten products most often recommended but absent from the ground truth.
print("Top 10 false positives (predicted but not bought):")
for product, count in fp_counter.most_common(10):
    print(f"{product} : {count}")

### Recommendation on the Test Set

In [None]:
# Rebuild the ever-owned table from all of df_train (training and eval rows)
# for test-set recommendations, then attach each customer's segment
# (the segment on the customer's first row in df_train).

ever_owned = df_train.groupby('ncodpers')[product_cols].max()
segment_info = df_train[['ncodpers', 'segment']].drop_duplicates(subset='ncodpers').set_index('ncodpers')
ever_owned = ever_owned.merge(segment_info, left_index=True, right_index=True)

def recommend_for_user(user_id, top_n=7):
    """Recommend up to ``top_n`` products for a test-set customer.

    Reads the notebook-level ``ever_owned``, ``product_cols``,
    ``segment_top_products`` and ``top_products``.

    Returns
    -------
    list of str
        Products from the customer's segment ranking that the customer has
        not owned. A customer missing from ``ever_owned``, or whose segment
        has no ranking, is served from the global ``top_products`` ranking
        instead of getting an empty list (which can never score) plus one
        printed line per customer.
    """
    if user_id not in ever_owned.index:
        return top_products[:top_n]
    user_data = ever_owned.loc[user_id]
    owned_flags = user_data[product_cols]
    owned_products = set(owned_flags[owned_flags == 1].index)
    popular_products = segment_top_products.get(user_data['segment'], top_products)
    recs = [p for p in popular_products if p not in owned_products]
    return recs[:top_n]

### Batch Prediction

- Implemented to prevent kernel crash

In [None]:
batch_size = 400000
n = len(sample_data)
all_recs = []

for start in range(0, n, batch_size):
    # Clamp so the message for the last batch reports the real end row.
    end = min(start + batch_size, n)
    print(f"Recommending for {start} to {end}")
    batch = sample_data.iloc[start:end]
    # The metric is MAP@7, so request 7 products (the original asked for 5
    # here, unlike every other baseline in the notebook).
    recs = batch['ncodpers'].progress_apply(lambda uid: " ".join(recommend_for_user(uid, top_n=7)))
    all_recs.extend(recs)

sample_data['added_products'] = all_recs

In [None]:
# Write the Baseline 2 submission file (no index column).
sample_data.to_csv("Baseline2_test_preds.csv", index=False)

## Baseline 3 - Popularity Based on Segments

* Customer attributes such as `age`, `nomprov`, `sexo`, `segmento`, `renta` and `antiguedad` are binned into discrete buckets.
* These buckets define customer segments, and recommendations are made by suggesting the most popular products within the customer’s segment.

### Basic data preprocessing steps:
1. Replace ' NA' in `age` with np.nan and convert to numeric type
2. Replace '     NA' in `antiguedad` with np.nan, trim spaces and convert to numeric type
3. Find the customer ids of rows where all 5 fields - `renta`, `nomprov`, `age`, `sexo` and `antiguedad` - are NaN
4. Drop the rows where all 5 fields are NaN

In [None]:
# Generic preprocessing steps

# 1. Clean 'age' by handling 'NA' values and converting to numeric
train_data['age'] = pd.to_numeric(train_data['age'].replace(' NA', np.nan), errors='coerce')

# 2. Clean 'antiguedad' by handling 'NA', stripping spaces, and converting to numeric
train_data['antiguedad'] = pd.to_numeric(train_data['antiguedad'].replace('     NA', np.nan).astype(str).str.strip(), errors='coerce')

# 3. Flag rows with all 5 key fields missing (renta, nomprov, age, sexo, antiguedad).
#    The mask is computed once and reused so the selection and the drop cannot drift apart.
_all_missing = (
    train_data['renta'].isna() &
    train_data['nomprov'].isna() &
    train_data['age'].isna() &
    train_data['sexo'].isna() &
    train_data['antiguedad'].isna()
)
ids_with_nan = train_data.loc[_all_missing, 'ncodpers']

print(f"Total customers with no age, renta, sexo, antiguedad and nomprov record : {len(set(ids_with_nan))}")

# 4. Drop those rows. `.copy()` makes df_train an independent frame, so the
#    bucket columns added later do not write into a slice of train_data.
df_train = train_data[~_all_missing].copy()
print(f"Shape of data after removing customers with no age, nomprov, renta, sexo and antiguedad : {df_train.shape}")

# Sanity check - rows kept must equal total rows minus flagged rows
# (`ids_with_nan` holds one entry per flagged row).
assert df_train.shape[0] == (train_data.shape[0] - len(ids_with_nan))

### Calculating 33rd and 67th percentile of renta for bucketing

In [None]:
# Tercile boundaries of income.
# NOTE(review): `income_bucket` uses the literals 80000 and 133700 rather than these values.
q_33, q_67 = (df_train['renta'].quantile(level) for level in (0.33, 0.67))
print(f"33rd percentile of renta : {q_33}")
print(f"67th percentile of renta : {q_67}")

In [None]:
# Function to bucketize age into ranges
def age_bucket(age):
    """Map an age to one of "0-17", "18-29", "30-44", "45-59", "60+", or "UNK" when missing."""
    if pd.isna(age):
        return "UNK"
    if age < 18:
        return "0-17"
    if age < 30:
        return "18-29"
    if age < 45:
        return "30-44"
    return "45-59" if age < 60 else "60+"

# Function to bucketize income into categories
def income_bucket(income, low=80000, high=133700):
    """Map an income to "L", "M" or "H", or "UNK" when missing.

    Parameters
    ----------
    income : number
        Income value (NaN/None allowed).
    low : number, default 80000
        Inclusive upper bound of the low bucket.
    high : number, default 133700
        Inclusive upper bound of the medium bucket.

    The defaults reproduce the previously hard-coded thresholds, so
    ``Series.apply(income_bucket)`` behaves as before; callers may pass
    other cut-offs such as computed percentiles.
    """
    if pd.isna(income):       # If income is missing
        return "UNK"
    if income <= low:         # Low income
        return "L"
    if income <= high:        # Medium income
        return "M"
    return "H"                # High income

# Function to bucketize location (province)
def nomprov_bucket(loc):
    """Return the province unchanged, or "UNK" when it is missing."""
    province_missing = pd.isna(loc)
    if not province_missing:
        return loc
    return "UNK"

# Function to bucketize gender
def sexo_bucket(gdr):
    """Return the gender code unchanged, or "UNK" when it is missing."""
    gender_missing = pd.isna(gdr)
    if not gender_missing:
        return gdr
    return "UNK"

# Function to bucketize customer segment
def segmento_bucket(seg):
    """Map a raw `segmento` label to a short name.

    Returns "TOP", "PARTICULARES" or "UNIVERSITARIO" for the three known
    labels, and "UNK" for a missing value or any unrecognized label. The
    original fell off the end of the function and returned None for
    unrecognized labels.
    """
    if pd.isna(seg):          # If segment is missing
        return "UNK"

    short_names = {
        "01 - TOP": "TOP",                      # Top segment
        "02 - PARTICULARES": "PARTICULARES",    # Individual customers
        "03 - UNIVERSITARIO": "UNIVERSITARIO",  # University students
    }
    return short_names.get(seg, "UNK")

def antiguedad_bucket(mths):
    """Bucket customer seniority in months: "Y" (<= 60), "M" (61-180), "O" (> 180), "UNK" when missing.

    NOTE(review): negative values also land in "Y"; confirm the column
    carries no negative sentinel values.
    """
    if pd.isna(mths):
        return "UNK"
    elif mths <= 60:      # Young customer
        return "Y"
    elif mths <= 180:     # Mid customer
        return "M"
    else:                 # Old customer
        return "O"


### Performing bucketization and creating segments
1. `bucket_functions` is a dict which contain features to consider for bucketing and the corresponding function to implement
2. Apply bucketization for all features and store it in `<feature>_bucket` column
3. Create segments in format `<age_bucket>_<nomprov_bucket>_<sexo_bucket>_<renta_bucket>_<antiguedad_bucket>_<segmento_bucket>`

In [None]:
# Mapping of columns to their bucketization functions
bucket_functions = {
    'age': age_bucket,
    'renta': income_bucket,
    'nomprov': nomprov_bucket,
    'segmento': segmento_bucket,
    'sexo': sexo_bucket,
    'antiguedad': antiguedad_bucket
}

# Add one `<column>_bucket` column per mapped feature
for source_col in bucket_functions:
    bucketizer = bucket_functions[source_col]
    df_train[f'{source_col}_bucket'] = df_train[source_col].apply(bucketizer)

# Segment key: age, province, gender, income, seniority and customer-segment
# buckets joined with "_".
_segment_parts = [
    df_train['age_bucket'],
    df_train['nomprov_bucket'].astype(str),
    df_train['sexo_bucket'].astype(str),
    df_train['renta_bucket'],
    df_train['antiguedad_bucket'],
    df_train['segmento_bucket'].astype(str),
]
_segment_key = _segment_parts[0]
for _part in _segment_parts[1:]:
    _segment_key = _segment_key + "_" + _part
df_train['segment'] = _segment_key

### Splitting dataset into train and eval


1. The cutoff date (`training_cutoff`) is the second-to-last snapshot date in the data (2016-04-28)
2. Rows dated on or before `training_cutoff` form the training data; rows dated after it form the eval data

In [None]:
# Hold out the last month: the second-to-last snapshot date closes the training window.
snapshot_dates = df_train['fecha_dato'].sort_values().unique()
training_cutoff = snapshot_dates[-2]

is_train_row = df_train['fecha_dato'] <= training_cutoff
is_eval_row = df_train['fecha_dato'] > training_cutoff
train_df = df_train[is_train_row]
eval_df = df_train[is_eval_row]

print(f"Cutoff date : {training_cutoff}")
print(f"Data before {training_cutoff} is for training and data after is for evaluation")
print(f"Train data shape : {train_df.shape}")
print(f"Eval data shape : {eval_df.shape}")

### Finding most popular products

1. For every customer, flag each product they have ever owned (1 if owned at least once, else 0) (`ever_owned`)
2. Get the corresponding segment from `df_train` and merge both dataframes
3. Group the merged dataframe `ever_owned` based on `segment` and find top products for each segment

In [None]:
product_cols = get_all_products()

# Per-customer ever-owned flags from the training window.
ever_owned = train_df.groupby('ncodpers')[product_cols].max()

# One segment per customer: the segment on the customer's first row in df_train.
segment_info = (
    df_train[['ncodpers', 'segment']]
    .drop_duplicates(subset='ncodpers')
    .set_index('ncodpers')
)
ever_owned = ever_owned.merge(segment_info, left_index=True, right_index=True)

# Number of customers in each segment who ever owned each product.
segment_popularity = ever_owned.groupby('segment')[product_cols].sum()

# Storing the top products for each segment in segment_top_products
segment_top_products = {}
for segment in segment_popularity.index:
    ranked = segment_popularity.loc[segment].sort_values(ascending=False)
    segment_top_products[segment] = ranked.index.tolist()

print(f"Total segments : {len(segment_popularity)}")

### Top Products Across All Customers

* `top_products` stores the most frequently owned products across the entire data
* Acts as a fallback recommendation list for cases where a customer ID is missing or not found during evaluation

In [None]:
# Global popularity ranking over the training window, used as the fallback
# when a customer id is not present in `ever_owned`.
columns_to_use = ['ncodpers', *product_cols]
ever_owned_products = train_df[columns_to_use].groupby('ncodpers')[product_cols].max()
owner_counts = ever_owned_products.sum(axis=0)
product_popularity = owner_counts.sort_values(ascending=False)
top_products = product_popularity.index.tolist()

### Recommending on evaluation data
- `baseline3_recommend_for_eval_user(..)` - Takes the user id and returns the top product recommendations that the user has not owned.

In [None]:
def baseline3_recommend_for_eval_user(user_id, top_n=7):
    """Recommend up to ``top_n`` products for a customer from their segment's ranking.

    Reads the notebook-level ``ever_owned``, ``product_cols``,
    ``segment_top_products`` and ``top_products``.

    Returns
    -------
    list of str
        For a customer present in ``ever_owned``: the segment's products in
        popularity order, minus those the customer has owned. Otherwise the
        first ``top_n`` entries of the global ranking.
    """
    if user_id in ever_owned.index:
        profile = ever_owned.loc[user_id]

        # Products flagged 1 in the customer's ever-owned row.
        ownership = profile[product_cols]
        already_owned = set(ownership[ownership == 1].index)

        # A segment with no ranking yields no candidates.
        candidates = segment_top_products.get(profile['segment'], [])
        return [p for p in candidates if p not in already_owned][:top_n]

    # Unknown customer: global top products.
    return top_products[:top_n]

### Evaluation Predictions

In [None]:
# Eval predictions

# Distinct customers present in the held-out rows, in order of first appearance.
eval_users = eval_df['ncodpers'].unique()
print(f"total users in eval set: {len(eval_users)}")

baseline_eval_preds = pd.DataFrame({"ncodpers": eval_users})

# One segment-based top-7 list per customer.
_recommended = baseline_eval_preds["ncodpers"].map(
    lambda customer_id: baseline3_recommend_for_eval_user(customer_id, top_n=7)
)
baseline_eval_preds["added_products"] = _recommended

### Ground truth for evaluation predictions

In [None]:
# Ground truth from eval_df.
# `mapk` pairs truth and predictions by position. Predictions follow the
# `ncodpers` column of `baseline_eval_preds` (order of first appearance),
# whereas iterating a groupby yields customers in sorted order, so the
# original loop could score one customer's predictions against another
# customer's truth. The truth rows are therefore ordered by that column.
# NOTE(review): this records products *held* in the eval rows, not products
# newly added since the previous month - confirm that is the intended target.
_first_rows = (
    eval_df[['ncodpers'] + product_cols]
    .drop_duplicates(subset='ncodpers', keep='first')  # first row per customer, as before
    .set_index('ncodpers')
)
_truth_flags = _first_rows.loc[baseline_eval_preds['ncodpers'].to_numpy(), product_cols].to_numpy() == 1
eval_actual = [
    [prod for prod, held in zip(product_cols, flags) if held]
    for flags in _truth_flags
]

### Computing MAP@7 for baseline evaluation data

In [None]:
# Score the segment-based predictions against the eval ground truth (lists are paired by position).
map_score_eval = mapk(eval_actual, baseline_eval_preds['added_products'].tolist())
print(f"Baseline 3 MAP@7 (Eval data): {map_score_eval:.4f}")

### Error Analysis

- We have evaluation ground truth data `eval_actual` and evaluation predictions `eval_preds`
- Using these data, we will compute the **False Positives** and **False Negatives**

In [None]:
fp_counter, fn_counter = Counter(), Counter()

eval_preds = baseline_eval_preds['added_products'].tolist()

for actual_products, predicted_products in zip(eval_actual, eval_preds):
    actual_set, predicted_set = set(actual_products), set(predicted_products)

    # False negatives: products in the ground truth that were not recommended.
    fn_counter.update(actual_set - predicted_set)

    # False positives: recommended products absent from the ground truth.
    fp_counter.update(predicted_set - actual_set)

### Top 10 False Negatives

In [None]:
# Ten products most often present in the ground truth but missing from the recommendations.
print("Top 10 false negatives (bought but not predicted):")
for product, count in fn_counter.most_common(10):
    print(f"{product} : {count}")

### Top 10 False Positives

In [None]:
# Ten products most often recommended but absent from the ground truth.
print("Top 10 false positives (predicted but not bought):")
for product, count in fp_counter.most_common(10):
    print(f"{product} : {count}")

### Recommendation on the Test Set

In [None]:
# Rebuild the ever-owned table from all of df_train (training and eval rows)
# for test-set recommendations, then attach each customer's segment
# (the segment on the customer's first row in df_train).
ever_owned = df_train.groupby('ncodpers')[product_cols].max()
segment_info = df_train[['ncodpers', 'segment']].drop_duplicates(subset='ncodpers').set_index('ncodpers')
ever_owned = ever_owned.merge(segment_info, left_index=True, right_index=True)


def recommend_for_user(user_id, top_n=7):
    """Recommend up to ``top_n`` products for a test-set customer.

    Reads the notebook-level ``ever_owned``, ``product_cols``,
    ``segment_top_products`` and ``top_products``.

    Returns
    -------
    list of str
        Products from the customer's segment ranking that the customer has
        not owned. A customer missing from ``ever_owned``, or whose segment
        has no ranking, is served from the global ``top_products`` ranking
        instead of getting an empty list (which can never score) plus one
        printed line per customer.
    """
    if user_id not in ever_owned.index:
        return top_products[:top_n]
    user_data = ever_owned.loc[user_id]
    owned_flags = user_data[product_cols]
    owned_products = set(owned_flags[owned_flags == 1].index)
    popular_products = segment_top_products.get(user_data['segment'], top_products)
    recs = [p for p in popular_products if p not in owned_products]
    return recs[:top_n]

### Batch Prediction

- Implemented to prevent kernel crash

In [None]:
batch_size = 400000
n = len(sample_data)
all_recs = []

# Walk the submission frame in fixed-size slices and collect one
# space-separated recommendation string per customer.
batch_starts = range(0, n, batch_size)
for start in batch_starts:
    end = start + batch_size
    print(f"Recommending for {start} to {end}")
    customer_ids = sample_data['ncodpers'].iloc[start:end]
    recs = customer_ids.progress_apply(lambda uid: " ".join(recommend_for_user(uid, top_n=7)))
    all_recs.extend(recs)

sample_data['added_products'] = all_recs

In [None]:
# Write the Baseline 3 submission file (no index column).
sample_data.to_csv("Baseline3_test_preds.csv",index=False)

## Seasonality

## Collaborative filtering - Exploration

In [None]:
# from scipy.sparse import csr_matrix
# from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# train_set = train_set[product_cols].fillna(0)
# product_mtx = csr_matrix(train_set[product_cols].values)

In [8]:
# Product-ownership column names (those starting with `ind_` and ending with `_ult1`).
product_cols = get_all_products()

In [9]:
# Restrict the exploration to the rows dated 2016-04-28.
data = train_data[train_data['fecha_dato'] == "2016-04-28"]
data.shape

(928274, 48)

In [10]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

    
# Customer x product ownership matrix. Missing flags are treated as "not
# owned" because cosine_similarity rejects NaN input; for a snapshot with no
# missing flags this changes nothing.
user_item_matrix = data[product_cols].fillna(0)

# Compute item-item similarity (one row and one column per product)
item_sim = cosine_similarity(user_item_matrix.T)
item_sim_df = pd.DataFrame(item_sim, index=product_cols, columns=product_cols)

In [26]:
# Ownership flags of one example customer (first matching row in the snapshot).
user_products = data[data['ncodpers']==896849][product_cols].iloc[0]

In [31]:
# Score each product as the sum of its similarities to the products the customer holds.
scores = item_sim_df.dot(user_products)
# (inspect `user_products` / `scores` here when debugging)

In [32]:
# Zero out the scores of products the customer already holds (flag 1 -> multiplier 0).
scores = scores * (1 - user_products)

In [33]:
# Display the scores in ascending order (strongest candidates last).
scores.sort_values()

ind_ctma_fin_ult1    0.000000
ind_cno_fin_ult1     0.000000
ind_ecue_fin_ult1    0.000000
ind_tjcr_fin_ult1    0.000000
ind_ctju_fin_ult1    0.000042
ind_ahor_fin_ult1    0.011080
ind_aval_fin_ult1    0.032449
ind_deco_fin_ult1    0.040403
ind_cder_fin_ult1    0.041170
ind_pres_fin_ult1    0.044861
ind_deme_fin_ult1    0.065579
ind_viv_fin_ult1     0.111155
ind_ctop_fin_ult1    0.336907
ind_plan_fin_ult1    0.368979
ind_hip_fin_ult1     0.371715
ind_cco_fin_ult1     0.409931
ind_fond_fin_ult1    0.416722
ind_ctpp_fin_ult1    0.527205
ind_valo_fin_ult1    0.532790
ind_dela_fin_ult1    0.537754
ind_reca_fin_ult1    0.942148
ind_recibo_ult1      1.387193
ind_nomina_ult1      1.462698
ind_nom_pens_ult1    1.535896
dtype: float64

In [None]:
def recommend_similar_products(user_products, item_sim_df, top_n=7):
    """Rank products a customer does not hold by item-item similarity.

    The original cell used ``return`` and ``top_n`` at module level, which
    cannot run; the same steps are wrapped in a function here.

    Parameters
    ----------
    user_products : pandas.Series
        Ownership flags (1 = held) indexed by product name.
    item_sim_df : pandas.DataFrame
        Square product-by-product similarity matrix with product names as
        both index and columns.
    top_n : int, default 7
        Maximum number of products to return.

    Returns
    -------
    list of str
        Product names with the highest scores, best first. Held products
        are excluded rather than kept with a zero score.
    """
    # Align the flags with the matrix columns; a missing flag counts as not held.
    owned = user_products.reindex(item_sim_df.columns).fillna(0)

    # Compute scores by multiplying similarity with the customer's held products
    scores = item_sim_df.dot(owned)

    # Remove already held products
    candidates = scores[owned.to_numpy() != 1]

    # Sort and get top-N
    return candidates.sort_values(ascending=False).head(top_n).index.tolist()


