In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

customers_df = pd.read_csv('data/Customers.csv')
products_df = pd.read_csv('data/Products.csv')
transactions_df = pd.read_csv('data/Transactions.csv')

In [3]:
combined_product_customer_transaction_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID', suffixes=('', '_transactions'), how='inner'),
    products_df, on='ProductID', how='inner', suffixes=('','_products')
)

In [5]:
temp_customer_df = combined_product_customer_transaction_df.groupby('CustomerID').apply(lambda x : x)
temp_customer_df = temp_customer_df.reset_index(drop=True)

  temp_customer_df = combined_product_customer_transaction_df.groupby('CustomerID').apply(lambda x : x)


In [24]:
# Feature processing - to better represent the data

# Count the number of transactions per category for each customer
category_counts = combined_product_customer_transaction_df.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)
category_counts.columns = [f'Category_count_{col}' for col in category_counts.columns]
category_counts = category_counts.reset_index()

# Count the number of transactions per quantity for each customer
quantity_counts = combined_product_customer_transaction_df.groupby(['CustomerID', 'Quantity']).size().unstack(fill_value=0)
quantity_counts.columns = [f'Quantity_count_{col}' for col in quantity_counts.columns]
quantity_counts = quantity_counts.reset_index()

# Merge the category and quantity counts with the main dataframe
merged_df = temp_customer_df.merge(category_counts, on="CustomerID").merge(quantity_counts, on="CustomerID")

merged_df['spent_per_item'] = merged_df['Price'] / merged_df['Quantity']
merged_df['average_spending_per_product'] = merged_df.groupby('CustomerID')['spent_per_item'].transform('mean')

merged_df['total_quantity'] = merged_df.groupby('CustomerID')['Quantity'].transform('sum')
merged_df['average_quantity'] = merged_df.groupby('CustomerID')['Quantity'].transform('mean')
merged_df['total_value'] = merged_df.groupby('CustomerID')['TotalValue'].transform('sum')

merged_df['SignupDate'] = pd.to_datetime(merged_df['SignupDate'])
merged_df['TransactionDate'] = pd.to_datetime(merged_df['TransactionDate'])
merged_df['Customer Since'] = (merged_df['TransactionDate'] - merged_df['SignupDate']).dt.days
merged_df['Customer Since'] = merged_df.groupby('CustomerID')['Customer Since'].transform('mean')

merged_df['TransactionDate'] = pd.to_datetime(merged_df['TransactionDate'])

# Create columns for each quarter of 2024
for quarter, (start_month, end_month) in enumerate([(1, 3), (4, 6), (7, 9), (10, 12)], start=1):
    quarter_col = f'transaction_value_2024_q{quarter}'
    merged_df[quarter_col] = merged_df.apply(
        lambda row: row['TotalValue'] if row['TransactionDate'].year == 2024 and start_month <= row['TransactionDate'].month <= end_month else 0, axis=1
    )

# Sum the values for all customers for each quarter
for quarter in range(1, 5):
    quarter_col = f'transaction_value_2024_q{quarter}'
    merged_df[quarter_col] = merged_df.groupby('CustomerID')[quarter_col].transform('sum')

# Perform one-hot encoding for the 'region column using 0/1
region_dummies = pd.get_dummies(merged_df['Region'], prefix='Region', drop_first=False, dtype=int)
merged_df = pd.concat([merged_df, region_dummies], axis=1)

In [25]:
# Drop unwanted cols and group the customer data 
merged_df.drop(['CustomerName', 'Price', 'ProductName', 'Category', 'Quantity', 'TransactionID', 'ProductID', 'TransactionDate', 'SignupDate','TotalValue', 'Price_products','spent_per_item','Region'], axis=1, inplace=True)
merged_df = merged_df.groupby('CustomerID').mean()
merged_df = merged_df.reset_index()

customer_ids = merged_df['CustomerID']
merged_df = merged_df.drop('CustomerID', axis=1)

KeyError: "['CustomerName', 'Price', 'ProductName', 'Category', 'Quantity', 'TransactionID', 'ProductID', 'TransactionDate', 'SignupDate', 'TotalValue', 'Price_products', 'spent_per_item', 'Region'] not found in axis"

In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(merged_df)
merged_df = pd.DataFrame(scaled_values, columns=merged_df.columns)

class LookAlike:
    def __init__(self, customer_data):
        self.customer_data = customer_data

    def cosine_similarity(self, vector_a, vector_b):
        dot_product = np.dot(vector_a, vector_b)
        norm_a = np.linalg.norm(vector_a)
        norm_b = np.linalg.norm(vector_b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        similarity = dot_product / (norm_a * norm_b)
        return similarity

    def get_top_three_similar_customers(self, customer_index):
        target_vector = self.customer_data[customer_index]
        similarity_scores = []

        for idx, vector in enumerate(self.customer_data):
            if idx != customer_index:
                similarity = self.cosine_similarity(target_vector, vector)
                similarity_scores.append((idx, similarity))

        similarity_scores.sort(key=lambda x: x[1], reverse=True)
        top_three_with_scores = similarity_scores[:3]
        return [idx for idx, _ in top_three_with_scores]


customer_data = merged_df.values
lookalike = LookAlike(customer_data)

lookalike_map = {
    customer_ids.iloc[customer_index]: [
        (customer_ids.iloc[idx], round(lookalike.cosine_similarity(customer_data[customer_index], customer_data[idx]), 2))
        for idx in lookalike.get_top_three_similar_customers(customer_index)
    ]
    for customer_index in range(min(20, len(customer_ids)))
}

In [32]:
lookalike_df = pd.DataFrame([
    {'CustomerID': key, 'SimilarCustomers': [(cust_id, float(score)) for cust_id, score in value]}
    for key, value in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv')

In [33]:
lookalike_map

{'C0001': [('C0120', np.float64(0.92)),
  ('C0188', np.float64(0.91)),
  ('C0174', np.float64(0.9))],
 'C0002': [('C0106', np.float64(0.95)),
  ('C0043', np.float64(0.94)),
  ('C0040', np.float64(0.91))],
 'C0003': [('C0129', np.float64(0.93)),
  ('C0039', np.float64(0.92)),
  ('C0163', np.float64(0.91))],
 'C0004': [('C0113', np.float64(0.94)),
  ('C0104', np.float64(0.93)),
  ('C0168', np.float64(0.92))],
 'C0005': [('C0007', np.float64(0.92)),
  ('C0043', np.float64(0.91)),
  ('C0186', np.float64(0.91))],
 'C0006': [('C0011', np.float64(0.94)),
  ('C0137', np.float64(0.92)),
  ('C0187', np.float64(0.91))],
 'C0007': [('C0140', np.float64(0.92)),
  ('C0005', np.float64(0.92)),
  ('C0040', np.float64(0.91))],
 'C0008': [('C0098', np.float64(0.94)),
  ('C0154', np.float64(0.89)),
  ('C0049', np.float64(0.88))],
 'C0009': [('C0119', np.float64(0.89)),
  ('C0103', np.float64(0.86)),
  ('C0198', np.float64(0.84))],
 'C0010': [('C0062', np.float64(0.94)),
  ('C0061', np.float64(0.91)),
  (