# In [16]:
import csv
import json
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the raw customer and transaction tables.
customers = pd.read_csv('/content/drive/MyDrive/Zeotap/Customers.csv')
transactions = pd.read_csv('/content/drive/MyDrive/Zeotap/Transactions.csv')

# Left merge keeps customers that have no transactions; their transaction
# columns are NaN/NaT and are handled by the steps below.
data = pd.merge(customers, transactions, on='CustomerID', how='left')

# Parse dates; unparseable or missing values become NaT instead of raising.
data['SignupDate'] = pd.to_datetime(data['SignupDate'], errors='coerce')
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'], errors='coerce')

# Reference point for the day-count features.
# NOTE: this is wall-clock time, so the raw day counts depend on when the
# script is run.
current_date = datetime.now()

data['CustomerAge'] = (current_date - data['SignupDate']).dt.days
data['Recency'] = (current_date - data['TransactionDate']).dt.days

# One row per customer.
agg_data = data.groupby('CustomerID').agg({
    # Every product bought (repeats kept); NaN from the left merge removed.
    'ProductID': lambda x: list(x.dropna()),
    # Days since the most recent transaction; stays NaN when there is none.
    'Recency': 'min',
    # 'count' ignores NaT, so customers without transactions get 0.
    'TransactionDate': 'count',
    # Carried along for reference; not part of the feature matrix below.
    'Region': 'first',
    'CustomerAge': 'first',
}).reset_index().rename(columns={'TransactionDate': 'Frequency'})

# Fill remaining gaps in the numeric features with the column mean.
numeric_features = ['CustomerAge', 'Recency', 'Frequency']
imputer = SimpleImputer(strategy='mean')  # or median or constant
agg_data[numeric_features] = imputer.fit_transform(agg_data[numeric_features])

# Bag-of-products encoding: one space-separated "document" per customer.
agg_data['ProductList'] = agg_data['ProductID'].apply(
    lambda product_ids: ' '.join(map(str, product_ids))
)
# Tokenise on whitespace only and keep case. The CountVectorizer defaults
# lowercase the text and keep only tokens of two or more alphanumeric
# characters, which would drop or fragment product IDs that are a single
# character, numeric (e.g. "67.0") or contain punctuation.
vectorizer = CountVectorizer(tokenizer=str.split, lowercase=False, token_pattern=None)
product_matrix = vectorizer.fit_transform(agg_data['ProductList'])

# Standardise the numeric features to zero mean and unit variance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(agg_data[numeric_features])

# Combine the dense numeric features with the sparse product counts.
features = hstack([scaled_features, product_matrix])

# Pairwise cosine similarity; row/column i corresponds to agg_data row i.
similarity_matrix = cosine_similarity(features)

def get_lookalikes(customer_id, similarity_matrix, n=2, customer_ids=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: ID of the customer to find lookalikes for.
        similarity_matrix: Square array-like in which entry ``[i][j]`` is the
            similarity between the i-th and the j-th customer.
        n: Maximum number of lookalikes to return.
        customer_ids: Customer IDs in the same order as the rows of
            ``similarity_matrix``. Defaults to the module-level
            ``agg_data['CustomerID']`` column.

    Returns:
        A list of at most ``n`` ``(customer_id, score)`` tuples ordered from
        most to least similar. The queried customer and candidates whose
        score is NaN are never included.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``customer_ids``.
    """
    if customer_ids is None:
        customer_ids = agg_data['CustomerID']
    ids = list(customer_ids)

    try:
        idx = ids.index(customer_id)
    except ValueError:
        raise IndexError(f"customer_id {customer_id!r} not found") from None

    scores = np.asarray(similarity_matrix[idx], dtype=float).ravel()

    # Exclude the customer by position rather than assuming it sorts first:
    # another customer can tie at the top score. NaN scores are removed
    # before sorting so they cannot disturb the ordering or use up a slot.
    candidates = [
        i for i, score in enumerate(scores)
        if i != idx and not np.isnan(score)
    ]
    candidates.sort(key=scores.__getitem__, reverse=True)

    return [(ids[i], float(scores[i])) for i in candidates[:max(n, 0)]]

# Generate lookalikes for the first 20 customers (in agg_data order).
lookalike_dict = {
    customer_id: get_lookalikes(customer_id, similarity_matrix)
    for customer_id in agg_data['CustomerID'].unique()[:20]
}

# Save results: one row per customer -> (CustomerID, JSON list of
# [lookalike_id, score] pairs). csv.writer quotes the JSON payload, which
# contains commas and double quotes, so the file is valid CSV with exactly
# two columns per row.
with open('Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(
        (customer_id, json.dumps(matches))
        for customer_id, matches in lookalike_dict.items()
    )