In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load data
customers_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE')
products_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0')
transactions_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF')

# Merge datasets to get complete data for each transaction
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, products_df, on='ProductID', how='left')

# Feature Engineering
# 1. Customer Profile: Region, SignupDate, CustomerName (encoded)
customer_profile = customers_df[['CustomerID', 'Region', 'SignupDate']].copy()  # Using copy() to avoid SettingWithCopyWarning
customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate']).astype(int) / 10**9  # Convert to timestamp

# Encode 'Region' to numeric values using LabelEncoder
le_region = LabelEncoder()
customer_profile['Region'] = le_region.fit_transform(customer_profile['Region'])

# 2. Transaction history: Product preferences (categories), TotalValue spent, etc.
transaction_history = merged_df.groupby('CustomerID').agg({
    'ProductID': lambda x: list(x),  # List of products bought
    'TotalValue': 'sum',             # Total value spent
    'Quantity': 'sum',               # Total quantity purchased
    'Category': lambda x: list(x),   # List of product categories bought
}).reset_index()

# Feature Scaling for numerical values (TotalValue, Quantity)
scaler = StandardScaler()
transaction_history[['TotalValue', 'Quantity']] = scaler.fit_transform(transaction_history[['TotalValue', 'Quantity']])

# Now we will calculate similarity scores based on both customer profile and transaction history.

# 3. Compute similarity using cosine similarity
# Combine customer profile and transaction history into one DataFrame for each customer
customer_data = pd.merge(customer_profile, transaction_history, on='CustomerID', how='left')

# Construct feature vectors
customer_vectors = customer_data[['Region', 'SignupDate', 'TotalValue', 'Quantity']]
customer_vectors = customer_vectors.fillna(0)  # Handle missing values

# Cosine Similarity: Similarity based on customer profile and transaction history
similarity_matrix = cosine_similarity(customer_vectors)

# Now, for each customer in the first 20 customers (C0001 - C0020), we will find the top 3 lookalikes
lookalike_map = {}

for customer_id in customer_data['CustomerID'][:20]:
    idx = customer_data[customer_data['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[idx]
    
    # Sort by similarity scores, excluding the customer itself
    sorted_similarities = sorted(enumerate(similarity_scores), key=lambda x: x[1], reverse=True)
    top_3_similar_customers = [(customer_data.iloc[i[0]]['CustomerID'], i[1]) for i in sorted_similarities[1:4]]
    
    lookalike_map[customer_id] = top_3_similar_customers

# Prepare the Lookalike.csv format with proper columns
lookalike_data = []
for customer_id, lookalikes in lookalike_map.items():
    for lookalike in lookalikes:
        lookalike_data.append({
            'CustomerID': customer_id,
            'LookalikeCustomerID': lookalike[0],
            'SimilarityScore': lookalike[1]
        })

lookalike_df = pd.DataFrame(lookalike_data)

# Output the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print the lookalike mapping for verification
print(lookalike_map)


{'C0001': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0002': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0003': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0004': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0005': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0006': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0007': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0008': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0009': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0010': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0011': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0012': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0013': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0014': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0015': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0016': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0)], 'C0017': [('C0002', 1.0), ('C0003', 1.0), ('C0004', 1.0