In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
import joblib

In [2]:
# Locations of the three input CSV files. Nothing is read here; the paths are
# relative, so they resolve against the notebook's working directory.
customers_path = 'Customers.csv'
products_path = 'Products.csv'
transactions_path = 'Transactions.csv'

In [3]:
# Read the customer, product and transaction tables, in that order
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    (customers_path, products_path, transactions_path),
)

In [4]:
# Attach customer and product attributes to every transaction row.
# Left joins keep each transaction even when its lookup key has no match.
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID", how="left")
    .merge(products_df, on="ProductID", how="left")
)

In [5]:
# Total units and total spend for every (customer, product) pair,
# returned as a flat table with CustomerID / ProductID as regular columns
customer_product_matrix = (
    merged_df
    .groupby(['CustomerID', 'ProductID'])
    .agg(Quantity=('Quantity', 'sum'), TotalValue=('TotalValue', 'sum'))
    .reset_index()
)

In [6]:
# Reshape to one row per customer and one column per product, holding total spend.
# Customer/product pairs with no recorded purchase are filled with 0.
interaction_matrix = (
    customer_product_matrix
    .pivot(index='CustomerID', columns='ProductID', values='TotalValue')
    .fillna(0)
)

In [7]:
# Derive a customer-tenure feature from SignupDate.
# Note: both assignments modify customers_df in place, and DaysSinceSignup is
# measured against the wall clock at execution time (pd.Timestamp.now()), so
# its values differ from one run of the notebook to the next.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['DaysSinceSignup'] = (pd.Timestamp.now() - customers_df['SignupDate']).dt.days

In [8]:
# One-hot encode the Region column into a dense array; the generated column
# names are kept in region_feature_names and the rows are labelled by CustomerID.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
regions_encoded = encoder.fit_transform(customers_df[['Region']])
region_feature_names = encoder.get_feature_names_out(['Region'])
regions_df = pd.DataFrame(
    data=regions_encoded,
    index=customers_df['CustomerID'],
    columns=region_feature_names,
)

In [9]:
# Extend each customer's product-spend row with the region one-hots and
# DaysSinceSignup. Both joins align on the CustomerID index; any value left
# missing after the joins is replaced with 0.
final_features = (
    interaction_matrix
    .join(regions_df)
    .join(customers_df.set_index('CustomerID')['DaysSinceSignup'])
    .fillna(0)
)

In [10]:
# Persist the combined feature matrix (product spend, region one-hots and
# DaysSinceSignup, indexed by CustomerID) so it can be reloaded later.
joblib.dump(final_features, "customer_features.pkl")

['customer_features.pkl']

In [11]:
# Pairwise cosine similarity between every pair of customer feature rows.
# similarity_df is the same matrix labelled with CustomerID on both axes.
similarity_matrix = cosine_similarity(final_features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=final_features.index,
    columns=final_features.index,
)

In [12]:
# Save the similarity scores for later reuse.
# NOTE(review): this writes `similarity_matrix` (the raw cosine_similarity
# result), not `similarity_df`, so the saved artifact carries no CustomerID
# labels; a consumer needs `final_features.index` to map rows/columns back
# to customers.
joblib.dump(similarity_matrix, "similarity_matrix.pkl")

['similarity_matrix.pkl']

In [13]:
# Function to get top N similar customers for each customer
def get_top_similar_customers(similarity_df, top_n=3):
    """Return the ``top_n`` most similar other customers for each row.

    Parameters:
        similarity_df (pd.DataFrame): Similarity scores with customer IDs as
            the row index and as the column labels. It may contain only a
            subset of rows (e.g. a single customer) as long as the columns
            cover the candidate customers.
        top_n (int): Number of lookalikes to keep per customer.

    Returns:
        dict: ``{customer_id: [(other_customer_id, score), ...]}`` with each
        list ordered from highest to lowest score.
    """
    lookalike_map = {}
    for customer_id in similarity_df.index:
        # Remove the customer's own column by label before ranking. Skipping
        # the first sorted entry instead is only correct when the self-score
        # sorts first, which ties or floating-point noise can break.
        candidate_scores = similarity_df.loc[customer_id].drop(labels=customer_id, errors='ignore')
        best_matches = candidate_scores.sort_values(ascending=False).iloc[:top_n]
        lookalike_map[customer_id] = list(zip(best_matches.index, best_matches.values))
    return lookalike_map

In [14]:
# Top 3 lookalikes for the first 20 customers in the feature matrix index
# (presumably C0001 to C0020 — confirm every one of them has transactions).
# Each customer is scored from its own single-row slice, so every value stored
# here is itself a one-entry dict keyed by that same customer ID.
first_20_customers = final_features.index[:20]
lookalike_map = {}
for cust_id in first_20_customers:
    customer_row = similarity_df.loc[[cust_id]]
    lookalike_map[cust_id] = get_top_similar_customers(customer_row, top_n=3)

In [15]:
# Convert the lookalike map to the required CSV format
lookalike_list = []
for cust_id, similar_customers in lookalike_map.items():
    lookalike_list.append({
        'CustomerID': cust_id,
        'Lookalikes': similar_customers[cust_id]
    })

lookalike_df = pd.DataFrame(lookalike_list)

In [16]:
# Write the lookalike table to CSV, omitting the DataFrame's row index.
# The 'Lookalikes' column holds Python lists of (CustomerID, score) tuples,
# so each cell is written out as that list's text representation.
lookalike_df.to_csv('Rakshit_Anand_Lookalike.csv', index=False)

# Confirm completion in the cell output
print("Rakshit_Anand_Lookalike.csv has been generated!")

Rakshit_Anand_Lookalike.csv has been generated!


In [17]:
# Real-Time Lookalike Function
def encode_region(region):
    """Build the one-hot region vector for a single region name.

    Produces one integer per entry of ``region_feature_names``, in that order:
    1 where the entry equals ``"Region_<region>"`` and 0 elsewhere. A region
    with no matching entry therefore yields an all-zero vector.
    """
    target = f"Region_{region}"
    return [int(target == feature_name) for feature_name in region_feature_names]

# Real-time function to find lookalikes
def get_lookalikes(region, signup_date, transactions):
    """
    Score an ad-hoc customer profile against every row of ``final_features``.

    Parameters:
        region (str): Region of the customer (e.g., "South America").
        signup_date (str): Signup date in "YYYY-MM-DD" format.
        transactions (dict): Dictionary of ProductID: TotalValue.

    Returns:
        dict: ``{"Top3_Lookalikes": [(customer_id, score), ...]}`` holding the
        three highest cosine-similarity matches, best first.
    """
    # Region one-hots, in the same order as region_feature_names
    region_part = encode_region(region)

    # Tenure in whole days, measured against the current wall-clock time
    signup_age = pd.Timestamp.now() - pd.to_datetime(signup_date)
    tenure_days = signup_age.days

    # Product-spend slots: every feature column except the region one-hots
    # and the single DaysSinceSignup column
    n_product_slots = final_features.shape[1] - len(region_feature_names) - 1
    spend_part = np.zeros(n_product_slots)
    known_products = interaction_matrix.columns
    for product_id, total_value in transactions.items():
        if product_id not in known_products:
            # Products absent from the training matrix have no slot; skip them
            continue
        spend_part[known_products.get_loc(product_id)] = total_value

    # Lay the pieces out in the same order as the columns of final_features
    profile = np.concatenate((spend_part, region_part, [tenure_days]))

    # One similarity score per existing customer
    scores = cosine_similarity([profile], final_features)[0]

    # Rank all customers by score, best first. Nothing is excluded here: the
    # query profile is not itself a row of final_features.
    ranked = sorted(
        zip(final_features.index, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return {"Top3_Lookalikes": ranked[:3]}

In [18]:
# Example real-time usage: the dict keys match get_lookalikes' parameter names,
# so the whole profile can be passed with keyword unpacking.
example_input = {
    "region": "South America",
    "signup_date": "2021-06-10",
    "transactions": {"P003": 150.0, "P007": 200.0, "P010": 50.0},
}

real_time_output = get_lookalikes(**example_input)

print("Real-Time Lookalike Output:", real_time_output)

Real-Time Lookalike Output: {'Top3_Lookalikes': [('C0060', 0.9787963513061018), ('C0097', 0.9301140317457236), ('C0083', 0.9089302020490774)]}
