In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import defaultdict

customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Merge datasets to create a complete dataset
transactions = transactions.merge(customers, on='CustomerID', how='left')
transactions = transactions.merge(products, on='ProductID', how='left')

# Feature Engineering
customer_data = transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    most_frequent_product_category=('Category', lambda x: x.mode()[0])
).reset_index()

# Encoding categorical data
customer_data['Region'] = customer_data['CustomerID'].map(customers.set_index('CustomerID')['Region'])
customer_data['most_frequent_product_category'] = customer_data['most_frequent_product_category'].astype('category').cat.codes
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_region = encoder.fit_transform(customer_data[['Region']])
encoded_region_df = pd.DataFrame(encoded_region, columns=encoder.get_feature_names_out(['Region']))

customer_data = pd.concat([customer_data, encoded_region_df], axis=1)
customer_data = customer_data.drop('Region', axis=1)

# Normalize the numerical features
scaler = StandardScaler()
customer_data[['total_spent', 'transaction_count', 'avg_transaction_value']] = scaler.fit_transform(
    customer_data[['total_spent', 'transaction_count', 'avg_transaction_value']])

# Create feature vectors for each customer
customer_features = customer_data.drop('CustomerID', axis=1)

# Calculate similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(customer_features)

# Function to get top 3 lookalike customers for each customer
def get_lookalikes(customer_id, similarity_matrix, top_n=3):
    customer_index = customer_data[customer_data['CustomerID'] == customer_id].index[0]
    similarity_scores = list(enumerate(similarity_matrix[customer_index]))

    # Sort by similarity score, ignore self-comparison
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_lookalikes = sorted_scores[1:top_n+1]
    lookalike_dict = [(customer_data.iloc[i[0]]['CustomerID'], i[1]) for i in top_lookalikes]

    return lookalike_dict

# Generate Lookalike recommendations for the first 20 customers (C0001 - C0020)
lookalike_data = {}
for customer_id in customers['CustomerID'][:20]:
    lookalike_data[customer_id] = get_lookalikes(customer_id, similarity_matrix)
lookalike_df = pd.DataFrame([(cust_id, rec[0], rec[1]) for cust_id, recs in lookalike_data.items() for rec in recs],
                            columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save the results to 'Lookalike.csv'
lookalike_df.to_csv('/content/Lookalike.csv', index=False)
lookalike_df.head(20)


Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0152,0.989715
1,C0001,C0190,0.986583
2,C0001,C0048,0.975668
3,C0002,C0088,0.966357
4,C0002,C0134,0.941709
5,C0002,C0097,0.9342
6,C0003,C0052,0.996258
7,C0003,C0152,0.983552
8,C0003,C0001,0.975085
9,C0004,C0165,0.973831
