In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the raw customer and transaction tables
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Collapse transactions to one row per customer: total spend and total units
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(TotalValue=('TotalValue', 'sum'), Quantity=('Quantity', 'sum'))
    .reset_index()
)

# Attach the per-customer totals to the customer table.
# NOTE: this is an inner join, so customers without any transactions
# do not appear in customer_data.
customer_data = customers.merge(transaction_summary, on='CustomerID')

# Feature matrix (only TotalValue and Quantity, for simplicity)
features = customer_data[['TotalValue', 'Quantity']]

# Standardize the features column by column
scaler = StandardScaler()
features_scaled = scaler.fit(features).transform(features)

# Pairwise cosine similarity between all customers (rows of features_scaled)
cosine_sim = cosine_similarity(features_scaled)

def get_top_n_similar(customer_id, cosine_sim, n=3, customers=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Value to look up in the ``CustomerID`` column.
        cosine_sim: Square similarity matrix; row/column ``i`` must
            correspond to the ``i``-th row (by position) of the customer
            frame.
        n: Maximum number of lookalikes to return.
        customers: DataFrame with a ``CustomerID`` column. When omitted,
            the module-level ``customer_data`` frame is used.

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples ordered from
        most to least similar. The queried customer is never included.

    Raises:
        IndexError: If ``customer_id`` is not present in the frame.
    """
    frame = customer_data if customers is None else customers
    ids = frame['CustomerID'].to_numpy()

    # Use the row *position*, not the index label: cosine_sim is a plain
    # array aligned with row order, so labels are only correct by accident
    # when the frame happens to carry a default RangeIndex.
    matches = np.flatnonzero(ids == customer_id)
    if matches.size == 0:
        raise IndexError(
            f'CustomerID {customer_id!r} not found in customer data'
        )
    pos = int(matches[0])

    # Drop the customer itself explicitly. Slicing off the first sorted
    # entry is wrong when another customer ties at the top score, because
    # the stable sort may place that customer first instead.
    candidates = [
        (j, score) for j, score in enumerate(cosine_sim[pos]) if j != pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [(ids[j], score) for j, score in candidates[:max(n, 0)]]

# Build the lookalike map for the first 20 customers (C0001 to C0020)
target_ids = [f'C{num:04d}' for num in range(1, 21)]
lookalike_map = {
    target_id: get_top_n_similar(target_id, cosine_sim)
    for target_id in target_ids
}

# Flatten the map into one row per (customer, lookalike, score) triple
lookalike_data = []
for source_id, matches in lookalike_map.items():
    lookalike_data.extend(
        [source_id, match_id, match_score] for match_id, match_score in matches
    )

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

# Persist the result without the DataFrame index
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the first few rows as a quick sanity check
print(lookalike_df.head())


  CustomerID LookalikeID  SimilarityScore
0      C0001       C0085         0.999999
1      C0001       C0042         0.999822
2      C0001       C0089         0.999785
3      C0002       C0157         0.999994
4      C0002       C0166         0.999875
