In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three input tables from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Enrich every transaction row with its customer's and product's attributes.
# Left joins keep all transaction rows even when a key has no match.
# NOTE: this rebinds `transactions`, so re-running the cell without reloading
# the data merges the lookup tables in a second time.
transactions = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [4]:
# Per-customer view of the TotalValue column, shared by features 1 and 3.
value_by_customer = transactions.groupby('CustomerID')['TotalValue']

# 1. Total spending per customer (sum of TotalValue).
total_spending = value_by_customer.sum().rename('TotalSpending').reset_index()

# 2. Number of distinct transactions per customer.
transaction_count = (
    transactions
    .groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('TransactionCount')
    .reset_index()
)

# 3. Average transaction value per customer (mean of TotalValue).
avg_transaction_value = value_by_customer.mean().rename('AvgTransactionValue').reset_index()

# 4. Category preferences: one column per Category holding the summed Quantity,
#    with 0 for categories a customer never bought.
category_preferences = (
    transactions
    .groupby(['CustomerID', 'Category'])['Quantity']
    .sum()
    .unstack(fill_value=0)
    .reset_index()
)

# 5. Region as one-hot columns (Region_<value>). The result carries no
#    CustomerID column; its rows share the index of `customers`.
region_dummies = pd.get_dummies(customers['Region'], prefix='Region')



In [5]:
# Combine all features into a single dataframe with one row per customer.
# Start from the full customer list so customers without any transaction are
# kept (their aggregate columns are NaN until filled later).
customer_features = customers[['CustomerID']].copy()
for feature_frame in (total_spending, transaction_count,
                      avg_transaction_value, category_preferences):
    customer_features = customer_features.merge(feature_frame, on='CustomerID', how='left')

# region_dummies was derived from customers['Region'], so it shares the index
# of `customers`. Attach CustomerID through that shared index and then join on
# the key itself, instead of relying on the row index of the merged feature
# table happening to line up with the index of `customers`.
# Column order (Region_* first, then CustomerID, then the remaining features)
# is the same as an index-based merge would produce.
region_features = pd.concat([region_dummies, customers[['CustomerID']]], axis=1)
customer_features = region_features.merge(
    customer_features,
    on='CustomerID',
    how='inner',
    validate='one_to_one',  # fail loudly if CustomerID is duplicated on either side
)

In [6]:
# Replace every missing value with 0 (e.g. customers whose left joins found no
# matching aggregate rows).
customer_features = customer_features.fillna(value=0)

In [7]:
# Standardize every feature column; CustomerID is an identifier, not a
# feature, so it is removed before scaling.
feature_values = customer_features.drop(columns='CustomerID')
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(feature_values)

In [8]:
# Pairwise cosine similarity between all customers' scaled feature rows.
similarity_matrix = cosine_similarity(X=customer_features_scaled)

In [9]:
# Label both axes of the similarity matrix with CustomerID so scores can be
# looked up by customer rather than by position.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

In [10]:
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the `top_n` entries most similar to `customer_id`.

    Parameters
    ----------
    customer_id : label
        Column label of `similarity_df` to look up; the same label is removed
        from the result so a customer is never matched with itself.
    similarity_df : pandas.DataFrame
        Similarity scores; the column `customer_id` is read and its index
        labels identify the candidate matches.
    top_n : int, default 3
        Number of matches to return.

    Returns
    -------
    list of [label, score]
        Matches ordered from highest to lowest score.
    """
    # Rank every entry of the customer's column, best score first.
    ranked_scores = similarity_df[customer_id].sort_values(ascending=False)
    # Drop the self-match before taking the leading entries.
    neighbour_scores = ranked_scores.drop(index=customer_id)
    best_matches = neighbour_scores.iloc[:top_n]
    # reset_index() turns the labels into a column so each row is [label, score].
    return best_matches.reset_index().to_numpy().tolist()

In [11]:
# Build the lookalike map for the first 20 rows of the feature table
# (C0001 - C0020 when the customer file is in ID order).
lookalike_map = {}
first_twenty_ids = customer_features['CustomerID'].iloc[:20]
for customer_id in first_twenty_ids:
    lookalike_map[customer_id] = get_top_similar_customers(customer_id, similarity_df)

In [12]:
# Flatten the map into (CustomerID, Lookalikes) rows and write them to CSV
# without the row index.
lookalike_rows = list(lookalike_map.items())
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Shivaraj_Kottalagi_Lookalike.csv', index=False)

In [13]:
# Print a heading followed by the plain-text rendering of the results table.
print("Lookalike Recommendations:", lookalike_df, sep="\n")

Lookalike Recommendations:
   CustomerID                                         Lookalikes
0       C0001  [[C0120, 0.860193858504215], [C0181, 0.8247506...
1       C0002  [[C0178, 0.9328767066039853], [C0159, 0.929374...
2       C0003  [[C0031, 0.8723876657395959], [C0133, 0.863771...
3       C0004  [[C0012, 0.9331206487293933], [C0113, 0.918595...
4       C0005  [[C0007, 0.9539797213085559], [C0140, 0.913135...
5       C0006  [[C0187, 0.8712322215405164], [C0158, 0.770777...
6       C0007  [[C0005, 0.9539797213085559], [C0140, 0.934472...
7       C0008  [[C0109, 0.8346420065150637], [C0194, 0.816878...
8       C0009  [[C0198, 0.971648593685618], [C0060, 0.9434913...
9       C0010  [[C0111, 0.9052530518400612], [C0062, 0.822599...
10      C0011  [[C0107, 0.9118210727341978], [C0190, 0.850183...
11      C0012  [[C0004, 0.9331206487293933], [C0113, 0.927538...
12      C0013  [[C0099, 0.9388776662608045], [C0188, 0.873803...
13      C0014  [[C0060, 0.9802150292925259], [C0198, 0.951988..