In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the three raw input tables from CSV files in the working directory,
# in the same order as the names on the left-hand side.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [5]:
# Parse the date columns into pandas datetimes, overwriting the original
# columns in place.
customers['SignupDate'] = customers['SignupDate'].pipe(pd.to_datetime)
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)

In [6]:
# Attach customer and product attributes to every transaction (default inner
# joins on the shared id columns). pandas suffixes clashing non-key columns
# with _x / _y; presumably both Transactions and Products carry a Price
# column -- only the transaction-side one is kept, under the name 'Price'.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
    .rename(columns={'Price_x': 'Transaction_Price', 'Price_y': 'Product_Price'})
    .drop(columns=['Product_Price'])
    .rename(columns={'Transaction_Price': 'Price'})
)

# Summary statistics of the numeric transaction fields as a sanity check.
print(merged_data.loc[:, ['Price', 'Quantity', 'TotalValue']].describe())

            Price     Quantity   TotalValue
count  1000.00000  1000.000000  1000.000000
mean    272.55407     2.537000   689.995560
std     140.73639     1.117981   493.144478
min      16.08000     1.000000    16.080000
25%     147.95000     2.000000   295.295000
50%     299.93000     3.000000   588.880000
75%     404.40000     4.000000  1011.660000
max     497.76000     4.000000  1991.040000


In [None]:
# Build a customer x product purchase matrix: one row per CustomerID, one
# column per ProductID, each cell holding the summed Quantity bought
# (0 where the customer never bought that product).
customer_product_matrix = merged_data.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Standardize each product column (StandardScaler removes the column mean and
# scales to unit variance) before measuring similarity.
scaler = StandardScaler()
normalized_matrix = scaler.fit_transform(customer_product_matrix)

# Pairwise cosine similarity between customer rows.
# `cosine_similarity` lives in sklearn.metrics.pairwise and has to be imported
# in the notebook's import cell; without that import this cell raises
# NameError on a fresh kernel.
similarity_matrix = cosine_similarity(normalized_matrix)

# Re-attach the CustomerID labels so scores can be looked up by customer id
# on both axes.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

def get_top_3_similar(customers_similarity, customer_id):
    """Return the three customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_similarity : pandas.DataFrame
        Similarity scores. The column labelled ``customer_id`` is read; its
        index labels are the candidate customer ids.
    customer_id : hashable
        Column label of the customer to find lookalikes for.

    Returns
    -------
    list of tuple
        Up to three ``(customer, score)`` pairs, highest score first, with
        scores rounded to 2 decimals. ``customer_id`` itself is never part of
        the result.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``customers_similarity``.
    """
    scores = customers_similarity[customer_id]
    # Exclude the customer by label instead of skipping the first sorted
    # entry: with tied scores (e.g. an identical purchase vector) or a
    # self-score that is not the maximum, the customer is not guaranteed to
    # be sorted into position 0.
    candidates = scores.drop(labels=customer_id, errors='ignore')
    # A stable sort keeps the ordering of tied scores deterministic.
    top_matches = candidates.sort_values(ascending=False, kind='mergesort').iloc[:3]
    return [(cust, round(score, 2)) for cust, score in top_matches.items()]

# Generate lookalike recommendations for the first 20 customers listed in the
# customers table.
lookalike_map = {}
for customer_id in customers['CustomerID'].iloc[:20]:
    # similarity_df only has columns for customers that appear in the purchase
    # matrix; a customer without any transaction has no column, and looking it
    # up would raise KeyError. Such customers get an empty list.
    if customer_id in similarity_df.columns:
        lookalike_map[customer_id] = get_top_3_similar(similarity_df, customer_id)
    else:
        lookalike_map[customer_id] = []

# Flatten the map into one record per (customer, lookalike) pair.
lookalike_output = [
    {'cust_id': cust_id, 'similar_cust_id': similar_cust_id, 'score': score}
    for cust_id, similar_customers in lookalike_map.items()
    for similar_cust_id, score in similar_customers
]

# Explicit columns keep the CSV header intact even when there are no records.
lookalike_df = pd.DataFrame(
    lookalike_output,
    columns=['cust_id', 'similar_cust_id', 'score'],
)

# Save the lookalike pairs to a CSV file.
lookalike_df.to_csv('Lookalike.csv', index=False)

print(lookalike_df.head())