In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Number of lookalikes reported for each customer.
TOP_N = 3


def find_top_lookalikes(similarity_df, customer_ids, top_n=TOP_N):
    """Return the ``top_n`` most similar other customers for each requested ID.

    Args:
        similarity_df: Square DataFrame of pairwise similarity scores whose
            index and columns are both customer IDs.
        customer_ids: Iterable of customer IDs to produce lookalikes for.
            IDs that are not a column of ``similarity_df`` are skipped.
        top_n: Maximum number of lookalikes to return per customer.

    Returns:
        A dict mapping each found customer ID to a list of
        ``(similar_customer_id, score)`` tuples, ordered from most to least
        similar, with scores rounded to 4 decimal places.
    """
    results = {}
    for customer in customer_ids:
        if customer not in similarity_df.columns:
            continue
        # Remove the customer's own entry by label. Relying on "self sorts
        # first" breaks when another customer has an identical profile
        # (similarity 1.0 as well), because ties may be ordered arbitrarily.
        scores = similarity_df[customer].drop(labels=customer, errors="ignore")
        results[customer] = [
            (other, round(float(score), 4))
            for other, score in scores.nlargest(top_n).items()
        ]
    return results


if __name__ == "__main__":
    # Load datasets
    customers = pd.read_csv("Customers.csv")
    products = pd.read_csv("Products.csv")
    transactions = pd.read_csv("Transactions.csv")

    # Merge datasets (inner joins on the shared ID columns)
    merged_data = transactions.merge(customers, on="CustomerID").merge(
        products, on="ProductID"
    )

    # Feature engineering: customer x product matrix of total quantity bought
    customer_product_matrix = merged_data.pivot_table(
        index="CustomerID",
        columns="ProductName",
        values="Quantity",
        aggfunc="sum",
        fill_value=0,
    )

    # Standardize each product column before comparing customers
    scaler = StandardScaler()
    customer_product_matrix_scaled = scaler.fit_transform(customer_product_matrix)

    # Pairwise cosine similarity between customer rows
    similarity_matrix = cosine_similarity(customer_product_matrix_scaled)
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=customer_product_matrix.index,
        columns=customer_product_matrix.index,
    )

    # Find top 3 lookalikes for customers C0001 - C0020. The IDs are built
    # explicitly: slicing the first 20 rows of the matrix would silently pull
    # in C0021+ whenever one of the intended customers has no transactions.
    target_customers = [f"C{number:04d}" for number in range(1, 21)]
    lookalike_results = find_top_lookalikes(similarity_df, target_customers)

    # Convert results to a long-format DataFrame (one row per lookalike pair)
    lookalike_df = pd.DataFrame(
        [
            (cust, sim_cust, score)
            for cust, values in lookalike_results.items()
            for sim_cust, score in values
        ],
        columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
    )

    # Save results to CSV
    lookalike_df.to_csv("Shreya_Naik_Lookalike.csv", index=False)

    # Display example results
    print(lookalike_df.head())

# Explanation:
# 1. Transactions.csv is joined with Customers.csv (on CustomerID) and Products.csv (on ProductID) to create a unified dataset.
# 2. A customer-product matrix is created where rows represent customers, columns represent product names, and each cell holds the total quantity purchased (0 if never bought).
# 3. Each product column is standardized, and cosine similarity between customer rows is then used to find similar customers based on their purchase history.
# 4. The top 3 similar customers for each of the first 20 customers are extracted and saved to Shreya_Naik_Lookalike.csv in the required format.


  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0050           0.5103
1      C0001             C0121           0.4593
2      C0001             C0199           0.4331
3      C0002             C0030           0.6736
4      C0002             C0173           0.4377
