In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the three raw input tables (customers, products, transactions)
# from CSV files in the current working directory.
df_Customer, df_Products, df_Transaction = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Enrich every transaction row with the matching customer and product columns.
# Both joins are left joins, so no transaction row is dropped when a lookup
# key has no match in the customer or product table.
transactions_with_customers = pd.merge(
    df_Transaction, df_Customer, how="left", on="CustomerID"
)
merged_data = pd.merge(
    transactions_with_customers, df_Products, how="left", on="ProductID"
)

In [4]:
# Feature engineering: build one row of numeric features per customer.
value_by_customer = merged_data.groupby("CustomerID")["TotalValue"]

# (a) Sum of TotalValue over all of a customer's transactions.
customer_spending = value_by_customer.sum().rename("TotalSpending")

# (b) Mean TotalValue per transaction for each customer.
avg_transaction_value = value_by_customer.mean().rename("AvgTransactionValue")

# (c) Category preferences: number of transactions per product category
#     (a count table, one column per category).
category_pref = pd.crosstab(
    index=merged_data["CustomerID"],
    columns=merged_data["Category"],
)

# (d) Align the three pieces on CustomerID; gaps left by the alignment become 0.
feature_parts = [customer_spending, avg_transaction_value, category_pref]
customer_features = pd.concat(feature_parts, axis=1).fillna(0)

# Standardize each feature column (zero mean, unit variance) so that no single
# feature dominates the similarity calculation because of its scale.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity


# Pairwise cosine similarity between the rows of the scaled feature matrix.
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with the feature table's index so that scores can be looked
# up by customer id instead of by row position.
customer_ids = customer_features.index
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Function to get the top-N (default 3) most similar customers
def get_top_3_lookalikes(customer_id, similarity_df, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of the customer to find lookalikes for.
    similarity_df : pandas.DataFrame
        Similarity scores; the column for ``customer_id`` is read and its
        index labels are the candidate customers.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (label, score) tuples
        Sorted by descending score; never contains ``customer_id`` itself.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Remove the customer by label instead of slicing off the first sorted row:
    # with tied scores or floating-point error the customer's self-similarity
    # is not guaranteed to sort first, so a positional slice could return the
    # customer as their own lookalike and drop a genuine neighbour.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id, errors="ignore")
    best_matches = candidate_scores.sort_values(ascending=False).head(max(top_n, 0))
    return list(zip(best_matches.index, best_matches.values))

# Generate lookalikes for the first 20 customers listed in the customer table
lookalike_map = {}
for customer_id in df_Customer["CustomerID"][:20]:
    if customer_id in similarity_df.columns:
        lookalike_map[customer_id] = get_top_3_lookalikes(customer_id, similarity_df)
    else:
        # A customer with no transactions has no feature row and therefore no
        # similarity column; record an empty list instead of aborting the run.
        lookalike_map[customer_id] = []

# Save the result as Lookalike.csv: one row per customer, with the lookalikes
# stored as a list of {"cust_id", "score"} dicts.
lookalike_df = pd.DataFrame(
    [
        {"cust_id": key, "lookalikes": value}
        for key, value in lookalike_map.items()
    ],
    columns=["cust_id", "lookalikes"],
)
lookalike_df["lookalikes"] = lookalike_df["lookalikes"].apply(
    # Cast to a built-in float before rounding: round() on a NumPy scalar
    # returns a NumPy scalar, and under NumPy >= 2 its repr is
    # "np.float64(0.941)", which would be written verbatim into the CSV.
    lambda x: [{"cust_id": item[0], "score": round(float(item[1]), 3)} for item in x]
)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model complete. Results saved to Lookalike.csv.")


Lookalike model complete. Results saved to Lookalike.csv.


In [6]:
# Read the exported file back in and preview its first five rows as a check
# that the export was written.
Lookalike = pd.read_csv("Lookalike.csv")
Lookalike.head(5)

Unnamed: 0,cust_id,lookalikes
0,C0001,"[{'cust_id': 'C0069', 'score': 0.941}, {'cust_..."
1,C0002,"[{'cust_id': 'C0103', 'score': 0.921}, {'cust_..."
2,C0003,"[{'cust_id': 'C0166', 'score': 0.971}, {'cust_..."
3,C0004,"[{'cust_id': 'C0122', 'score': 0.881}, {'cust_..."
4,C0005,"[{'cust_id': 'C0197', 'score': 0.997}, {'cust_..."
