In [2]:
# Re-importing necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [5]:

# Input CSV file names (relative paths), bound in one place so the load
# step only refers to these three variables.
customers_path, products_path, transactions_path = (
    "Customers.csv",
    "Products.csv",
    "Transactions.csv",
)


In [6]:
# Read the three input tables into DataFrames, in the same order as before:
# customers, then products, then transactions.
customers, products, transactions = map(
    pd.read_csv,
    (customers_path, products_path, transactions_path),
)

In [7]:

# Parse the two date columns to pandas datetimes; each column is overwritten
# on its own frame.
customers["SignupDate"] = customers["SignupDate"].pipe(pd.to_datetime)
transactions["TransactionDate"] = transactions["TransactionDate"].pipe(pd.to_datetime)

# Attach the customer columns to every transaction row. how="inner" is the
# pandas default, spelled out here: only CustomerIDs present in both frames
# are kept.
customer_transactions = pd.merge(transactions, customers, on="CustomerID", how="inner")



In [8]:
# Collapse the transaction rows to one row per customer. Each output column
# is described as (source column, aggregation):
#   total_spent     - sum of TotalValue
#   avg_quantity    - mean of Quantity
#   unique_products - number of distinct ProductID values
customer_features = (
    customer_transactions
    .groupby("CustomerID")
    .agg(**{
        "total_spent": ("TotalValue", "sum"),
        "avg_quantity": ("Quantity", "mean"),
        "unique_products": ("ProductID", "nunique"),
    })
    .reset_index()  # bring CustomerID back as a regular column
)



In [9]:
# Enrich the per-customer aggregates with the customer's Region, then one-hot
# encode Region. drop_first=True drops the first category's indicator column.
customer_features = (
    customer_features
    .merge(customers.loc[:, ["CustomerID", "Region"]], on="CustomerID")
    .pipe(pd.get_dummies, columns=["Region"], drop_first=True)
)

# Standardize every feature column; CustomerID is an identifier, not a
# feature, so it is excluded from the matrix handed to the scaler.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features.drop("CustomerID", axis=1))


In [10]:

# Compute cosine similarity between every pair of rows in features_scaled.
# Row/column k of the result corresponds to row k of customer_features.
cosine_sim = cosine_similarity(features_scaled)

# Create Lookalike recommendations: customer ID -> list of
# (lookalike customer ID, similarity rounded to 4 decimals), best match first.
lookalikes = {}
customer_ids = customer_features["CustomerID"].tolist()

for i in range(20):  # For customers C0001 to C0020
    cust_id = f"C{str(i+1).zfill(4)}"
    if cust_id in customer_ids:  # IDs absent from the feature table are skipped
        cust_idx = customer_ids.index(cust_id)
        # Exclude the customer's own row by index rather than by rank position.
        # Slicing the sorted list with [1:4] assumed self always sorts first;
        # a customer with identical features (a tie at the top) or
        # floating-point noise can put someone else ahead, in which case the
        # customer was returned as their own lookalike and a genuine match
        # was dropped.
        sim_scores = [
            (idx, score)
            for idx, score in enumerate(cosine_sim[cust_idx])
            if idx != cust_idx
        ]
        # Highest similarity first; keep the top 3 other customers.
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:3]
        similar_customers = [(customer_ids[x[0]], round(x[1], 4)) for x in sim_scores]
        lookalikes[cust_id] = similar_customers



In [11]:
# Prepare DataFrame for Lookalike.csv: flatten the {customer: [(lookalike, score), ...]}
# mapping into one [customer, lookalike, score] row per pair.
lookalike_data = [
    [cust_id, similar_cust, score]
    for cust_id, similar_list in lookalikes.items()
    for similar_cust, score in similar_list
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
)


In [13]:
# Save results to CSV.
# `lookalike_df` is already assembled from `lookalikes` by the preceding cell,
# so it is written out directly here instead of being rebuilt by a verbatim
# copy of that cell's code.
lookalike_csv_path = "Vignesh_Elgeti_Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)  # index=False: do not write the row index

# Return path of generated file
lookalike_csv_path


'Vignesh_Elgeti_Lookalike.csv'