In [None]:
%pip install pandas numpy scikit-learn

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets to form a comprehensive, transaction-level dataset.
# Left joins keep every transaction even when its customer or product is
# missing from the lookup tables (the looked-up columns are then NaN).
#
# Suffixes are set explicitly: by default pandas renames BOTH copies of any
# shared non-key column to `<name>_x` / `<name>_y`, so the plain column name
# vanishes and the per-customer aggregation on that name fails with KeyError.
# With an empty left suffix the transaction-side column keeps its name and
# only the lookup-side copy is renamed.
# NOTE(review): whether these CSVs actually share a non-key column (e.g. a
# price column in both Transactions and Products) is not visible here - confirm.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left", suffixes=("", "_customer"))
    .merge(products, on="ProductID", how="left", suffixes=("", "_product"))
)

# Display the merged data structure
print("Merged Data:")
print(merged_data.head())

In [None]:
# Aggregate transaction and customer-level features


def _most_common(values):
    """Return the most frequent non-null value in `values`, or "Unknown" if there is none."""
    # mode() is evaluated once; it ignores NaN and returns its results sorted,
    # so a tie resolves to the smallest value.
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else "Unknown"


customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",       # Total spending
    "Quantity": "sum",         # Total quantity purchased
    "Price": "mean",           # Average price over the customer's transactions
    "Category": _most_common,  # Most purchased category
    "Region": "first"          # First non-null region in the group (presumably constant per customer)
}).reset_index()

# One-hot encode categorical columns.
# Every level is kept (no drop_first): dropping a level is a regression trick
# against collinearity, but for cosine similarity it would turn the dropped
# category/region into an all-zero encoding, so two customers sharing it would
# get no similarity credit while every other shared level would.
# dtype=float keeps the frame homogeneous instead of mixing bool and float columns.
customer_features = pd.get_dummies(
    customer_features,
    columns=["Category", "Region"],
    dtype=float,
)

# Standardize numerical features so spend, quantity and price contribute on a
# comparable scale (zero mean, unit variance)
scaler = StandardScaler()
numerical_cols = ["TotalValue", "Quantity", "Price"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Display the prepared features
print("Customer Features (after processing):")
print(customer_features.head())

In [None]:
# Pairwise cosine similarity between customers.
# Moving CustomerID into the index leaves only feature columns for the metric
# and provides the labels for both axes of the result.
features_by_customer = customer_features.set_index("CustomerID")
similarity_matrix = cosine_similarity(features_by_customer)

# Wrap the raw array in a DataFrame so rows and columns can be looked up by CustomerID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=features_by_customer.index,
    columns=features_by_customer.index,
)

# Preview the similarity matrix (for reference)
print("Similarity Matrix:")
print(similarity_df.head())

In [None]:
# Number of lookalikes reported per target customer
TOP_N = 3

# Get the first 20 CustomerIDs (C0001 .. C0020) that exist in the customer table
target_ids = [f"C{i:04d}" for i in range(1, 21)]
target_customers = customers.loc[customers["CustomerID"].isin(target_ids), "CustomerID"]

# Generate lookalikes for each target customer
lookalike_map = {}
skipped_customers = []
for customer in target_customers:
    # The similarity matrix is built from transaction data, so a customer with
    # no transactions has no column in it; skip instead of raising KeyError.
    if customer not in similarity_df.columns:
        skipped_customers.append(customer)
        continue

    # Remove the customer's own entry by label rather than assuming it sorts
    # first: another customer can tie at (or, through rounding, exceed) the
    # self-similarity, in which case positional slicing would report the
    # customer as its own lookalike and drop a real neighbour.
    scores = similarity_df[customer].drop(labels=customer)
    # Stable sort keeps tied scores in their original order, making the output deterministic
    similar_customers = scores.sort_values(ascending=False, kind="stable").head(TOP_N)
    lookalike_map[customer] = list(zip(similar_customers.index, similar_customers.values))

if skipped_customers:
    print(f"Skipped customers with no transaction-based features: {skipped_customers}")

# Convert lookalike map to a long-format DataFrame: one row per (customer, lookalike) pair
lookalike_data = [
    {"cust_id": cust_id, "similar_cust_id": similar_cust, "score": score}
    for cust_id, lookalikes in lookalike_map.items()
    for similar_cust, score in lookalikes
]

# Explicit columns keep the CSV header stable even if no rows were produced
lookalike_df = pd.DataFrame(lookalike_data, columns=["cust_id", "similar_cust_id", "score"])

# Save the result to Lookalike.csv
lookalike_df.to_csv("Lookalike.csv", index=False)

# Display the Lookalike DataFrame
print("Lookalike Data:")
print(lookalike_df.head())