In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
# Merge datasets for comprehensive analysis
merged_data = pd.merge(transactions, customers, on="CustomerID", how="inner")
merged_data = pd.merge(merged_data, products, on="ProductID", how="inner")
merged_data.rename(columns={'Price_x': 'Price_product', 'Price_y': 'Price_Transaction'}, inplace=True)

In [5]:
# Prepare data for lookalike model
customer_profiles = merged_data.groupby("CustomerID").agg({
    "Quantity": "sum",
    "TotalValue": "sum",
    "Price_product": "mean",
    "Category": lambda x: ' '.join(x),
    "Region": "first"
}).reset_index()

# Encode categorical data
customer_profiles = pd.get_dummies(customer_profiles, columns=["Region", "Category"], drop_first=True)


In [6]:
# Scale the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles.drop(columns=["CustomerID"]))

# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

In [7]:
# Generate lookalike recommendations for customers C0001 to C0020
customer_ids = customer_profiles["CustomerID"][:20]
lookalike_results = {}

for i, cust_id in enumerate(customer_ids):
    similar_indices = np.argsort(-similarity_matrix[i])[1:4]  # Get top 3 similar customers excluding itself
    similar_customers = customer_profiles.iloc[similar_indices]["CustomerID"].values
    similarity_scores = similarity_matrix[i][similar_indices]
    lookalike_results[cust_id] = list(zip(similar_customers, similarity_scores))

# Create Lookalike.csv
lookalike_df = pd.DataFrame({
    "CustomerID": lookalike_results.keys(),
    "Lookalikes": [str(v) for v in lookalike_results.values()]
})

In [9]:
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0151', 0.019358894447456913), ('C0150', 0...."
1,C0002,"[('C0133', 0.9655453160381162), ('C0097', 0.06..."
2,C0003,"[('C0151', 0.05697007319523585), ('C0077', 0.0..."
3,C0004,"[('C0165', 0.04265775527464802), ('C0082', 0.0..."
4,C0005,"[('C0197', 0.9696723410296022), ('C0078', 0.11..."


In [8]:
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
