In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [35]:
cust=pd.read_csv("/Users/kadiresindhureddy/Downloads/Customers.csv")
prod=pd.read_csv("/Users/kadiresindhureddy/Downloads/Products.csv")
trans=pd.read_csv("/Users/kadiresindhureddy/Downloads/Transactions.csv")

In [37]:
# Convert date columns to datetime format
cust['SignupDate'] = pd.to_datetime(cust['SignupDate'])
trans['TransactionDate'] = pd.to_datetime(trans['TransactionDate'])


In [39]:
# Encode categorical variables (Region)
label_encoder = LabelEncoder()
cust['Region'] = label_encoder.fit_transform(cust['Region'])

In [41]:
# Extract year from signup date as a numeric feature
cust['SignupYear'] = cust['SignupDate'].dt.year

In [43]:
# Drop unnecessary columns
cust = cust.drop(columns=['SignupDate', 'CustomerName'])


In [45]:
# Create customer-product purchase matrix
customer_product_matrix = trans.pivot_table(
    index='CustomerID', 
    columns='ProductID', 
    values='Quantity', 
    aggfunc='sum', 
    fill_value=0
)

In [47]:
# Merge customer demographic features with the purchase matrix
features = cust.set_index('CustomerID').join(customer_product_matrix, how='left').fillna(0)

# Ensure all features are numeric
features_numeric = features.select_dtypes(include=[np.number])

# Standardize the feature values to normalize different scales
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_numeric)

In [49]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=features.index, columns=features.index)

In [51]:
# Generate top 3 lookalike recommendations for each customer
lookalike_map = {}
for customer_id in similarity_df.index:
    similar_customers = (
        similarity_df.loc[customer_id]
        .sort_values(ascending=False)[1:4]  # Exclude the customer itself
        .items()
    )
    lookalike_map[customer_id] = [(sim_id, round(score, 4)) for sim_id, score in similar_customers]


In [53]:
# Save lookalikes for the first 20 customers
lookalike_subset = {k: lookalike_map[k] for k in list(lookalike_map.keys())[:20]}

# Convert lookalike subset to a DataFrame for saving
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_subset.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

In [55]:
# Print results for the first 20 customers
print("Top 3 Lookalikes for the first 20 Customers:")
print(lookalike_df.head(20))

Top 3 Lookalikes for the first 20 Customers:
   cust_id                                         lookalikes
0    C0001  [(C0104, 0.3981), (C0194, 0.3768), (C0199, 0.3...
1    C0002  [(C0030, 0.376), (C0091, 0.3636), (C0173, 0.33...
2    C0003  [(C0181, 0.4821), (C0134, 0.4332), (C0144, 0.4...
3    C0004  [(C0070, 0.3527), (C0175, 0.3107), (C0132, 0.2...
4    C0005  [(C0023, 0.4778), (C0096, 0.4646), (C0119, 0.3...
5    C0006  [(C0040, 0.4289), (C0058, 0.3552), (C0178, 0.3...
6    C0007  [(C0079, 0.6116), (C0118, 0.4536), (C0020, 0.3...
7    C0008  [(C0144, 0.3346), (C0169, 0.2787), (C0091, 0.2...
8    C0009  [(C0140, 0.5469), (C0162, 0.506), (C0180, 0.46...
9    C0010  [(C0094, 0.5095), (C0092, 0.3897), (C0143, 0.3...
10   C0011  [(C0135, 0.5099), (C0120, 0.3756), (C0107, 0.3...
11   C0012  [(C0164, 0.4762), (C0158, 0.3815), (C0173, 0.3...
12   C0013  [(C0169, 0.4534), (C0092, 0.3533), (C0099, 0.3...
13   C0014  [(C0128, 0.8449), (C0159, 0.6801), (C0058, 0.4...
14   C0015  [(C0073, 0.71