In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the three raw input tables (transactions, products, customers) from
# the notebook's working directory.
transactions = pd.read_csv("/content/Transactions.csv")
products = pd.read_csv("/content/Products.csv")
customers = pd.read_csv("/content/Customers.csv")

In [4]:
# Attach customer attributes to every transaction, then product attributes.
# Both joins are inner joins, so transactions without a matching customer or
# product row (and customers with no transactions) do not appear in `data`.
customer_transactions = transactions.merge(
    customers,
    how="inner",
    on="CustomerID",
)
data = customer_transactions.merge(
    products,
    how="inner",
    on="ProductID",
)

In [5]:
# Collapse the merged transaction rows into one profile row per customer:
#   - TotalValue: total and mean over the customer's rows
#   - ProductID:  number of (non-null) rows, i.e. purchase count
#   - Category:   most frequent category (first value returned by mode())
customer_profiles = (
    data
    .groupby('CustomerID')
    .agg({
        'TotalValue': ['sum', 'mean'],
        'ProductID': 'count',
        'Category': lambda categories: categories.mode().iloc[0],
    })
    .reset_index()
)

In [6]:
# Replace the aggregated column labels with flat, descriptive names.
# The assignment is positional, so the order here must follow the column
# order produced by the aggregation.
customer_profiles.columns = [
    'CustomerID',
    'TotalSpent',            # TotalValue -> sum
    'AvgTransactionValue',   # TotalValue -> mean
    'NumPurchases',          # ProductID  -> count
    'TopCategory',           # Category   -> most frequent value
]

In [7]:
# Display the per-customer profile table with its renamed columns.
customer_profiles

Unnamed: 0,CustomerID,TotalSpent,AvgTransactionValue,NumPurchases,TopCategory
0,C0001,3354.52,670.904000,5,Electronics
1,C0002,1862.74,465.685000,4,Clothing
2,C0003,2725.38,681.345000,4,Home Decor
3,C0004,5354.88,669.360000,8,Books
4,C0005,2034.24,678.080000,3,Electronics
...,...,...,...,...,...
194,C0196,4982.88,1245.720000,4,Home Decor
195,C0197,1928.65,642.883333,3,Electronics
196,C0198,931.83,465.915000,2,Clothing
197,C0199,1979.28,494.820000,4,Electronics


In [8]:
# One-hot encode TopCategory: the single categorical column is replaced by
# one indicator column per category value (prefixed "TopCategory_").
customer_profiles = pd.get_dummies(
    data=customer_profiles,
    columns=['TopCategory'],
)

In [9]:
# Standardize the three numeric profile columns so that they are on a
# comparable scale; the scaled values overwrite the original columns.
numeric_features = ['TotalSpent', 'AvgTransactionValue', 'NumPurchases']
scaler = StandardScaler()
customer_profiles[numeric_features] = scaler.fit_transform(
    customer_profiles[numeric_features]
)

In [10]:
# Pairwise cosine similarity between customers over every feature column
# (the identifier column is excluded). Row/column i of the result corresponds
# to the i-th row of customer_profiles.
similarity_matrix = cosine_similarity(
    customer_profiles.drop('CustomerID', axis=1)
)

In [11]:
# Build {customer_id: [(lookalike_id, score), ...]} with the 3 most similar
# *other* customers for every customer, scores rounded to 2 decimals.
lookalike_map = {}
# Positional list of IDs: row i of similarity_matrix belongs to customer_ids[i],
# independent of whatever index labels customer_profiles carries.
customer_ids = customer_profiles['CustomerID'].tolist()
for idx, customer_id in enumerate(customer_ids):
    # Remove the customer's own entry by position instead of assuming it sorts
    # first: another customer can tie (or, through floating-point error,
    # exceed) the self-similarity of 1.0, and because the sort is stable a
    # lower-index tie would push the customer itself into the top-3 slice.
    scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:3]
    # Convert to built-in floats so the values print as plain numbers
    # (e.g. 0.95) when the lists are later stringified, regardless of the
    # installed NumPy version's scalar repr.
    lookalike_map[customer_id] = [
        (customer_ids[other_idx], round(float(score), 2))
        for other_idx, score in scores
    ]

In [12]:
# Keep only the lookalike lists of the first 20 customers, taken in the row
# order of customer_profiles.
filtered_map = {
    cust_id: lookalike_map[cust_id]
    for cust_id in customer_profiles['CustomerID'].iloc[:20]
}

In [13]:
# Display the lookalike lists retained for the first 20 customers.
filtered_map

{'C0001': [('C0072', 0.95), ('C0190', 0.94), ('C0069', 0.91)],
 'C0002': [('C0029', 1.0), ('C0010', 1.0), ('C0009', 0.97)],
 'C0003': [('C0178', 1.0), ('C0052', 0.98), ('C0166', 0.96)],
 'C0004': [('C0021', 1.0), ('C0101', 1.0), ('C0075', 0.99)],
 'C0005': [('C0112', 1.0), ('C0197', 1.0), ('C0095', 0.98)],
 'C0006': [('C0117', 1.0), ('C0168', 0.98), ('C0185', 0.96)],
 'C0007': [('C0120', 0.99), ('C0140', 0.98), ('C0020', 0.94)],
 'C0008': [('C0113', 0.93), ('C0124', 0.9), ('C0109', 0.86)],
 'C0009': [('C0077', 1.0), ('C0083', 1.0), ('C0033', 0.98)],
 'C0010': [('C0029', 1.0), ('C0002', 1.0), ('C0009', 0.98)],
 'C0011': [('C0064', 0.97), ('C0137', 0.92), ('C0135', 0.87)],
 'C0012': [('C0104', 0.97), ('C0059', 0.95), ('C0065', 0.94)],
 'C0013': [('C0143', 1.0), ('C0099', 0.99), ('C0053', 0.97)],
 'C0014': [('C0128', 1.0), ('C0151', 1.0), ('C0097', 0.98)],
 'C0015': [('C0132', 0.98), ('C0036', 0.98), ('C0131', 0.98)],
 'C0016': [('C0183', 1.0), ('C0107', 0.97), ('C0149', 0.94)],
 'C0017':

In [15]:
# One output row per customer: the customer ID and its list of
# (lookalike_id, score) tuples; the list is written to the CSV as text.
lookalike_df = pd.DataFrame(
    [
        dict(cust_id=cust_id, lookalikes=matches)
        for cust_id, matches in filtered_map.items()
    ]
)
lookalike_df.to_csv("Tejeshwar_Kathiravan_Lookalike.csv", index=False)

# NOTE(review): the message says "Lookalike.csv" while the file written above
# is "Tejeshwar_Kathiravan_Lookalike.csv" — confirm which name is intended.
print("Lookalike.csv has been created successfully!")

Lookalike.csv has been created successfully!
