In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [12]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')


In [13]:
data = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(data, products, on='ProductID')


In [14]:
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [15]:
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0],
}).reset_index()

In [16]:
customer_features.rename(columns={'TotalValue': 'TotalSpending', 'TransactionID': 'Frequency', 'Category': 'TopCategory'}, inplace=True)



In [17]:
customer_features

Unnamed: 0,CustomerID,TotalSpending,Frequency,TopCategory
0,C0001,3354.52,5,Electronics
1,C0002,1862.74,4,Clothing
2,C0003,2725.38,4,Home Decor
3,C0004,5354.88,8,Books
4,C0005,2034.24,3,Electronics
...,...,...,...,...
194,C0196,4982.88,4,Home Decor
195,C0197,1928.65,3,Electronics
196,C0198,931.83,2,Clothing
197,C0199,1979.28,4,Electronics


In [18]:
customer_features = pd.get_dummies(customer_features, columns=['TopCategory'], drop_first=True)


In [20]:
customer_features.head()

Unnamed: 0,CustomerID,TotalSpending,Frequency,TopCategory_Clothing,TopCategory_Electronics,TopCategory_Home Decor
0,C0001,3354.52,5,False,True,False
1,C0002,1862.74,4,True,False,False
2,C0003,2725.38,4,False,False,True
3,C0004,5354.88,8,False,False,False
4,C0005,2034.24,3,False,True,False


In [19]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_features.drop(['CustomerID'], axis=1))


In [22]:
scaled_df = pd.DataFrame(scaled_features, columns=customer_features.columns[1:])
scaled_df['CustomerID'] = customer_features['CustomerID']


In [23]:
scaled_df.head()

Unnamed: 0,TotalSpending,Frequency,TopCategory_Clothing,TopCategory_Electronics,TopCategory_Home Decor,CustomerID
0,0.308942,0.4,0.0,1.0,0.0,C0001
1,0.168095,0.3,1.0,0.0,0.0,C0002
2,0.249541,0.3,0.0,0.0,1.0,C0003
3,0.497806,0.7,0.0,0.0,0.0,C0004
4,0.184287,0.2,0.0,1.0,0.0,C0005


In [24]:
similarity_matrix = cosine_similarity(scaled_features)


In [25]:
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])


In [27]:
similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.145106,0.163869,0.450725,0.979134,0.436015,0.982985,0.306891,0.090538,0.141813,...,0.450397,0.991129,0.449381,0.447818,0.238759,0.205499,0.978111,0.057344,0.990262,0.227259
C0002,0.145106,1.0,0.116239,0.323318,0.083018,0.298749,0.090016,0.226911,0.992088,0.999918,...,0.323817,0.108389,0.316356,0.325077,0.170685,0.138933,0.081624,0.978757,0.107221,0.970183
C0003,0.163869,0.116239,1.0,0.362482,0.095276,0.354524,0.10577,0.909265,0.072127,0.11345,...,0.362016,0.120218,0.363058,0.359083,0.982519,0.983385,0.093195,0.046214,0.118455,0.184214
C0004,0.450725,0.323318,0.362482,1.0,0.260335,0.955742,0.286272,0.686496,0.20293,0.316431,...,0.999881,0.333133,0.992046,0.996739,0.529239,0.448902,0.255185,0.126937,0.328787,0.499865
C0005,0.979134,0.083018,0.095276,0.260335,1.0,0.258543,0.998827,0.17402,0.051107,0.080872,...,0.259795,0.995719,0.262427,0.256817,0.138185,0.12275,0.999955,0.033289,0.995673,0.133769


In [26]:
def get_top_similar(customers_df, target_id, top_n=3):
    scores = customers_df.loc[target_id].sort_values(ascending=False)[1:top_n + 1]
    return [(idx, score) for idx, score in scores.items()]

In [28]:
lookalike_map = {}
for cust_id in customer_features['CustomerID'][:20]:
    lookalike_map[cust_id] = get_top_similar(similarity_df, cust_id)

In [29]:
lookalike_results = []
for cust_id, similar_list in lookalike_map.items():
    for sim_cust_id, score in similar_list:
        lookalike_results.append({'CustomerID': cust_id, 'SimilarCustomerID': sim_cust_id, 'Score': score})

In [30]:
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv('Lookalike.csv', index=False)