In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Read the three input tables from CSV files in the working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Show the first rows of each table under a printed heading.
for heading, frame in (
    ("Customers Dataset:", customers_df),
    ("Products Dataset:", products_df),
    ("Transactions Dataset:", transactions_df),
):
    print(heading)
    display(frame.head())


Customers Dataset:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


Products Dataset:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


Transactions Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [10]:
# Attach product attributes to every transaction. Both tables carry a `Price`
# column, so pandas suffixes them as `Price_x` (transactions) / `Price_y` (products).
transactions_products = transactions_df.merge(products_df, how='left', on='ProductID')

# Attach customer attributes; left joins keep every transaction row even when
# the product or customer lookup finds no match.
full_data = transactions_products.merge(customers_df, how='left', on='CustomerID')

print("Merged Dataset:")
display(full_data.head())


Merged Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [11]:

# Build one profile row per customer from the merged transaction table.
#
# Named aggregation yields the final column names directly, so no follow-up
# in-place rename is needed and re-running the cell is idempotent.
customer_profiles = (
    full_data
    .groupby('CustomerID')
    .agg(
        # Sum of TotalValue over all of the customer's transactions.
        TotalSpending=('TotalValue', 'sum'),
        # Number of transaction rows with a ProductID (repeat purchases of the
        # same product are counted each time; this is not a distinct count).
        ProductCount=('ProductID', 'count'),
        # Comma-separated category of every purchase, duplicates kept.
        # `full_data` comes from left joins, so Category is NaN for a
        # ProductID missing from the product table; ','.join would raise
        # TypeError on that float NaN, hence the dropna().
        PurchasedCategories=('Category', lambda x: ','.join(x.dropna().astype(str))),
    )
    .reset_index()
)

print("Customer Profiles:")
display(customer_profiles.head())


Customer Profiles:


Unnamed: 0,CustomerID,TotalSpending,ProductCount,PurchasedCategories
0,C0001,3354.52,5,"Books,Home Decor,Electronics,Electronics,Elect..."
1,C0002,1862.74,4,"Home Decor,Home Decor,Clothing,Clothing"
2,C0003,2725.38,4,"Home Decor,Home Decor,Clothing,Electronics"
3,C0004,5354.88,8,"Books,Home Decor,Home Decor,Home Decor,Books,B..."
4,C0005,2034.24,3,"Home Decor,Electronics,Electronics"


In [12]:
# Turn the comma-separated category string into one 0/1 indicator column per
# category (presence only; how often a category was bought is not encoded).
categories = customer_profiles['PurchasedCategories'].str.get_dummies(sep=',')
encoded_profiles = pd.concat(
    [customer_profiles[['CustomerID', 'TotalSpending', 'ProductCount']], categories],
    axis=1,
)

# Rescale the two numeric columns to [0, 1] so they are on the same scale as
# the 0/1 category indicators. MinMaxScaler is imported in the notebook's
# top import cell rather than mid-notebook.
numeric_columns = ['TotalSpending', 'ProductCount']
scaler = MinMaxScaler()
encoded_profiles[numeric_columns] = scaler.fit_transform(encoded_profiles[numeric_columns])

print("Encoded Customer Profiles:")
display(encoded_profiles.head())


Encoded Customer Profiles:


Unnamed: 0,CustomerID,TotalSpending,ProductCount,Books,Clothing,Electronics,Home Decor
0,C0001,0.308942,0.4,1,0,1,1
1,C0002,0.168095,0.3,0,1,0,1
2,C0003,0.249541,0.3,0,1,1,1
3,C0004,0.497806,0.7,1,0,1,1
4,C0005,0.184287,0.2,0,0,1,1


In [13]:
# Every column except the identifier is used as a feature.
features = encoded_profiles.drop(columns='CustomerID')

# Pairwise cosine similarity between all customer feature rows, labelled on
# both axes with the customer IDs so scores can be looked up by ID.
customer_ids = encoded_profiles['CustomerID']
similarity_matrix = cosine_similarity(features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

print("Similarity Matrix:")
display(similarity_df.head())

Similarity Matrix:


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.44628,0.685854,0.984374,0.822405,0.689659,0.82424,0.866015,0.427682,0.445132,...,0.832157,0.682837,0.618585,0.876558,0.697026,0.689991,0.821928,0.415587,0.827637,0.875476
C0002,0.44628,1.0,0.825041,0.459757,0.520507,0.822492,0.521922,0.720363,0.515389,0.527376,...,0.076404,0.435911,0.086811,0.726541,0.825818,0.819407,0.520156,0.504898,0.52831,0.722436
C0003,0.685854,0.825041,1.0,0.68002,0.823652,0.684201,0.824389,0.851389,0.818724,0.437184,...,0.447758,0.680808,0.082901,0.869189,0.992195,0.68338,0.823389,0.813117,0.825359,0.871814
C0004,0.984374,0.459757,0.68002,1.0,0.801557,0.690701,0.80657,0.888209,0.42608,0.45781,...,0.836103,0.673972,0.645393,0.878481,0.712449,0.694407,0.800465,0.40429,0.816355,0.867446
C0005,0.822405,0.520507,0.823652,0.801557,1.0,0.436525,0.999377,0.701968,0.511759,0.042244,...,0.525103,0.429953,0.071321,0.716489,0.818012,0.437503,0.999976,0.506006,0.997676,0.719179


In [14]:
def get_top_n_similar(customer_id, n=3):
    """Return the `n` customers most similar to `customer_id`.

    Scores are read from the `customer_id` column of the module-level
    `similarity_df`.

    Args:
        customer_id: Column label in `similarity_df` to look up.
        n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of `(other_customer_id, score)` tuples ordered from highest to
        lowest score. The customer itself is never included. Fewer than `n`
        tuples are returned when fewer other customers exist.

    Raises:
        KeyError: If `customer_id` is not a column of `similarity_df`.
    """
    scores = similarity_df[customer_id]
    # Exclude the customer's own entry by label. Skipping "the first row after
    # sorting" is unsafe: floating-point rounding can leave the self-score
    # slightly below 1.0, and an identical profile ties at the top, so the
    # self entry is not guaranteed to sort first.
    other_scores = scores.drop(labels=customer_id, errors='ignore')
    return list(other_scores.nlargest(n).items())

# Map each of the first 20 customer IDs to its default (top-3) lookalike list.
recommendations = {
    cid: get_top_n_similar(cid)
    for cid in encoded_profiles['CustomerID'].head(20)
}

print("Top 3 Recommendations for First 20 Customers:")
for cust_id, recs in recommendations.items():
    print(f"{cust_id}: {recs}")


Top 3 Recommendations for First 20 Customers:
C0001: [('C0152', 0.9999986954105654), ('C0174', 0.9997133867664305), ('C0064', 0.9993841135815794)]
C0002: [('C0133', 0.9978729065754757), ('C0062', 0.9975929318317053), ('C0159', 0.9975742711692481)]
C0003: [('C0166', 0.9998573951675882), ('C0031', 0.999274487916083), ('C0026', 0.9990724512457297)]
C0004: [('C0012', 0.998803808464272), ('C0047', 0.9918475886529601), ('C0065', 0.9917533987114977)]
C0005: [('C0197', 0.9999763907119462), ('C0007', 0.9993770871840076), ('C0199', 0.9976759733900089)]
C0006: [('C0079', 0.9999992468255506), ('C0196', 0.9992664958948604), ('C0135', 0.9984028585872425)]
C0007: [('C0005', 0.9993770871840076), ('C0197', 0.9991109908572395), ('C0199', 0.9968864218468423)]
C0008: [('C0109', 0.998146520324587), ('C0093', 0.9980731947487109), ('C0147', 0.9976960097360833)]
C0009: [('C0198', 0.9975618866462608), ('C0092', 0.9885271801816342), ('C0066', 0.9825358351679687)]
C0010: [('C0142', 0.9994375734691648), ('C0132',

In [15]:
# One row per customer; the lookalike list is stored as its Python string
# representation in a single `Lookalikes` column.
output_data = [
    {'CustomerID': cust_id, 'Lookalikes': str(recs)}
    for cust_id, recs in recommendations.items()
]

lookalike_df = pd.DataFrame(output_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv saved successfully!")


Lookalike.csv saved successfully!


In [16]:
customer_id = 'C0001'

# Feature row(s) for the chosen customer, selected with a boolean mask.
# NOTE(review): customer_vector is not read again in the cells visible here.
is_target_customer = encoded_profiles['CustomerID'] == customer_id
customer_vector = features.loc[is_target_customer].values

similar_customers = get_top_n_similar(customer_id, n=3)
print(f"Customer {customer_id} Top Recommendations:")
for cust, score in similar_customers:
    print(f"Customer: {cust}, Similarity Score: {score}")

# Show the raw (unscaled) profiles of the lookalikes; rows appear in
# customer_profiles order, not in similarity order.
lookalike_ids = [cust for cust, _ in similar_customers]
similar_profiles = customer_profiles[customer_profiles['CustomerID'].isin(lookalike_ids)]
display(similar_profiles)


Customer C0001 Top Recommendations:
Customer: C0152, Similarity Score: 0.9999986954105654
Customer: C0174, Similarity Score: 0.9997133867664305
Customer: C0064, Similarity Score: 0.9993841135815794


Unnamed: 0,CustomerID,TotalSpending,ProductCount,PurchasedCategories
63,C0064,4039.77,5,"Books,Books,Home Decor,Electronics,Books"
151,C0152,3385.86,5,"Home Decor,Books,Electronics,Home Decor,Home D..."
173,C0174,2891.95,5,"Home Decor,Books,Books,Books,Electronics"
