In [1]:
import pandas as pd

### Load datasets

In [2]:
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

### Prepare data for similarity calculation

In [4]:
customer_profiles = transactions.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    TotalQuantity=('Quantity', 'sum'),
    NumTransactions=('TransactionID', 'count')
).reset_index()

customer_profiles

Unnamed: 0,CustomerID,TotalSpent,TotalQuantity,NumTransactions
0,C0001,3354.52,12,5
1,C0002,1862.74,10,4
2,C0003,2725.38,14,4
3,C0004,5354.88,23,8
4,C0005,2034.24,7,3
...,...,...,...,...
194,C0196,4982.88,12,4
195,C0197,1928.65,9,3
196,C0198,931.83,3,2
197,C0199,1979.28,9,4


### Merge with customer demographics

In [5]:
customer_profiles = customer_profiles.merge(customers, on="CustomerID", how="left")

customer_profiles

Unnamed: 0,CustomerID,TotalSpent,TotalQuantity,NumTransactions,CustomerName,Region,SignupDate
0,C0001,3354.52,12,5,Lawrence Carroll,South America,2022-07-10
1,C0002,1862.74,10,4,Elizabeth Lutz,Asia,2022-02-13
2,C0003,2725.38,14,4,Michael Rivera,South America,2024-03-07
3,C0004,5354.88,23,8,Kathleen Rodriguez,South America,2022-10-09
4,C0005,2034.24,7,3,Laura Weber,Asia,2022-08-15
...,...,...,...,...,...,...,...
194,C0196,4982.88,12,4,Laura Watts,Europe,2022-06-07
195,C0197,1928.65,9,3,Christina Harvey,Europe,2023-03-21
196,C0198,931.83,3,2,Rebecca Ray,Europe,2022-02-27
197,C0199,1979.28,9,4,Andrea Jenkins,Europe,2022-12-03


### One-hot encode categorical data

In [6]:
customer_profiles = pd.get_dummies(customer_profiles, columns=["Region"], drop_first=True)
customer_profiles

Unnamed: 0,CustomerID,TotalSpent,TotalQuantity,NumTransactions,CustomerName,SignupDate,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,12,5,Lawrence Carroll,2022-07-10,False,False,True
1,C0002,1862.74,10,4,Elizabeth Lutz,2022-02-13,False,False,False
2,C0003,2725.38,14,4,Michael Rivera,2024-03-07,False,False,True
3,C0004,5354.88,23,8,Kathleen Rodriguez,2022-10-09,False,False,True
4,C0005,2034.24,7,3,Laura Weber,2022-08-15,False,False,False
...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,12,4,Laura Watts,2022-06-07,True,False,False
195,C0197,1928.65,9,3,Christina Harvey,2023-03-21,True,False,False
196,C0198,931.83,3,2,Rebecca Ray,2022-02-27,True,False,False
197,C0199,1979.28,9,4,Andrea Jenkins,2022-12-03,True,False,False


### Scale the data

In [7]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_profiles.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']))

scaled_data

array([[-0.06170143, -0.12203296, -0.01145819, -0.57928445, -0.54831888,
         1.54041597],
       [-0.87774353, -0.44800021, -0.46749414, -0.57928445, -0.54831888,
        -0.6491753 ],
       [-0.40585722,  0.20393428, -0.46749414, -0.57928445, -0.54831888,
         1.54041597],
       ...,
       [-1.38697529, -1.58888557, -1.37956603,  1.72626765, -0.54831888,
        -0.6491753 ],
       [-0.81399315, -0.61098383, -0.46749414,  1.72626765, -0.54831888,
        -0.6491753 ],
       [ 0.70636652,  0.52990153, -0.01145819, -0.57928445, -0.54831888,
        -0.6491753 ]])

### Calculate similarity scores

In [8]:
similarity_matrix = cosine_similarity(scaled_data)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])

similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.095680,0.934891,0.530890,-0.059342,0.934017,-0.079354,-0.325263,-0.268882,-0.422674,...,0.989767,0.863789,-0.148611,-0.441002,0.856848,-0.462467,-0.378155,-0.254480,-0.405059,-0.200057
C0002,-0.095680,1.000000,0.043071,-0.599698,0.939178,-0.164442,0.910784,-0.450039,0.465470,0.253011,...,-0.002019,0.276753,0.908916,-0.283360,-0.396157,-0.227224,0.327699,0.483087,0.278300,0.101177
C0003,0.934891,0.043071,1.000000,0.419987,0.057060,0.885906,0.034904,-0.436800,-0.183052,-0.273710,...,0.929618,0.868546,-0.046034,-0.515606,0.781295,-0.462354,-0.238794,-0.138796,-0.309054,-0.213791
C0004,0.530890,-0.599698,0.419987,1.000000,-0.733285,0.476839,-0.697686,0.395848,-0.794886,-0.542460,...,0.429617,0.047458,-0.492785,0.018990,0.887004,-0.265022,-0.691169,-0.818464,-0.638543,0.308848
C0005,-0.059342,0.939178,0.057060,-0.733285,1.000000,-0.042154,0.986855,-0.624653,0.570299,0.250664,...,0.035126,0.359170,0.888309,-0.349205,-0.464716,-0.096249,0.412649,0.610718,0.332219,0.007199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.462467,-0.227224,-0.462354,-0.265022,-0.096249,-0.282143,-0.043289,-0.341975,0.488635,0.663441,...,-0.490083,-0.447185,-0.090663,-0.383794,-0.414536,1.000000,0.687718,0.497414,0.693650,0.085224
C0197,-0.378155,0.327699,-0.238794,-0.691169,0.412649,-0.351680,0.370256,-0.610289,0.939020,0.958796,...,-0.307852,-0.030614,0.190450,-0.513159,-0.605264,0.687718,1.000000,0.945064,0.982323,-0.368401
C0198,-0.254480,0.483087,-0.138796,-0.818464,0.610718,-0.245283,0.551865,-0.689978,0.990445,0.840272,...,-0.155255,0.185269,0.308943,-0.487371,-0.623458,0.497414,0.945064,1.000000,0.913263,-0.484813
C0199,-0.405059,0.278300,-0.309054,-0.638543,0.332219,-0.424052,0.277027,-0.507880,0.932712,0.973591,...,-0.331416,-0.082775,0.120128,-0.463381,-0.596654,0.693650,0.982323,0.913263,1.000000,-0.388117


### Get top 3 similar customers for each customer

In [9]:
lookalikes = {}
for customer_id in customer_profiles['CustomerID']:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    lookalikes[customer_id] = list(similar_customers.items())

### Save lookalikes to CSV

In [10]:
lookalikes_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['1st Similar', '2nd Similar', '3rd Similar'])
lookalikes_df.to_csv("ShwetaJadhav_Lookalike.csv", index_label="CustomerID")

### Print the lookalikes for the first 20 customers

In [11]:
print("Lookalike Recommendations for the first 20 customers:")
print(lookalikes_df.head(20))

Lookalike Recommendations for the first 20 customers:
                       1st Similar                  2nd Similar  \
C0001  (C0107, 0.9964160629333633)  (C0137, 0.9957000029067153)   
C0002  (C0142, 0.9887986276382208)  (C0177, 0.9665054843287145)   
C0003  (C0190, 0.9663449212719497)  (C0133, 0.9639721457875143)   
C0004  (C0113, 0.9950141093849689)  (C0102, 0.9835928774397132)   
C0005  (C0186, 0.9975070362104175)  (C0159, 0.9969876736870633)   
C0006  (C0168, 0.9754290315127764)  (C0048, 0.9627599759935872)   
C0007   (C0159, 0.988544915655898)  (C0005, 0.9868551577843031)   
C0008   (C0109, 0.925172862452882)  (C0068, 0.9152451511526559)   
C0009  (C0198, 0.9904450554756707)  (C0063, 0.9786381398416024)   
C0010  (C0166, 0.9832367748277806)   (C0199, 0.973590895201623)   
C0011   (C0107, 0.995063364744912)  (C0048, 0.9949454856521013)   
C0012  (C0155, 0.9905466186001887)  (C0013, 0.9849235776725346)   
C0013  (C0155, 0.9921034257097614)  (C0012, 0.9849235776725346)   
C0014   

* Customer C0001 is most similar to C0107, C0137, and C0191 based on their spending habits and profiles. These recommendations can be used for:
* Suggesting similar products that these customers are likely to purchase.
* Targeting them with similar promotional offers.
