In [26]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

In [27]:
# Read the three raw tables (customers, transactions, products) from the
# working directory; the names bound here are used by every later cell.
customer, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [28]:
# Preview the first five customer rows (ID, name, region, signup date).
customer.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [29]:
# Preview the first five transaction rows; note that this table already
# carries a per-unit Price column alongside Quantity and TotalValue.
transactions.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [30]:
# Preview the first five product rows (ID, name, category, list price).
products.head(n=5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [31]:
# `transactions` already has a Price column, so drop the product-level copy to
# keep the merge below from producing Price_x / Price_y duplicates.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: re-running it no longer raises KeyError once Price is gone.
products = products.drop(columns=['Price'], errors='ignore')

In [32]:
# Build one row per transaction enriched with product and customer attributes.
# Left joins keep every transaction even if a product/customer key is missing.
df = (
    transactions
    .merge(products, how='left', on='ProductID')
    .merge(customer, how='left', on='CustomerID')
)
df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,Joshua Hamilton,Asia,2024-11-11


In [33]:
# Collapse the transaction-level frame into one feature row per customer.
def _dominant_category(categories):
    """Return the most frequent value in `categories`, or 'Unknown' if there is none."""
    modes = categories.mode()
    return 'Unknown' if modes.empty else modes[0]


customer_features = df.groupby('CustomerID').agg(
    Total_Spend=('TotalValue', 'sum'),             # lifetime spend
    Transaction_Count=('TransactionID', 'count'),  # number of purchases
    Category=('Category', _dominant_category),     # most-purchased category
    Region=('Region', 'first'),                    # first region value seen for the customer
)

customer_features

Unnamed: 0_level_0,Total_Spend,Transaction_Count,Category,Region
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,3354.52,5,Electronics,South America
C0002,1862.74,4,Clothing,Asia
C0003,2725.38,4,Home Decor,South America
C0004,5354.88,8,Books,South America
C0005,2034.24,3,Electronics,Asia
...,...,...,...,...
C0196,4982.88,4,Home Decor,Europe
C0197,1928.65,3,Electronics,Europe
C0198,931.83,2,Clothing,Europe
C0199,1979.28,4,Electronics,Europe


In [34]:
# Expand the two categorical columns into 0/1 indicator columns.
encoder = OneHotEncoder()
encoded_sparse = encoder.fit_transform(customer_features[['Category', 'Region']])

# Densify the encoder output before wrapping it in a DataFrame.
categorical_features = encoded_sparse.toarray()

# No index is passed, so rows are numbered 0..n-1 in the same order as
# `customer_features`; column names come from the fitted encoder.
categorical_df = pd.DataFrame(
    data=categorical_features,
    columns=encoder.get_feature_names_out(),
)
categorical_df

Unnamed: 0,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
194,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
195,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
196,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
197,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [35]:
# Standardise the numeric features (zero mean, unit variance) so that spend
# and transaction count contribute on a comparable scale.
numeric_columns = ['Total_Spend', 'Transaction_Count']
scaler = StandardScaler()
numerical_features = scaler.fit_transform(customer_features[numeric_columns])
numerical_df = pd.DataFrame(data=numerical_features, columns=numeric_columns)

# Both frames carry a default 0..n-1 index in `customer_features` row order,
# so the column-wise concat lines rows up positionally.
final_features = pd.concat([numerical_df, categorical_df], axis=1)

# Position -> CustomerID lookup for translating matrix indices back to IDs.
customer_ids = customer_features.index.to_list()

final_features

Unnamed: 0,Total_Spend,Transaction_Count,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,-0.061701,-0.011458,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.877744,-0.467494,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.405857,-0.467494,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.032547,1.356650,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.783929,-0.923530,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
194,0.829053,-0.467494,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
195,-0.841689,-0.923530,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
196,-1.386975,-1.379566,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
197,-0.813993,-0.467494,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [36]:
# Pairwise cosine similarity between every pair of customer feature vectors.
# Row/column i corresponds to customer_ids[i]; the diagonal is each customer
# compared with itself. (The cell body was previously pasted twice, computing
# the same matrix two times; the duplicate has been removed.)
similarity_matrix = cosine_similarity(final_features)
similarity_matrix

array([[ 1.        ,  0.02431756,  0.47149423, ...,  0.02966987,
         0.43930648, -0.01941701],
       [ 0.02431756,  1.        ,  0.21535758, ...,  0.68586976,
         0.31794426,  0.50688162],
       [ 0.47149423,  0.21535758,  1.        , ...,  0.32412198,
         0.20947739, -0.11527488],
       ...,
       [ 0.02966987,  0.68586976,  0.32412198, ...,  1.        ,
         0.67700886,  0.00945868],
       [ 0.43930648,  0.31794426,  0.20947739, ...,  0.67700886,
         1.        , -0.21228218],
       [-0.01941701,  0.50688162, -0.11527488, ...,  0.00945868,
        -0.21228218,  1.        ]])

In [37]:
# Generate lookalike recommendations: for each of the first 20 customers,
# keep the 3 most similar *other* customers as (CustomerID, score) pairs.
lookalikes = {}
for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    scores = similarity_matrix[idx]
    ranked = np.argsort(scores)[::-1]  # indices ordered by descending similarity

    # Drop the customer itself by index rather than assuming it sorts first:
    # another customer tied at similarity 1.0 (or a rounding difference on the
    # diagonal) can push self out of position 0, which previously let a
    # customer be listed as their own lookalike.
    top_matches = ranked[ranked != idx][:3]

    # Cast to a plain float so the tuples print and serialise as bare numbers
    # regardless of how the installed NumPy renders its scalar types.
    lookalikes[customer_id] = [(customer_ids[i], float(scores[i])) for i in top_matches]

lookalikes

{'C0001': [('C0190', 0.9899949557115172),
  ('C0048', 0.981832110913663),
  ('C0181', 0.9516044714114102)],
 'C0002': [('C0088', 0.9616103591488296),
  ('C0092', 0.9329633755466786),
  ('C0134', 0.9325549376622166)],
 'C0003': [('C0052', 0.9968629945712856),
  ('C0031', 0.9744104726805907),
  ('C0076', 0.9467270853052325)],
 'C0004': [('C0155', 0.9827698550131683),
  ('C0165', 0.9780408589479195),
  ('C0087', 0.9396729320455953)],
 'C0005': [('C0186', 0.9975402027715655),
  ('C0007', 0.9880651393748947),
  ('C0140', 0.9840811489298769)],
 'C0006': [('C0168', 0.9675883041068373),
  ('C0187', 0.947263479005604),
  ('C0011', 0.9393973702584593)],
 'C0007': [('C0005', 0.9880651393748947),
  ('C0115', 0.9850091478139984),
  ('C0186', 0.9748372445724564)],
 'C0008': [('C0109', 0.8674190254388485),
  ('C0065', 0.8544079581531171),
  ('C0156', 0.790041219651594)],
 'C0009': [('C0062', 0.9872555494751158),
  ('C0198', 0.9847636416681272),
  ('C0010', 0.9708242447261681)],
 'C0010': [('C0062', 0

In [38]:
# Tabulate the recommendations: one row per source customer, one column per
# ranked match, each cell holding a (CustomerID, score) tuple.
lookalike_df = pd.DataFrame.from_dict(data=lookalikes, orient='index')

# Persist without a header row; the customer ID index becomes the first field.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", header=False)

lookalike_df

Unnamed: 0,0,1,2
C0001,"(C0190, 0.9899949557115172)","(C0048, 0.981832110913663)","(C0181, 0.9516044714114102)"
C0002,"(C0088, 0.9616103591488296)","(C0092, 0.9329633755466786)","(C0134, 0.9325549376622166)"
C0003,"(C0052, 0.9968629945712856)","(C0031, 0.9744104726805907)","(C0076, 0.9467270853052325)"
C0004,"(C0155, 0.9827698550131683)","(C0165, 0.9780408589479195)","(C0087, 0.9396729320455953)"
C0005,"(C0186, 0.9975402027715655)","(C0007, 0.9880651393748947)","(C0140, 0.9840811489298769)"
C0006,"(C0168, 0.9675883041068373)","(C0187, 0.947263479005604)","(C0011, 0.9393973702584593)"
C0007,"(C0005, 0.9880651393748947)","(C0115, 0.9850091478139984)","(C0186, 0.9748372445724564)"
C0008,"(C0109, 0.8674190254388485)","(C0065, 0.8544079581531171)","(C0156, 0.790041219651594)"
C0009,"(C0062, 0.9872555494751158)","(C0198, 0.9847636416681272)","(C0010, 0.9708242447261681)"
C0010,"(C0062, 0.9751356626228711)","(C0009, 0.9708242447261681)","(C0111, 0.963915065387696)"


In [39]:
# Sanity check: print each source customer next to their three lookalikes so
# the aggregated features can be compared by eye. Rows appear in the order of
# the `customer_features` index, not in similarity order.
separator = "\n ============================================================ \n"
for customer_id, matches in lookalikes.items():
    group_ids = [customer_id] + [matches[rank][0] for rank in range(3)]
    in_group = customer_features.index.isin(group_ids)

    print(separator)
    print(customer_features[in_group])
    print(separator + "\n\n")



            Total_Spend  Transaction_Count     Category         Region
CustomerID                                                            
C0001           3354.52                  5  Electronics  South America
C0048           3850.94                  5  Electronics  South America
C0181           3347.60                  6  Electronics  South America
C0190           2983.02                  5  Electronics  South America






            Total_Spend  Transaction_Count  Category Region
CustomerID                                                 
C0002           1862.74                  4  Clothing   Asia
C0088           2135.38                  5  Clothing   Asia
C0092           3015.95                  4  Clothing   Asia
C0134           2679.28                  5  Clothing   Asia






            Total_Spend  Transaction_Count    Category         Region
CustomerID                                                           
C0003           2725.38                  4  Home Decor  Sout