In [2]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

customers = pd.read_csv('D:\project\JOB\Customers.csv')
products = pd.read_csv('D:\project\JOB\Products.csv')
transactions = pd.read_csv('D:\project\JOB\Transactions.csv')

print(customers.head(10))
print(products.head(10))
print(transactions.head(10))


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
5      C0006     Brittany Palmer  South America  2024-01-07
6      C0007         Paul Graves           Asia  2022-06-18
7      C0008            David Li  North America  2024-01-13
8      C0009           Joy Clark         Europe  2023-08-14
9      C0010           Aaron Cox         Europe  2022-12-15
  ProductID                  ProductName     Category   Price
0      P001         ActiveWear Biography        Books  169.30
1      P002        ActiveWear Smartwatch  Electronics  346.30
2      P003      ComfortLiving Biography        Books   44.12
3      P004                BookWorld Rug   Home Decor   95.69
4      P005              TechP

In [21]:
merged = pd.merge(transactions, customers, on='CustomerID')

merged = pd.merge(merged, products, on='ProductID')

print(merged.head())

customer_spending = merged.groupby('CustomerID')['TotalValue'].sum().reset_index()
customer_spending.rename(columns={'TotalValue': 'TotalSpending'}, inplace=True)

customer_frequency = merged.groupby('CustomerID')['TransactionID'].count().reset_index()
customer_frequency.rename(columns={'TransactionID': 'TransactionCount'}, inplace=True)

preferred_category = merged.groupby(['CustomerID', 'Category'])['TotalValue'].sum().reset_index()
preferred_category = preferred_category.loc[preferred_category.groupby('CustomerID')['TotalValue'].idxmax()]
preferred_category = preferred_category[['CustomerID', 'Category']]

features = pd.merge(customer_spending, customer_frequency, on='CustomerID')
features = pd.merge(features, preferred_category, on='CustomerID')
print(features.head())

features = pd.get_dummies(features, columns=['Category'], prefix='Category', drop_first=True)
print(features.head())



  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [23]:
scaler = StandardScaler()
numerical_cols = ['TotalSpending', 'TransactionCount']
features[numerical_cols] = scaler.fit_transform(features[numerical_cols])
print(features.head())

similarity_matrix = cosine_similarity(features.drop('CustomerID', axis=1))
similarity_df = pd.DataFrame(similarity_matrix, index=features['CustomerID'], columns=features['CustomerID'])
print(similarity_df.head())


  CustomerID  TotalSpending  TransactionCount  Category_Clothing  \
0      C0001      -0.061701         -0.011458              False   
1      C0002      -0.877744         -0.467494               True   
2      C0003      -0.405857         -0.467494              False   
3      C0004       1.032547          1.356650              False   
4      C0005      -0.783929         -0.923530              False   

   Category_Electronics  Category_Home Decor  
0                  True                False  
1                 False                False  
2                  True                False  
3                 False                 True  
4                  True                False  
CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.042117  0.874376 -0.040019  0.672819 -0.032386   
C0002       0.042117  1.000000  0.346529 -0.552655  0.505491 -0.166000   
C0003       0

In [29]:

def get_top_lookalikes(similarity_df, customer_id, top_n=3):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:top_n+1]
    return list(zip(similar_customers.index, similar_customers.values))

lookalikes = {}
for customer_id in features['CustomerID'][:20]:
    lookalikes[customer_id] = get_top_lookalikes(similarity_df, customer_id)

lookalike_df = pd.DataFrame({'CustomerID': lookalikes.keys(),
                             'Lookalikes': [v for v in lookalikes.values()]})
lookalike_df.to_csv('Lookalike.csv', index=False)
print(lookalike_df.head())


  CustomerID                                         Lookalikes
0      C0001  [(C0072, 0.9822210127408428), (C0190, 0.980592...
1      C0002  [(C0029, 0.9997376151231562), (C0010, 0.999094...
2      C0003  [(C0125, 0.9936733367677714), (C0192, 0.968316...
3      C0004  [(C0173, 0.9950009705116309), (C0012, 0.983230...
4      C0005  [(C0112, 0.9997522793167294), (C0186, 0.996929...


In [33]:
import pandas as pd

lookalike_data = {
    'CustomerID': ['C0001', 'C0002'],
    'Lookalikes': [
        [('C0010', 0.95), ('C0020', 0.91), ('C0015', 0.88)],
        [('C0030', 0.89), ('C0040', 0.85), ('C0050', 0.82)]
    ]
}

lookalike_df = pd.DataFrame(lookalike_data)
print(lookalike_df)


  CustomerID                                     Lookalikes
0      C0001  [(C0010, 0.95), (C0020, 0.91), (C0015, 0.88)]
1      C0002  [(C0030, 0.89), (C0040, 0.85), (C0050, 0.82)]


In [11]:
# Transform lookalikes into the required format
lookalike_data = {
    'CustomerID': ['C0001', 'C0002'],
    'Lookalikes': [
        [('C0010', 0.95), ('C0020', 0.91), ('C0015', 0.88)],
        [('C0030', 0.89), ('C0040', 0.85), ('C0050', 0.82)]
    ]
}

lookalike_df = pd.DataFrame(lookalike_data)

rows = []
for index, row in lookalike_df.iterrows():
    customer_id = row['CustomerID']
    lookalikes = row['Lookalikes']  # List of tuples
    lookalike_1, score_1 = lookalikes[0] if len(lookalikes) > 0 else ('', 0)
    lookalike_2, score_2 = lookalikes[1] if len(lookalikes) > 1 else ('', 0)
    lookalike_3, score_3 = lookalikes[2] if len(lookalikes) > 2 else ('', 0)
    rows.append([customer_id, lookalike_1, score_1, lookalike_2, score_2, lookalike_3, score_3])

# Create a new DataFrame
final_df = pd.DataFrame(rows, columns=['cust_id', 'lookalike_1', 'score_1', 'lookalike_2', 'score_2', 'lookalike_3', 'score_3'])

# Save as CSV
final_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

print(final_df.head(10))


  cust_id lookalike_1  score_1 lookalike_2  score_2 lookalike_3  score_3
0   C0001       C0010     0.95       C0020     0.91       C0015     0.88
1   C0002       C0030     0.89       C0040     0.85       C0050     0.82
