In [1]:
#importing required libraries

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the customer, product and transaction tables from their CSV files

customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Preview the first rows of the customer table (ID, name, region, signup date)
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Preview the first rows of the product table (ID, name, category, price)
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
# Preview the first rows of the transaction table; CustomerID and ProductID are the join keys used later
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
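# To recommend lookalike customers we will use Region, the product categories bought, and signup date.
# NOTE: SignupDate is not among the profile features selected further down; only Region and per-category spend are.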


In [7]:
# Merge transactions with products to get product categories.
# Any existing 'Category' column is dropped first so the cell is idempotent: without that,
# re-running it would yield 'Category_x'/'Category_y' and the later groupby on 'Category' would fail.
# validate='many_to_one' makes pandas raise MergeError if ProductID is duplicated in products,
# which would otherwise silently duplicate transaction rows and inflate the spend totals.
transactions = transactions.drop(columns=['Category'], errors='ignore').merge(
    products[['ProductID', 'Category']],
    on='ProductID',
    how='left',
    validate='many_to_one',
)

In [8]:
# Aggregate transaction data for each customer and category
# (long format: one row per CustomerID/Category pair with the summed TotalValue)
category_spend = transactions.groupby(['CustomerID', 'Category']).agg(
    CategorySpend=('TotalValue', 'sum')
).reset_index()

# Pivot the data to have separate columns for each category;
# a customer/category combination with no purchases is filled with 0
category_spend_pivot = category_spend.pivot_table(
    index='CustomerID',
    columns='Category',
    values='CategorySpend',
    aggfunc='sum',
    fill_value=0
)

# Merge category spend data with customer profile.
# Category columns left over from a previous run are dropped first, so re-running the cell
# does not create suffixed duplicates such as 'Books_x'/'Books_y'.
# Because this is a left join, customers with no transactions get NaN in every category column.
customers = customers.drop(
    columns=list(category_spend_pivot.columns), errors='ignore'
).merge(category_spend_pivot, on='CustomerID', how='left')


In [9]:
# Preview the customer table, which now carries one spend column per product category
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Books,Clothing,Electronics,Home Decor
0,C0001,Lawrence Carroll,South America,2022-07-10,114.6,0.0,2827.3,412.62
1,C0002,Elizabeth Lutz,Asia,2022-02-13,0.0,1025.46,0.0,837.28
2,C0003,Michael Rivera,South America,2024-03-07,0.0,122.36,1385.2,1217.82
3,C0004,Kathleen Rodriguez,South America,2022-10-09,1888.48,0.0,1355.74,2110.66
4,C0005,Laura Weber,Asia,2022-08-15,0.0,0.0,1180.38,853.86


In [10]:
# Converting the categorical 'Region' column into numerical columns using OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

# Guard on the presence of 'Region' so that re-running this cell after the column has
# already been encoded and dropped is a no-op instead of a KeyError.
if 'Region' in customers.columns:
    # drop='first' omits the first region category to avoid a redundant column
    # (presumably 'Asia', since the feature list used later has no 'Region_Asia' - verify).
    encoder = OneHotEncoder(sparse_output=False, drop='first')

    # Reuse customers' own index: pd.concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows if customers ever had a non-default index.
    region_encoded = pd.DataFrame(
        encoder.fit_transform(customers[['Region']]),
        columns=encoder.get_feature_names_out(['Region']),
        index=customers.index,
    )

    # Replace the raw 'Region' column with its encoded columns (appended at the end)
    customers = pd.concat([customers.drop(columns=['Region']), region_encoded], axis=1)


In [13]:
# Storing all required features

# Customers without transactions have NaN spend after the left join; treat that as 0.
# fillna returns a new frame here instead of mutating a column selection in place, which is
# what triggered pandas' "A value is trying to be set on a copy of a slice" warning.
profile_features = customers[[
    'Region_North America', 'Region_Europe', 'Region_South America',
    'Clothing', 'Books', 'Electronics', 'Home Decor',
]].fillna(0)

# NOTE(review): the spend columns are raw currency totals (hundreds to thousands) while the
# region flags are 0/1, so the region columns contribute very little to the cosine similarity.
# Consider scaling the features if region is meant to matter.

# Calculate cosine similarity between customers (row/column i corresponds to row i of customers)
similarity_matrix = cosine_similarity(profile_features)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profile_features.fillna(0,inplace=True)


In [14]:
def recommend_similar_customers(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``customers`` frame and ``similarity_matrix``. Row ``i`` of the
    matrix is taken to describe the ``i``-th row (by position) of ``customers``.

    Parameters
    ----------
    customer_id : str
        Value looked up in ``customers['CustomerID']``. If it occurs more than once,
        the first occurrence is used.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``CustomerName`` and ``SimilarityScore``, ordered from most
        to least similar. The rows keep their index labels from ``customers``.

    Raises
    ------
    IndexError
        If ``customer_id`` is not present in ``customers``.
    """
    # Work with positions, not index labels: the similarity matrix is positional, so this
    # stays correct even when ``customers`` does not have a default 0..n-1 index.
    matches = np.flatnonzero((customers['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"customer_id {customer_id!r} not found in customers")
    customer_pos = matches[0]

    similarity_scores = np.asarray(similarity_matrix[customer_pos])

    # Sort by descending score; the stable sort keeps ties in their original row order.
    ranked = np.argsort(-similarity_scores, kind='stable')

    # Remove the customer explicitly rather than assuming it is ranked first: with tied
    # scores (e.g. an identical profile) or an all-zero profile it may not be.
    ranked = ranked[ranked != customer_pos][:max(top_n, 0)]

    # assign() builds a new frame, so no column is written onto a selection of ``customers``.
    similar_customers = customers.iloc[ranked].assign(
        SimilarityScore=similarity_scores[ranked]
    )

    return similar_customers[['CustomerID', 'CustomerName', 'SimilarityScore']]

# Example: closest matches for customer C0001 (top_n is left at its default of 3)
recommended_customers = recommend_similar_customers(customer_id='C0001')
print(recommended_customers)


    CustomerID     CustomerName  SimilarityScore
139      C0140  Gregory Estrada         0.996987
90       C0091        Lisa Kirk         0.993672
68       C0069     Stacy Foster         0.991390


In [17]:
def create_lookalike_csv(start_id='C0001', end_id='C0020', top_n=3):
    """Build the lookalike table for a range of customer IDs and save it as 'Lookalike.csv'.

    The IDs are generated from ``start_id`` to ``end_id`` inclusive. Both must share the same
    non-numeric prefix and end in digits; generated IDs are zero-padded to the digit width of
    ``start_id``. With the defaults this is 'C0001' through 'C0020'.

    Parameters
    ----------
    start_id : str, default 'C0001'
        First customer ID of the range.
    end_id : str, default 'C0020'
        Last customer ID of the range (inclusive).
    top_n : int, default 3
        Number of lookalikes requested per customer from ``recommend_similar_customers``.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``cust_id`` and ``lookalikes``; the latter is a
        list of ``(CustomerID, SimilarityScore)`` tuples. The same frame is written to
        'Lookalike.csv' in the current working directory, without the index.

    Raises
    ------
    ValueError
        If the two IDs do not share a prefix or do not end in a number.
    """
    # Split e.g. 'C0001' into prefix 'C' and digits '0001'
    prefix = start_id.rstrip('0123456789')
    first_digits = start_id[len(prefix):]
    last_digits = end_id[len(prefix):]
    if not (end_id.startswith(prefix) and first_digits.isdigit() and last_digits.isdigit()):
        raise ValueError(
            f"start_id {start_id!r} and end_id {end_id!r} must share a prefix and end in digits"
        )

    width = len(first_digits)
    customer_ids = [
        f"{prefix}{number:0{width}d}"
        for number in range(int(first_digits), int(last_digits) + 1)
    ]

    lookalike_data = []
    for customer_id in customer_ids:
        similar_customers = recommend_similar_customers(customer_id, top_n)

        # tolist() yields plain Python str/float values, so the list written to the CSV
        # does not depend on how numpy scalars are rendered.
        lookalikes_list = list(zip(
            similar_customers['CustomerID'].tolist(),
            similar_customers['SimilarityScore'].tolist(),
        ))

        lookalike_data.append({
            "cust_id": customer_id,
            "lookalikes": lookalikes_list
        })

    # Explicit columns keep the header present even when the ID range is empty
    lookalike_df = pd.DataFrame(lookalike_data, columns=['cust_id', 'lookalikes'])

    lookalike_df.to_csv('Lookalike.csv', index=False)

    return lookalike_df

 # Build the lookalike table for customers C0001 to C0020; the call also writes Lookalike.csv
lookalike_results = create_lookalike_csv()

In [19]:
# Read the saved lookalike file back from disk to check what was written
lookalike=pd.read_csv('Lookalike.csv')

In [27]:
# Preview the saved table; after the CSV round trip each 'lookalikes' cell is the text form
# of the list of (CustomerID, score) pairs, not a Python list
lookalike.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0140', 0.9969872321794545), ('C0091', 0.99..."
1,C0002,"[('C0134', 0.9988525104659369), ('C0143', 0.99..."
2,C0003,"[('C0007', 0.9977578013612544), ('C0163', 0.99..."
3,C0004,"[('C0075', 0.9943484807181324), ('C0146', 0.98..."
4,C0005,"[('C0163', 0.9968855925977347), ('C0007', 0.99..."
