In [1]:
import pandas as pd

In [2]:
customers = pd.read_csv('Customers.csv')       # Replace with the actual path to the Customers dataset
products = pd.read_csv('Products.csv')         # Replace with the actual path to the Products dataset
transactions = pd.read_csv('Transactions.csv') # Replace with the actual path to the Transactions dataset

In [3]:
customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='left')
print(customer_transactions.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03  
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04  
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04  
3      601.36  300.68  Travis Campbell  South America  2024-04-11  
4      902.04  300.68    Timothy Perez         Europe  2022-03-15  


In [7]:
merged_data = pd.merge(customer_transactions, products, on='ProductID', how='left')

print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [10]:
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'TransactionID': 'count',  # Number of transactions
    'Category': lambda x: x.value_counts().to_dict(),  # Product category preferences
    'Region': 'first',  # Region
    'SignupDate': 'first'  # Signup date
}).reset_index()

# Adding derived features
customer_features['AverageSpend'] = customer_features['TotalValue'] / customer_features['TransactionID']
customer_features['SignupYear'] = pd.to_datetime(customer_features['SignupDate']).dt.year

print(customer_features.head())


  CustomerID  TotalValue  TransactionID  \
0      C0001     3354.52              5   
1      C0002     1862.74              4   
2      C0003     2725.38              4   
3      C0004     5354.88              8   
4      C0005     2034.24              3   

                                            Category         Region  \
0    {'Electronics': 3, 'Books': 1, 'Home Decor': 1}  South America   
1                   {'Home Decor': 2, 'Clothing': 2}           Asia   
2  {'Home Decor': 2, 'Clothing': 1, 'Electronics'...  South America   
3    {'Books': 3, 'Home Decor': 3, 'Electronics': 2}  South America   
4                {'Electronics': 2, 'Home Decor': 1}           Asia   

   SignupDate  AverageSpend  SignupYear  
0  2022-07-10       670.904        2022  
1  2022-02-13       465.685        2022  
2  2024-03-07       681.345        2024  
3  2022-10-09       669.360        2022  
4  2022-08-15       678.080        2022  


In [11]:

customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numeric_cols = ['TotalValue', 'AverageSpend', 'SignupYear']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

print(customer_features.head())


  CustomerID  TotalValue  TransactionID  \
0      C0001    0.308942              5   
1      C0002    0.168095              4   
2      C0003    0.249541              4   
3      C0004    0.497806              8   
4      C0005    0.184287              3   

                                            Category  SignupDate  \
0    {'Electronics': 3, 'Books': 1, 'Home Decor': 1}  2022-07-10   
1                   {'Home Decor': 2, 'Clothing': 2}  2022-02-13   
2  {'Home Decor': 2, 'Clothing': 1, 'Electronics'...  2024-03-07   
3    {'Books': 3, 'Home Decor': 3, 'Electronics': 2}  2022-10-09   
4                {'Electronics': 2, 'Home Decor': 1}  2022-08-15   

   AverageSpend  SignupYear  Region_Europe  Region_North America  \
0      0.474336         0.0          False                 False   
1      0.308940         0.0          False                 False   
2      0.482751         1.0          False                 False   
3      0.473092         0.0          False                 F

In [14]:
# Check that 'SignupYear' is being used for similarity calculation
features = customer_features.drop(columns=['CustomerID', 'Category', 'SignupDate'])

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(features)

# Check similarity matrix shape
print(similarity_matrix.shape)


(199, 199)


In [15]:
# Extract the year from SignupDate and drop the original 'SignupDate'
customer_features['SignupYear'] = pd.to_datetime(customer_features['SignupDate']).dt.year

# Exclude non-numeric columns (like 'CustomerID', 'Category', and 'SignupDate') for similarity calculation
features = customer_features.drop(columns=['CustomerID', 'Category', 'SignupDate'])

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(features)

# Check similarity matrix shape
print(similarity_matrix.shape)

(199, 199)


In [16]:
# Drop non-numeric columns, including the 'SignupDate' and 'Category' columns
features = customer_features.drop(columns=['CustomerID', 'Category', 'SignupDate'])

# Check the data types to ensure they are all numeric
print(features.dtypes)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(features)

# Check the shape of the similarity matrix
print(similarity_matrix.shape)


TotalValue              float64
TransactionID             int64
AverageSpend            float64
SignupYear                int32
Region_Europe              bool
Region_North America       bool
Region_South America       bool
dtype: object
(199, 199)


In [17]:
# Drop 'CustomerID' and any non-numeric columns (e.g., 'Category' if not one-hot encoded)
numeric_features = customer_features.drop(columns=['CustomerID', 'Category', 'SignupDate'])

# Ensure that we are only passing numerical values
print(numeric_features.dtypes)  # Check the data types

# Now compute the cosine similarity on the numerical columns
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(numeric_features)

# Verify the shape of the similarity matrix
print(similarity_matrix.shape)

# Get top 3 similar customers for each customer
lookalike_results = {}

for i, customer_id in enumerate(customer_features['CustomerID']):
    similar_customers = sorted(
        list(enumerate(similarity_matrix[i])),
        key=lambda x: -x[1]  # Sort by similarity score (descending)
    )[1:4]  # Exclude self and take top 3 most similar customers

    lookalike_results[customer_id] = [(customer_features['CustomerID'][j], round(score, 2)) 
                                      for j, score in similar_customers]

# Check first few results
print(lookalike_results)


TotalValue              float64
TransactionID             int64
AverageSpend            float64
SignupYear                int32
Region_Europe              bool
Region_North America       bool
Region_South America       bool
dtype: object
(199, 199)
{'C0001': [('C0152', np.float64(1.0)), ('C0137', np.float64(1.0)), ('C0107', np.float64(1.0))], 'C0002': [('C0142', np.float64(1.0)), ('C0177', np.float64(1.0)), ('C0027', np.float64(1.0))], 'C0003': [('C0133', np.float64(1.0)), ('C0052', np.float64(1.0)), ('C0192', np.float64(1.0))], 'C0004': [('C0113', np.float64(1.0)), ('C0102', np.float64(1.0)), ('C0104', np.float64(1.0))], 'C0005': [('C0159', np.float64(1.0)), ('C0186', np.float64(1.0)), ('C0007', np.float64(1.0))], 'C0006': [('C0158', np.float64(1.0)), ('C0168', np.float64(1.0)), ('C0133', np.float64(1.0))], 'C0007': [('C0159', np.float64(1.0)), ('C0005', np.float64(1.0)), ('C0115', np.float64(1.0))], 'C0008': [('C0065', np.float64(1.0)), ('C0109', np.float64(1.0)), ('C0175', np.float6

In [18]:
# Dictionary to store the results
lookalike_results = {}

# Iterate over each customer
for i, customer_id in enumerate(customer_features['CustomerID']):
    # Get the similarity scores for the customer
    similar_customers = sorted(
        list(enumerate(similarity_matrix[i])),
        key=lambda x: -x[1]  # Sort by similarity score (descending)
    )[1:4]  # Exclude self (first result) and take top 3

    lookalike_results[customer_id] = [(customer_features['CustomerID'][j], round(score, 2)) 
                                      for j, score in similar_customers]

# View the first 3 lookalikes as a check
print(lookalike_results)


{'C0001': [('C0152', np.float64(1.0)), ('C0137', np.float64(1.0)), ('C0107', np.float64(1.0))], 'C0002': [('C0142', np.float64(1.0)), ('C0177', np.float64(1.0)), ('C0027', np.float64(1.0))], 'C0003': [('C0133', np.float64(1.0)), ('C0052', np.float64(1.0)), ('C0192', np.float64(1.0))], 'C0004': [('C0113', np.float64(1.0)), ('C0102', np.float64(1.0)), ('C0104', np.float64(1.0))], 'C0005': [('C0159', np.float64(1.0)), ('C0186', np.float64(1.0)), ('C0007', np.float64(1.0))], 'C0006': [('C0158', np.float64(1.0)), ('C0168', np.float64(1.0)), ('C0133', np.float64(1.0))], 'C0007': [('C0159', np.float64(1.0)), ('C0005', np.float64(1.0)), ('C0115', np.float64(1.0))], 'C0008': [('C0065', np.float64(1.0)), ('C0109', np.float64(1.0)), ('C0175', np.float64(1.0))], 'C0009': [('C0062', np.float64(1.0)), ('C0132', np.float64(1.0)), ('C0197', np.float64(1.0))], 'C0010': [('C0199', np.float64(1.0)), ('C0166', np.float64(1.0)), ('C0121', np.float64(1.0))], 'C0011': [('C0107', np.float64(1.0)), ('C0048', n

In [19]:
# Save lookalike results for the first 20 customers
output = []
for customer_id in customer_features['CustomerID'][:20]:  # First 20 customers
    lookalikes = lookalike_results.get(customer_id, [])
    row = [customer_id]
    for similar_customer, score in lookalikes:
        row.extend([similar_customer, score])
    output.append(row)

print(output[:3])  # View first few rows


[['C0001', 'C0152', np.float64(1.0), 'C0137', np.float64(1.0), 'C0107', np.float64(1.0)], ['C0002', 'C0142', np.float64(1.0), 'C0177', np.float64(1.0), 'C0027', np.float64(1.0)], ['C0003', 'C0133', np.float64(1.0), 'C0052', np.float64(1.0), 'C0192', np.float64(1.0)]]


In [20]:
import csv

# Save results to a CSV file
with open('YourName_Lookalike.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3'])
    writer.writerows(output)