In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the two input tables from the working directory.
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Left join on 'CustomerID': every customer row is kept, and customers with no
# matching transaction get NaN in the transaction columns.
combined_data = customers.merge(transactions, how="left", on="CustomerID")

# Show the first rows of the joined table as a quick sanity check.
print(combined_data.head())




  CustomerID      CustomerName         Region  SignupDate TransactionID  \
0      C0001  Lawrence Carroll  South America  2022-07-10        T00015   
1      C0001  Lawrence Carroll  South America  2022-07-10        T00932   
2      C0001  Lawrence Carroll  South America  2022-07-10        T00085   
3      C0001  Lawrence Carroll  South America  2022-07-10        T00445   
4      C0001  Lawrence Carroll  South America  2022-07-10        T00436   

  ProductID      TransactionDate  Quantity  TotalValue   Price  
0      P054  2024-01-19 03:12:55       2.0      114.60   57.30  
1      P022  2024-09-17 09:01:18       3.0      412.62  137.54  
2      P096  2024-04-08 00:01:00       2.0      614.94  307.47  
3      P083  2024-05-07 03:11:44       2.0      911.44  455.72  
4      P029  2024-11-02 17:04:16       3.0     1300.92  433.64  


In [7]:
# Show the column index of each input table (customers first, then
# transactions), one per line.
print(customers.columns, transactions.columns, sep="\n")

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')


In [9]:
# Build one row per customer with two behavioural features:
#   TotalSpent        - sum of 'TotalValue' over the customer's rows
#   PurchaseFrequency - number of non-null 'TransactionID' values
customer_features = (
    combined_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        PurchaseFrequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    )
    .reset_index()  # turn 'CustomerID' back into an ordinary column
)

# Show the first rows of the feature table.
print(customer_features.head())




  CustomerID  TotalSpent  PurchaseFrequency
0      C0001     3354.52                  5
1      C0002     1862.74                  4
2      C0003     2725.38                  4
3      C0004     5354.88                  8
4      C0005     2034.24                  3


In [10]:
# Standardize the two numeric features (zero mean, unit variance) so neither
# dominates the similarity computation purely because of its scale.
# Note: the raw values in customer_features are overwritten by the scaled ones.
feature_columns = ['TotalSpent', 'PurchaseFrequency']
scaler = StandardScaler()
scaler.fit(customer_features[feature_columns])
customer_features[feature_columns] = scaler.transform(customer_features[feature_columns])

# Show the first rows after scaling.
print(customer_features.head())


  CustomerID  TotalSpent  PurchaseFrequency
0      C0001   -0.051884           0.000000
1      C0002   -0.862714          -0.451294
2      C0003   -0.393842          -0.451294
3      C0004    1.035375           1.353881
4      C0005   -0.769499          -0.902587


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all customers on the two scaled features.
# Row i / column j of the result is the similarity of customer i to customer j,
# in the same order as the rows of customer_features.
# NOTE(review): cosine similarity ignores vector magnitude, so with only two
# features many customer pairs can score ~1.0 - confirm this metric is intended.
scaled_features = customer_features.loc[:, ['TotalSpent', 'PurchaseFrequency']]
similarity_matrix = cosine_similarity(scaled_features)

# Show the top-left 5x5 corner of the matrix as a sanity check.
print(similarity_matrix[0:5, 0:5])


[[ 1.          0.88608652  0.65752069 -0.60747019  0.64877318]
 [ 0.88608652  1.          0.93185301 -0.90646468  0.92759938]
 [ 0.65752069  0.93185301  1.         -0.99791082  0.99993327]
 [-0.60747019 -0.90646468 -0.99791082  1.         -0.99859057]
 [ 0.64877318  0.92759938  0.99993327 -0.99859057  1.        ]]


In [12]:
import numpy as np

# Recommend the top N most similar customers for one row of a similarity matrix
def recommend_similar_customers(similarity_matrix, customer_index, top_n=3, customer_ids=None):
    """Return the ``top_n`` customers most similar to the one at ``customer_index``.

    Parameters
    ----------
    similarity_matrix : 2-D array-like
        Row ``i`` holds the similarity of customer ``i`` to every customer.
    customer_index : int
        Positional row of the query customer in ``similarity_matrix``.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    customer_ids : sequence, optional
        Customer IDs aligned with the rows of ``similarity_matrix``. When
        omitted, the ``CustomerID`` column of the notebook-level
        ``customer_features`` frame is used.

    Returns
    -------
    list of (customer_id, similarity_score) tuples
        Ordered from most to least similar. The query customer is never
        included. Fewer than ``top_n`` entries are returned when there are
        not enough other customers.
    """
    similarity_scores = np.asarray(similarity_matrix[customer_index])
    if customer_ids is None:
        customer_ids = customer_features["CustomerID"].to_numpy()

    row_count = similarity_scores.shape[0]
    if row_count == 0:
        return []

    # Normalise a negative index so it can be compared with argsort positions.
    self_position = customer_index % row_count

    # Sort descending. The stable sort makes ties resolve by ascending row
    # position, so the result is deterministic.
    ranked = np.argsort(-similarity_scores, kind="stable")

    # Drop the query customer explicitly. Slicing off the first element is not
    # enough: other customers can tie with the self-similarity score, in which
    # case the query customer is not guaranteed to be ranked first.
    ranked = ranked[ranked != self_position][:max(top_n, 0)]

    return [(customer_ids[i], similarity_scores[i]) for i in ranked]

# Demo: look up the lookalikes (default top 3) for the customer stored in
# row 0 of customer_features / similarity_matrix.
top_similar_customers = recommend_similar_customers(
    similarity_matrix,
    customer_index=0,
)
print(top_similar_customers)


[('C0076', 1.0), ('C0152', 1.0), ('C0164', 1.0)]


In [13]:
# Map every CustomerID to its list of (lookalike_id, score) pairs.
# A customer's position in customer_features is also its row in
# similarity_matrix, so the enumerate position is used as the lookup index.
lookalike_data = {
    customer_id: recommend_similar_customers(similarity_matrix, position)
    for position, customer_id in enumerate(customer_features["CustomerID"])
}

# Flatten the mapping into one record per (customer, lookalike) pair.
lookalike_rows = []
for customer_id, matches in lookalike_data.items():
    for lookalike_id, score in matches:
        lookalike_rows.append((customer_id, lookalike_id, score))

lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=["CustomerID", "LookalikeCustomerID", "SimilarityScore"],
)

# Persist the result without the DataFrame index.
lookalike_df.to_csv("Lookalike.csv", index=False)

# Show the first rows of the exported table.
print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0076         1.000000
1      C0001               C0152         1.000000
2      C0001               C0164         1.000000
3      C0002               C0029         0.999816
4      C0002               C0199         0.999488
