In [2]:
import pandas as pd

# Load the datasets.
# All three CSV files live in the same folder, so the folder is named once;
# only this line has to change when the notebook runs on another machine.
DATA_DIR = 'C:/Users/shara/OneDrive/Desktop/Zeotap'
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

# Merge customers and transactions. The left join keeps every transaction row.
# validate='many_to_one' makes pandas raise a MergeError if a CustomerID occurs
# more than once in `customers`; without it, a duplicated key would silently
# duplicate transaction rows and inflate every quantity summed later on.
customer_transactions = pd.merge(
    transactions, customers, on='CustomerID', how='left', validate='many_to_one'
)
# Merge with products, with the same uniqueness guard on ProductID.
customer_transactions = pd.merge(
    customer_transactions, products, on='ProductID', how='left', validate='many_to_one'
)

# Display the merged dataset.
# Both inputs of the second merge carry a Price column, so the result holds
# Price_x (left side, i.e. the transaction record) and Price_y (products).
print(customer_transactions.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Build one purchase vector per customer: rows are CustomerIDs, columns are
# ProductIDs, and each cell is the summed Quantity (0 where the pivot had no
# value for that customer/product pair).
customer_profiles = pd.pivot_table(
    customer_transactions,
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
).fillna(0)

# Standardize the product columns before comparing customers.
scaler = StandardScaler()
customer_profiles_scaled = scaler.fit_transform(customer_profiles)

# Pairwise cosine similarity between the scaled customer vectors.
similarity_matrix = cosine_similarity(customer_profiles_scaled)

# Wrap the raw matrix in a DataFrame labelled by CustomerID on both axes so
# scores can be looked up by customer rather than by position.
customer_ids = customer_profiles.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.048829 -0.061476 -0.079060 -0.051689 -0.064034   
C0002      -0.048829  1.000000 -0.035699 -0.051683 -0.023066 -0.033697   
C0003      -0.061476 -0.035699  1.000000  0.040222  0.244296 -0.046598   
C0004      -0.079060 -0.051683  0.040222  1.000000  0.079853 -0.065466   
C0005      -0.051689 -0.023066  0.244296  0.079853  1.000000 -0.032509   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.155253 -0.070888 -0.045940 -0.051430  ...  0.020577 -0.056895   
C0002      -0.022453  0.247217 -0.001006 -0.028179  ... -0.037930 -0.026486   
C0003      -0.033438  0.235872 -0.038271 -0.037193  ... -0.049557 -0.042124   
C0004      -0.049865 -0.013040 -0.098337  0.041381  ... -0.066124 -0.065825   
C0005  

In [None]:
# Function to get top N similar customers
def get_top_n_similar_customers(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Number of look-alike customers to return.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by the other customers' IDs, sorted from
        most to least similar, at most ``top_n`` entries long.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = similarity_df

    scores = similarity[customer_id]
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: a tie with another customer, or a self-score below the maximum,
    # would otherwise leave the customer in its own look-alike list and push
    # a genuine neighbour out.
    others = scores.drop(labels=customer_id, errors='ignore')
    return others.sort_values(ascending=False).head(top_n)

# Look-alike scores for the first 20 customers of the similarity matrix,
# keyed by the customer they were computed for.
lookalike_results = {
    customer_id: get_top_n_similar_customers(customer_id)
    for customer_id in similarity_df.index[:20]
}

# Flatten to one record per (customer, look-alike) pair in the required format.
lookalike_list = [
    {'cust_id': cust_id, 'similar_cust_id': sim_cust_id, 'score': score}
    for cust_id, similar in lookalike_results.items()
    for sim_cust_id, score in similar.items()
]

lookalike_df = pd.DataFrame(lookalike_list)
print(lookalike_df.head())
