In [1]:
!pip install pandas matplotlib seaborn scikit-learn



**Task 2: Lookalike Model**

In [44]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Load datasets
customers = pd.read_csv('/kaggle/input/ecommerce-transactions-dataset/Customers.csv')
products = pd.read_csv('/kaggle/input/ecommerce-transactions-dataset/Products.csv')
transactions = pd.read_csv('/kaggle/input/ecommerce-transactions-dataset/Transactions.csv')


In [45]:
# Merge the datasets on CustomerID and ProductID
merged_df = pd.merge(transactions, customers, on="CustomerID", how="left")
merged_df = pd.merge(merged_df, products, on="ProductID", how="left")


In [46]:
# Encode categorical variables (Region, Product Category)
label_encoder = LabelEncoder()
merged_df['Region_encoded'] = label_encoder.fit_transform(merged_df['Region'])
merged_df['Category_encoded'] = label_encoder.fit_transform(merged_df['Category'])


In [48]:
print(merged_df.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y', 'Region_encoded',
       'Category_encoded'],
      dtype='object')


In [49]:
# Check the first few rows to see if there are any duplicate 'Price' columns
print(merged_df.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  Region_encoded  \
0  ComfortLiving Bluetooth Speaker  Electronics   300.68               1   


In [53]:
# Step 1: Merge the datasets on CustomerID and ProductID
merged_df = pd.merge(transactions, customers, on="CustomerID", how="left")
merged_df = pd.merge(merged_df, products, on="ProductID", how="left")

# Rename Price columns to avoid confusion
merged_df.rename(columns={'Price_y': 'ProductPrice'}, inplace=True)  # Using the correct price column

# Step 2: Encode categorical variables (Region, Product Category)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Encoding 'Region' and 'Category' columns
merged_df['Region_encoded'] = label_encoder.fit_transform(merged_df['Region'])
merged_df['Category_encoded'] = label_encoder.fit_transform(merged_df['Category'])

# Check if encoding worked and columns exist
print(merged_df.columns)

# Step 3: Aggregate transaction data by CustomerID (frequency, total value)
agg_df = merged_df.groupby('CustomerID').agg({
    'Region_encoded': 'first',  # Just take the first Region (as it's static for each customer)
    'Category_encoded': 'mean',  # Average of product categories bought
    'ProductPrice': 'sum',  # Total money spent by customer (using the correct column name)
    'Quantity': 'sum'  # Total quantity purchased
}).reset_index()

# Check the result of the aggregation
print(agg_df.head())


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'ProductPrice',
       'Region_encoded', 'Category_encoded'],
      dtype='object')
  CustomerID  Region_encoded  Category_encoded  ProductPrice  Quantity
0      C0001               3          1.800000       1391.67        12
1      C0002               0          2.000000        835.68        10
2      C0003               3          2.250000        782.83        14
3      C0004               3          1.625000       1925.09        23
4      C0005               0          2.333333        874.81         7


In [56]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Step 4: Calculate Similarity (Cosine Similarity)
# Create a matrix with customer features
customer_features = agg_df[['Region_encoded', 'Category_encoded', 'ProductPrice', 'Quantity']].values

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(customer_features)

# Step 5: Find the top 3 similar customers for each customer
lookalike_map = defaultdict(list)
for i in range(len(agg_df)):  # Loop through all customers
    similarities = list(enumerate(similarity_matrix[i]))
    # Sort the similarities by score (descending) and exclude the customer itself
    sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[1:4]  # Top 3
    # Add the top 3 similar customers and their similarity score
    lookalike_map[agg_df['CustomerID'].iloc[i]] = [(agg_df['CustomerID'].iloc[x[0]], x[1]) for x in sorted_similarities]

# Step 6: Prepare the output for CSV
lookalike_data = []
for customer, similar_customers in lookalike_map.items():
    for sim_customer, score in similar_customers:
        lookalike_data.append([customer, sim_customer, score])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score'])

# Step 7: Save the lookalike model to a CSV file
lookalike_df.to_csv("Lookalike_Customers.csv", index=False)

# Check the output
print(lookalike_df.head())


  CustomerID Lookalike_CustomerID  Similarity_Score
0      C0001                C0076          1.000000
1      C0001                C0133          1.000000
2      C0001                C0096          1.000000
3      C0002                C0106          0.999999
4      C0002                C0166          0.999999
