## Setup and Load Datasets

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
customers = pd.read_csv("Customers.csv")  # Replace with your file path
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Display the first few rows of each dataset
print("Customers Dataset:")
print(customers.head())

print("\nProducts Dataset:")
print(products.head())

print("\nTransactions Dataset:")
print(transactions.head())


Customers Dataset:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Dataset:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Dataset:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166   

## Merge and Preprocess Data
### Combine the datasets into a single DataFrame and create useful features.

In [2]:
# Merge Transactions with Products
transactions_products = transactions.merge(products, on="ProductID", how="left")

# Merge with Customers
full_data = transactions_products.merge(customers, on="CustomerID", how="left")

# Display merged data
print("\nMerged Dataset:")
print(full_data.head())



Merged Dataset:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins   

### Feature Engineering:

In [3]:
# Total spending per customer
customer_spending = full_data.groupby('CustomerID')['TotalValue'].sum().reset_index()
customer_spending.rename(columns={'TotalValue': 'TotalSpending'}, inplace=True)

# Average transaction value per customer
avg_transaction = full_data.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction.rename(columns={'TotalValue': 'AvgTransactionValue'}, inplace=True)

# Combine features into a single DataFrame
customer_features = customers.merge(customer_spending, on='CustomerID', how='left')
customer_features = customer_features.merge(avg_transaction, on='CustomerID', how='left')

# Fill missing values with 0
customer_features.fillna(0, inplace=True)
print("\nCustomer Features:")
print(customer_features.head())



Customer Features:
  CustomerID        CustomerName         Region  SignupDate  TotalSpending  \
0      C0001    Lawrence Carroll  South America  2022-07-10        3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13        1862.74   
2      C0003      Michael Rivera  South America  2024-03-07        2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09        5354.88   
4      C0005         Laura Weber           Asia  2022-08-15        2034.24   

   AvgTransactionValue  
0              670.904  
1              465.685  
2              681.345  
3              669.360  
4              678.080  


## Normalize Features

In [5]:


# Normalize Features
scaler = StandardScaler()
# Select only numerical features for scaling
numerical_features = customer_features.select_dtypes(include=['number']).iloc[:, 1:] 
customer_features_scaled = scaler.fit_transform(numerical_features)  # Scale numerical columns


## Compute Similarity and Generate Lookalikes

In [6]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(customer_features_scaled)

# Generate top 3 lookalike recommendations
lookalike_dict = {}

for idx, customer_id in enumerate(customer_features['CustomerID']):
    # Sort similarities in descending order (skip self-similarity)
    similar_indices = np.argsort(-similarity_matrix[idx])[1:4]
    similar_customers = customer_features.iloc[similar_indices]['CustomerID']
    similarity_scores = similarity_matrix[idx][similar_indices]
    
    # Save results to dictionary
    lookalike_dict[customer_id] = list(zip(similar_customers, similarity_scores))

# Preview the lookalike recommendations
for key, value in list(lookalike_dict.items())[:5]:  # Show first 5 customers
    print(f"Customer {key} -> Lookalikes: {value}")


Customer C0001 -> Lookalikes: [('C0095', 1.0), ('C0094', 1.0), ('C0093', 1.0)]
Customer C0002 -> Lookalikes: [('C0095', 1.0), ('C0094', 1.0), ('C0093', 1.0)]
Customer C0003 -> Lookalikes: [('C0095', 1.0), ('C0094', 1.0), ('C0093', 1.0)]
Customer C0004 -> Lookalikes: [('C0095', 1.0), ('C0094', 1.0), ('C0093', 1.0)]
Customer C0005 -> Lookalikes: [('C0095', 1.0), ('C0094', 1.0), ('C0093', 1.0)]


## Save Results

In [7]:
# Create Lookalike.csv
lookalike_list = []

for cust_id, lookalikes in lookalike_dict.items():
    for lookalike_id, score in lookalikes:
        lookalike_list.append([cust_id, lookalike_id, score])

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("\nLookalike.csv created successfully!")



Lookalike.csv created successfully!
