In [20]:
import numpy as np
import pandas as pd

In [21]:
customers = pd.read_csv("C:/Users/vasala harinadha/OneDrive/Desktop/Zeotap/Customers.csv")


In [22]:
transactions = pd.read_csv("C:/Users/vasala harinadha/OneDrive/Desktop/Zeotap/Transactions.csv")


In [23]:
products = pd.read_csv("C:/Users/vasala harinadha/OneDrive/Desktop/Zeotap/Products.csv")


## Merge and Prepare the Dataset

In [32]:
print(merged_data.columns)  # Check all column names after merging


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'ProductPrice'],
      dtype='object')


In [26]:
# Rename the Products 'Price' column to distinguish it
products.rename(columns={'Price': 'ProductPrice'}, inplace=True)

# Merge with Transactions data
merged_data = transactions.merge(customers, on="CustomerID", how="left")
merged_data = merged_data.merge(products, on="ProductID", how="left")
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  \
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36  300.68  Travis Campbell  South America  2024-04-11   
4      902.04  300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  ProductPrice  
0  ComfortLiving Bluetooth Speaker  Electronics        300.68  
1  ComfortLiving Bluetooth Spe

In [27]:
# Check columns after merge
print(merged_data.columns)

# Ensure 'Price' and 'ProductPrice' exist
if 'Price' in merged_data.columns:
    print("Transaction Price column is present.")
else:
    print("Transaction Price column is missing!")

if 'ProductPrice' in merged_data.columns:
    print("Product Price column is present.")
else:
    print("Product Price column is missing!")


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'ProductPrice'],
      dtype='object')
Transaction Price column is present.
Product Price column is present.


In [36]:
customer_summary = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',              # Total spending
    'Quantity': 'sum',                # Total quantity purchased
    'Price': 'mean',                  # Average price per transaction
    'Category': lambda x: x.mode()[0]  # Most purchased category
}).reset_index()
customer_summary = customer_summary.merge(customers, on='CustomerID', how='left')
print(customer_summary.head())


  CustomerID  TotalValue  Quantity       Price     Category  \
0      C0001     3354.52        12  278.334000  Electronics   
1      C0002     1862.74        10  208.920000     Clothing   
2      C0003     2725.38        14  195.707500   Home Decor   
3      C0004     5354.88        23  240.636250        Books   
4      C0005     2034.24         7  291.603333  Electronics   

         CustomerName         Region  SignupDate  
0    Lawrence Carroll  South America  2022-07-10  
1      Elizabeth Lutz           Asia  2022-02-13  
2      Michael Rivera  South America  2024-03-07  
3  Kathleen Rodriguez  South America  2022-10-09  
4         Laura Weber           Asia  2022-08-15  


## Define Similarity Metric

In [34]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'Category' and 'Region'
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(customer_summary[['Category', 'Region']]).toarray()

# Add encoded features back to the dataset
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Category', 'Region']))
customer_summary = pd.concat([customer_summary, encoded_df], axis=1)

# Drop original categorical columns
customer_summary = customer_summary.drop(columns=['Category', 'Region'])
print(customer_summary.head())


  CustomerID  TotalValue  Quantity       Price        CustomerName  \
0      C0001     3354.52        12  278.334000    Lawrence Carroll   
1      C0002     1862.74        10  208.920000      Elizabeth Lutz   
2      C0003     2725.38        14  195.707500      Michael Rivera   
3      C0004     5354.88        23  240.636250  Kathleen Rodriguez   
4      C0005     2034.24         7  291.603333         Laura Weber   

   SignupDate  Category_Books  Category_Clothing  Category_Electronics  \
0  2022-07-10             0.0                0.0                   1.0   
1  2022-02-13             0.0                1.0                   0.0   
2  2024-03-07             0.0                0.0                   0.0   
3  2022-10-09             1.0                0.0                   0.0   
4  2022-08-15             0.0                0.0                   1.0   

   Category_Home Decor  Region_Asia  Region_Europe  Region_North America  \
0                  0.0          0.0            0.0        

# Encode Categorical Variables

In [39]:
from sklearn.metrics.pairwise import cosine_similarity


# Assuming you have already merged your data and have the 'customer_summary' DataFrame

# One-hot encode 'Category' and 'Region'
encoder = OneHotEncoder(sparse_output=False)  # Corrected argument
encoded_features = encoder.fit_transform(customer_summary[['Category', 'Region']])

# Convert the encoded features into a DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Category', 'Region']))

# Add the encoded features back to the customer summary DataFrame
customer_summary = pd.concat([customer_summary, encoded_df], axis=1)

# Drop the original categorical columns
customer_summary = customer_summary.drop(columns=['Category', 'Region'])

# Ensure the data is numeric for similarity computation
features = customer_summary.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']).values

# Compute cosine similarity between all customers
similarity_matrix = cosine_similarity(features)

# Print a part of the similarity matrix for inspection
print(similarity_matrix[:5, :5])  # Showing only the first 5 rows and columns


[[1.         0.99958028 0.9999371  0.99928248 0.99822461]
 [0.99958028 1.         0.99919951 0.99777003 0.99952708]
 [0.9999371  0.99919951 1.         0.99964103 0.9975006 ]
 [0.99928248 0.99777003 0.99964103 1.         0.99525296]
 [0.99822461 0.99952708 0.9975006  0.99525296 1.        ]]


## Generate Top 3 Lookalikes for Each Customer

In [40]:
# Creating a mapping for customer similarity (only for first 20 customers)
lookalike_map = {}
for i, customer_id in enumerate(customer_summary['CustomerID'][:20]):  # First 20 customers
    similar_indices = similarity_matrix[i].argsort()[::-1][1:4]  # Get top 3 most similar customers
    similar_customers = [(customer_summary['CustomerID'].iloc[j], similarity_matrix[i][j]) for j in similar_indices]
    lookalike_map[customer_id] = similar_customers

# Print the lookalike map for the first customer
print(lookalike_map['C0001'])


[('C0024', 0.9999994784084889), ('C0189', 0.9999994070901395), ('C0107', 0.9999992167953318)]


# PDF

In [45]:
import csv

# Sample data for Lookalikes
lookalike_data = [
    [1, 2, 0.85],
    [1, 3, 0.79],
    [2, 4, 0.88],
    [2, 5, 0.82],
    [3, 6, 0.90],
    # Add more data as needed
]

# Define the CSV file name
file_name = 'Lookalike.csv'

# Write the data to CSV
with open(file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'LookalikeID', 'SimilarityScore'])  # Header
    writer.writerows(lookalike_data)  # Write the data rows

print(f"CSV file '{file_name}' has been created successfully.")


In [49]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import pandas as pd

# Read the Lookalike DataFrame (you can replace this with your CSV file)
lookalike_df = pd.read_csv('Lookalike.csv')

# Prepare the PDF document
pdf_filename = "Lookalike_Model_Report.pdf"
c = canvas.Canvas(pdf_filename, pagesize=letter)
width, height = letter

# Add Title
c.setFont("Helvetica-Bold", 16)
c.drawString(50, height - 50, "Lookalike Model Report")

# Introduction Section
c.setFont("Helvetica", 12)
c.drawString(50, height - 100, "Introduction:")
c.drawString(50, height - 120, "This report presents the results of the Lookalike Model built on the eCommerce dataset. The model identifies similar customers based on their transactional and profile data.")

# Methodology Section
c.drawString(50, height - 160, "Methodology:")
c.drawString(50, height - 180, "1. Data Merging: Merged customer and transaction data to create a unified dataset.")
c.drawString(50, height - 200, "2. Feature Extraction: Generated features including total spending, quantity, average product price, and most purchased category.")
c.drawString(50, height - 220, "3. Cosine Similarity: Calculated cosine similarity between customers based on extracted features.")
c.drawString(50, height - 240, "4. Top N Recommendations: For each customer, the model recommends the top 3 most similar customers.")

# Results Section
c.drawString(50, height - 280, "Results:")
c.drawString(50, height - 300, "Below are the top 3 similar customers for the first 5 customers in the dataset:")

# Table of Recommendations (showing first 5 rows)
y_position = height - 340
c.setFont("Helvetica", 10)
for i, row in lookalike_df.head(5).iterrows():
    c.drawString(50, y_position, f"Customer: {row['CustomerID']} -> Lookalike: {row['LookalikeID']} -> Similarity Score: {row['SimilarityScore']}")
    y_position -= 20

# Conclusion Section
c.drawString(50, y_position - 40, "Conclusion:")
c.drawString(50, y_position - 60, "The Lookalike Model can be used to identify similar customers for targeted marketing strategies.")
c.drawString(50, y_position - 80, "By leveraging the model, businesses can increase customer engagement and optimize marketing campaigns.")

# Save the PDF
c.save()

# Notify the user
print(f"PDF report generated: {pdf_filename}")


PDF report generated: Lookalike_Model_Report.pdf
