<a href="https://colab.research.google.com/github/PawanPPatil/Data-Science-Assignment-eCommerce-Transactions-Dataset/blob/main/Pawan_Patil_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files

# Upload datasets
# files.upload() is Colab-only: it opens an upload dialog and stores the chosen
# files in the working directory, where the pd.read_csv calls below expect
# Customers.csv, Products.csv and Transactions.csv.
uploaded = files.upload()

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge datasets to combine all necessary information.
# Both joins are LEFT joins on the transactions table: every transaction row is
# kept, and a CustomerID / ProductID with no match gets NaN in the joined columns.
merged_data = transactions.merge(customers, on="CustomerID", how="left")
merged_data = merged_data.merge(products, on="ProductID", how="left")

# Preprocessing: Create a customer-product summary for feature extraction.
# Missing values are dropped before joining: " ".join() raises TypeError on a
# float NaN, which the left joins above can introduce.
customer_summary = merged_data.groupby("CustomerID").agg({
    'ProductName': lambda x: " ".join(x.dropna().astype(str)),  # Combine purchased product names
    'Category': lambda x: " ".join(x.dropna().astype(str)),     # Combine product categories
    'Region': 'first',                    # Region of the customer
    'TotalValue': 'sum',                  # Total revenue by the customer
    'Quantity': 'sum'                     # Total quantity purchased
}).reset_index()

# Combine the text fields (ProductName, Category and Region) into a single string
# for each customer. A missing Region is replaced by "" so that the concatenation
# never yields NaN (TfidfVectorizer cannot process NaN documents).
# NOTE: TotalValue and Quantity are aggregated above but are not part of the
# text features used for similarity.
customer_summary['CombinedFeatures'] = (
    customer_summary['ProductName']
    + " " + customer_summary['Category']
    + " " + customer_summary['Region'].fillna("").astype(str)
)

# Step 1: Transform textual data into numerical features using TF-IDF
# (one row per customer, in the same order as customer_summary).
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(customer_summary['CombinedFeatures'])

# Step 2: Calculate cosine similarity between all customers.
# similarity_matrix[i][j] is the similarity between rows i and j of customer_summary.
similarity_matrix = cosine_similarity(feature_matrix)

# Function to get the top-N (default 3) most similar customers for a given customer
def get_top_3_similar_customers(customer_index, similarity_matrix, customer_ids, top_n=3):
    """Return the customers most similar to the one at ``customer_index``.

    Parameters
    ----------
    customer_index : int
        Row of ``similarity_matrix`` (and position in ``customer_ids``) of the
        customer to find lookalikes for.
    similarity_matrix : numpy.ndarray
        Pairwise similarity scores; row ``i`` holds the scores of customer ``i``
        against every customer.
    customer_ids : sequence
        Customer identifiers, aligned with the rows/columns of the matrix.
    top_n : int, optional
        Maximum number of lookalikes to return (default 3).

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar. Contains fewer than ``top_n`` entries
        when there are not enough other customers.
    """
    scores = similarity_matrix[customer_index]
    ranked_indices = scores.argsort()[::-1]  # highest similarity first

    # Exclude the customer explicitly instead of assuming it is ranked first:
    # another customer with an identical feature vector also scores 1.0, and the
    # tie may place the customer itself at any position among the top entries.
    similar_indices = [i for i in ranked_indices if i != customer_index][:top_n]

    return [(customer_ids[i], scores[i]) for i in similar_indices]

# Step 3: Generate the lookalike recommendations for customers C0001 - C0020.
# Customers are selected by ID rather than by row position: customer_summary only
# contains customers that appear in the transactions, so "the first 20 rows" are
# not guaranteed to be C0001 - C0020, and there may be fewer than 20 rows at all.
TARGET_CUSTOMER_IDS = {f"C{n:04d}" for n in range(1, 21)}
customer_ids = customer_summary['CustomerID'].tolist()

# Maps each target CustomerID to its list of (similar CustomerID, score) tuples.
# Row position idx in customer_summary is also the row index into similarity_matrix.
lookalike_map = {
    cust_id: get_top_3_similar_customers(idx, similarity_matrix, customer_ids)
    for idx, cust_id in enumerate(customer_ids)
    if cust_id in TARGET_CUSTOMER_IDS
}

# Flatten the map into one record per (customer, lookalike) pair
lookalike_data = [
    {"CustomerID": cust_id, "SimilarCustomerID": similar_id, "Score": score}
    for cust_id, lookalikes in lookalike_map.items()
    for similar_id, score in lookalikes
]

# Explicit columns keep the CSV header intact even if no target customer was found
lookalike_df = pd.DataFrame(
    lookalike_data, columns=["CustomerID", "SimilarCustomerID", "Score"]
)

# Save the file to the Google Colab environment
lookalike_df.to_csv('Lookalike.csv', index=False)

# Download the Lookalike recommendations file
files.download('Lookalike.csv')

# Display Lookalike Recommendations for verification
print(lookalike_df.head(20))


Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   CustomerID SimilarCustomerID     Score
0       C0001             C0197  0.763604
1       C0001             C0026  0.719761
2       C0001             C0129  0.712534
3       C0002             C0133  0.836941
4       C0002             C0173  0.781313
5       C0002             C0022  0.710250
6       C0003             C0181  0.746779
7       C0003             C0085  0.727803
8       C0003             C0164  0.720365
9       C0004             C0118  0.777986
10      C0004             C0008  0.753547
11      C0004             C0075  0.745530
12      C0005             C0128  0.741027
13      C0005             C0096  0.710436
14      C0005             C0007  0.682733
15      C0006             C0187  0.805575
16      C0006             C0191  0.682047
17      C0006             C0139  0.668969
18      C0007             C0045  0.693040
19      C0007             C0181  0.692700
