<a href="https://colab.research.google.com/github/RabeenaRasulla/eCommerce-Transactions-Dataset/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the datasets (assuming they are in the same directory)
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge DataFrames
df = transactions.merge(customers, on='CustomerID')
df = df.merge(products, on='ProductID')


# Create a user-item matrix (simplified)
# This version uses binary values (1 if purchased, 0 otherwise)
user_item_matrix = df.pivot_table(index='CustomerID', columns='ProductID', values='Quantity', fill_value=0)

# Handle empty cells before TF-IDF
user_item_matrix = user_item_matrix.fillna(0)  # Replace missing values with 0

# Create a TF-IDF vectorizer with a custom stop words list (optional)
stop_words = ['the', 'a', 'an', 'in', 'of']  # Example stop words (can be adjusted)
vectorizer = TfidfVectorizer(min_df=1, stop_words=stop_words)

# Apply TF-IDF to the user-item matrix
tfidf_matrix = vectorizer.fit_transform(user_item_matrix)


# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Find lookalike customers (simplified)
def find_lookalikes(customer_id, cosine_sim_matrix, n_neighbors=3):
    idx = customers[customers['CustomerID'] == customer_id].index[0]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    lookalikes = sim_scores[1:n_neighbors+1]  # Exclude the customer itself
    return lookalikes

# Create a list to store the data for the lookalike DataFrame
lookalike_data = []

for customer_id, lookalikes in lookalike_customers.items():
    for lookalike_customer, score in lookalikes:
        lookalike_data.append({  # Append data as a dictionary to the list
            'CustomerID': customer_id,
            'Lookalike_CustomerID': lookalike_customer,
            'Similarity_Score': score
        })

# Create the DataFrame outside the loop using pd.DataFrame()
lookalike_df = pd.DataFrame(lookalike_data)

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model created successfully.")
print("Results saved to Lookalike.csv")
display(lookalike_df)

Lookalike model created successfully.
Results saved to Lookalike.csv


Unnamed: 0,CustomerID,Lookalike_CustomerID,Similarity_Score
0,C0001,C0002,0.0
1,C0001,C0003,0.0
2,C0001,C0004,0.0
3,C0002,C0001,0.0
4,C0002,C0003,0.0
5,C0002,C0004,0.0
6,C0003,C0001,0.0
7,C0003,C0002,0.0
8,C0003,C0004,0.0
9,C0004,C0001,0.0
