In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('/content/drive/MyDrive/projects/E-Commerce Data Science Project/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/projects/E-Commerce Data Science Project/Products.csv')
transactions = pd.read_csv('/content/drive/MyDrive/projects/E-Commerce Data Science Project/Transactions.csv')

# Merge products and transactions data on 'ProductID'
product_transactions = pd.merge(products, transactions, on='ProductID', how='left')

# Create a customer-product matrix for similarity computation
customer_product_matrix = pd.pivot_table(product_transactions, index='CustomerID', columns='ProductID', values='Quantity', fill_value=0)

# Standardize the data
scaler = StandardScaler()
standardized_matrix = scaler.fit_transform(customer_product_matrix)

# Compute cosine similarity between customers
similarity_matrix = cosine_similarity(standardized_matrix)

# Convert to DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=customer_product_matrix.index, columns=customer_product_matrix.index)

# Function to find top N lookalikes for a given customer
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:top_n+1]  # Exclude the customer itself
    return list(zip(similar_customers.index, similar_customers.values))

# Get top 3 lookalikes for first 20 customers (C0001 - C0020)
lookalikes = {}
for customer_id in customers['CustomerID'][:20]:  # Iterate through the first 20 customer IDs
    lookalikes[customer_id] = get_top_lookalikes(customer_id, similarity_df)

# Convert lookalikes into a DataFrame and save to CSV
lookalikes_df = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()),
    'Lookalikes': [v for v in lookalikes.values()]  # Store as list of tuples (customer_id, score)
})

lookalikes_df.to_csv('Lookalike.csv', index=False)
print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv


In [16]:
lookalikes_df.to_csv('/content/drive/MyDrive/projects/E-Commerce Data Science Project/Lookalike.csv', index=False)

In [14]:
print(lookalikes)

{'C0001': [('C0194', 0.403396266993008), ('C0020', 0.3653990212204828), ('C0104', 0.34248752149926986)], 'C0002': [('C0091', 0.4340902480599487), ('C0030', 0.40365337637928855), ('C0071', 0.32063620965199896)], 'C0003': [('C0181', 0.47469321761270017), ('C0134', 0.46854469246418057), ('C0144', 0.4080853545633678)], 'C0004': [('C0070', 0.38358072245428865), ('C0175', 0.3071395013311464), ('C0105', 0.26965547925167827)], 'C0005': [('C0096', 0.48776309944956087), ('C0023', 0.47056277311097094), ('C0055', 0.3780574021622297)], 'C0006': [('C0040', 0.4571832502984092), ('C0196', 0.3841944645035154), ('C0058', 0.37537113954618717)], 'C0007': [('C0079', 0.6176418329255738), ('C0118', 0.47168438980984445), ('C0020', 0.4571595870720856)], 'C0008': [('C0144', 0.3055166514172029), ('C0028', 0.28311706080516774), ('C0165', 0.26642269454273465)], 'C0009': [('C0140', 0.5265061974196308), ('C0083', 0.4931338776850121), ('C0162', 0.47164927440704874)], 'C0010': [('C0094', 0.486442319457847), ('C0143', 