Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


Loading Given Datasets

In [2]:
# Each source table is fetched from its Google Drive direct-download link.
# The URL and the resulting DataFrame are kept side by side per table.

# Customers table.
customers_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
customers = pd.read_csv(customers_url)

# Products table.
products_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
products = pd.read_csv(products_url)

# Transactions table.
transactions_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"
transactions = pd.read_csv(transactions_url)

Step 1: Merge the datasets into a single combined table.

In [3]:
# Join transactions to customers on CustomerID, then to products on ProductID.
# Both joins use pandas' default inner join, so only transactions whose
# customer and product both exist in the lookup tables are kept.
data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

Step 2: Create a customer-product matrix


In [4]:
# One row per CustomerID, one column per ProductID; each cell is the total
# Quantity that customer bought of that product (0 when there is no purchase).
customer_product_matrix = pd.pivot_table(
    data,
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

 Step 3: Normalize the matrix
 # StandardScaler standardizes each product column to zero mean and unit variance, so that frequently purchased products do not dominate the similarity scores

In [5]:
# Standardize every product column of the customer-product matrix.
# The fitted scaler is kept in `scaler`; the result is a NumPy array whose
# rows are in the same order as customer_product_matrix.index.
scaler = StandardScaler()
scaler.fit(customer_product_matrix)
normalized_matrix = scaler.transform(customer_product_matrix)

Step 4: Compute cosine similarity
# Cosine similarity is calculated to measure the similarity between customers based on product purchase patterns

In [6]:
# Pairwise cosine similarity between the customers' standardized purchase rows.
similarity_matrix = cosine_similarity(normalized_matrix)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)


Step 5: Function to get the top 3 lookalikes
# This function identifies the top 'n' similar customers for a given customer_id

In [7]:
def get_top_lookalikes(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id :
        Label of the customer to find lookalikes for.
    n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield ``[]``.
    similarity : pandas.DataFrame, optional
        Square customer-by-customer similarity table. Defaults to the
        notebook-level ``similarity_df``.

    Returns
    -------
    list[tuple[object, float]]
        ``(other_customer_id, score)`` pairs sorted by descending score.
        Scores are built-in ``float`` values. The list is empty when
        ``customer_id`` has no column in the similarity table (for example a
        customer without any transactions).
    """
    if similarity is None:
        similarity = similarity_df

    # Customers absent from the matrix have no purchase profile to compare.
    if customer_id not in similarity.columns:
        return []

    # Remove the customer explicitly instead of slicing off the first row:
    # another customer can tie with the self-similarity of 1.0, in which case
    # the customer is not guaranteed to be sorted first.
    scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in a deterministic (index) order.
    top = scores.sort_values(ascending=False, kind="mergesort").head(max(n, 0))

    # Built-in floats keep str()/CSV output free of NumPy scalar reprs
    # such as "np.float64(0.5)" (NumPy >= 2).
    return [(other_id, float(score)) for other_id, score in top.items()]

Step 6: Generate Lookalike.csv
# Iterate through the first 20 customers in the dataset to generate their lookalike data
# Transform the lookalike data into a DataFrame for easier export

In [8]:
# Collect the top lookalikes for the first 20 customers listed in `customers`.
# Scores are converted to built-in floats so that str() below writes plain
# numbers (e.g. "0.87") instead of NumPy scalar reprs ("np.float64(0.87)"),
# which is what NumPy >= 2 produces for float64 values inside a list/tuple.
lookalike_data = {}
for customer_id in customers['CustomerID'][:20]:
    lookalike_data[customer_id] = [
        (lookalike_id, float(score))
        for lookalike_id, score in get_top_lookalikes(customer_id)
    ]

# One row per customer; the lookalike list is stored as its string form.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_data),
    'Lookalikes': [str(pairs) for pairs in lookalike_data.values()],
})

# The results are saved in a CSV file for further analysis or integration
lookalike_df.to_csv('Somu_Likitha_Lookalike.csv', index=False)