In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For lookalike modeling
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import gdown

# Source files for the analysis: each key is a Google Drive share link and each
# value is the local file name the download is written to. The file ID sits
# between "/d/" and "/view" in every link below.
files_to_download = {
    "https://drive.google.com/file/d/1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE/view?usp=sharing": "Customers.csv",
    "https://drive.google.com/file/d/1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0/view?usp=sharing": "Products.csv",
    "https://drive.google.com/file/d/1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF/view?usp=sharing": "Transactions.csv"
}

def download_from_drive(url, output):
    """Download one Google Drive file to a local path.

    Parameters
    ----------
    url : str
        Google Drive link. Accepted shapes are ``.../d/<id>/view?...``,
        ``.../d/<id>`` (any or no trailing path) and ``...?id=<id>``.
    output : str
        Destination file name, passed unchanged to ``gdown.download``.

    Raises
    ------
    ValueError
        If no file ID can be located in ``url``.
    RuntimeError
        If ``gdown.download`` returns ``None`` (presumably how some gdown
        versions signal a failed download - confirm against the installed one).
    """
    import re  # local import: only this helper needs it

    # First alternative covers ".../d/<id>/...", second covers "...?id=<id>".
    match = re.search(r"/d/([^/?#]+)|[?&]id=([^&#]+)", url)
    if match is None:
        raise ValueError(f"Could not find a Google Drive file ID in: {url!r}")
    file_id = match.group(1) or match.group(2)

    download_url = f"https://drive.google.com/uc?id={file_id}"
    result = gdown.download(download_url, output, quiet=False)
    if result is None:
        # Fail here rather than letting a later pd.read_csv hit a missing file.
        raise RuntimeError(f"Download failed for {url!r} -> {output!r}")

# Fetch every configured file in turn, then report completion
for drive_link in files_to_download:
    download_from_drive(drive_link, files_to_download[drive_link])

print("Files downloaded successfully!")


Downloading...
From: https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/Customers.csv
100%|██████████| 8.54k/8.54k [00:00<00:00, 8.43MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/Products.csv
100%|██████████| 4.25k/4.25k [00:00<00:00, 9.70MB/s]
Downloading...
From: https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/Transactions.csv
100%|██████████| 54.7k/54.7k [00:00<00:00, 38.4MB/s]

Files downloaded successfully!





In [3]:
# Read the three raw CSV files from the working directory
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv("Transactions.csv")

# Parse SignupDate BEFORE merging so the datetime dtype is carried into
# transactions_merged as well (converting afterwards only touches customers_df).
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])

# Merge Transactions with Customers and Products for broader analysis.
# validate="many_to_one" makes pandas raise a MergeError if a CustomerID or
# ProductID is duplicated in its lookup table; without it, duplicates would
# silently repeat transaction rows and inflate every per-customer aggregate.
transactions_merged = pd.merge(
    transactions_df, customers_df,
    on="CustomerID", how="left", validate="many_to_one",
)
transactions_merged = pd.merge(
    transactions_merged, products_df,
    on="ProductID", how="left", validate="many_to_one",
)

# Convert the transaction timestamp to datetime if not already
transactions_merged['TransactionDate'] = pd.to_datetime(transactions_merged['TransactionDate'])


In [4]:
# Build one row per customer in a single chain:
#   1. sum TotalValue and count TransactionID per CustomerID
#   2. derive the mean value per transaction
#   3. attach the customer's Region
#   4. one-hot encode Region into Region_* indicator columns
customer_agg = (
    transactions_merged
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        NumTransactions=('TransactionID', 'count'),
    )
    .reset_index()
    .assign(AvgPurchaseValue=lambda agg: agg['TotalSpend'] / agg['NumTransactions'])
    .merge(customers_df[['CustomerID', 'Region']], on='CustomerID', how='left')
    .pipe(pd.get_dummies, columns=['Region'])
)

customer_agg.head()


Unnamed: 0,CustomerID,TotalSpend,NumTransactions,AvgPurchaseValue,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,5,670.904,False,False,False,True
1,C0002,1862.74,4,465.685,True,False,False,False
2,C0003,2725.38,4,681.345,False,False,False,True
3,C0004,5354.88,8,669.36,False,False,False,True
4,C0005,2034.24,3,678.08,True,False,False,False


In [5]:
# Prepare feature matrix: the three spend metrics plus every Region_* dummy
feature_cols = ['TotalSpend', 'NumTransactions', 'AvgPurchaseValue'] + \
               [col for col in customer_agg.columns if col.startswith('Region_')]
# Ask for float explicitly: the selection mixes float, int and bool columns,
# and a plain `.values` on such a mix can come back as an object-dtype array.
X = customer_agg[feature_cols].to_numpy(dtype=float)

# Scale every feature to zero mean / unit variance so no single column
# dominates the similarity
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Pairwise cosine similarity; row/column i corresponds to the i-th row of customer_agg
similarity_matrix = cosine_similarity(X_scaled)

# Map CustomerID <-> row POSITION. similarity_matrix is indexed by position,
# so use enumerate() instead of customer_agg.index, whose labels only match
# positions while the frame keeps a default 0..n-1 index.
cust_id_to_idx = {cust_id: pos for pos, cust_id in enumerate(customer_agg['CustomerID'])}
idx_to_cust_id = dict(enumerate(customer_agg['CustomerID']))


In [6]:
# CustomerID -> list of (lookalike CustomerID, similarity score) pairs
lookalike_results = {}

# Target customers are C0001 .. C0020
for target_id in (f"C{n:04d}" for n in range(1, 21)):
    row = cust_id_to_idx.get(target_id)
    if row is None:
        # Customer has no row in the feature matrix: record an empty result
        lookalike_results[target_id] = []
        continue

    # Similarity of the target customer to every customer (itself included)
    row_scores = similarity_matrix[row]

    # Rank by descending similarity, then drop the target's own position
    ranked = np.argsort(row_scores)[::-1]
    ranked = ranked[ranked != row]

    # Keep the three best matches as (CustomerID, score rounded to 4 decimals)
    lookalike_results[target_id] = [
        (idx_to_cust_id[pos], round(float(row_scores[pos]), 4))
        for pos in ranked[:3]
    ]

# Tabular view: one row per target customer
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results),
    'Lookalikes': list(lookalike_results.values()),
})

lookalike_df.head(10)


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0137, 0.9999), (C0152, 0.9999), (C0107, 0.9..."
1,C0002,"[(C0142, 0.9862), (C0088, 0.9799), (C0043, 0.9..."
2,C0003,"[(C0133, 0.9951), (C0052, 0.9898), (C0112, 0.9..."
3,C0004,"[(C0113, 0.9864), (C0108, 0.9827), (C0102, 0.9..."
4,C0005,"[(C0159, 0.9997), (C0186, 0.9871), (C0123, 0.9..."
5,C0006,"[(C0158, 0.9757), (C0168, 0.9714), (C0171, 0.9..."
6,C0007,"[(C0140, 0.9785), (C0092, 0.9645), (C0193, 0.9..."
7,C0008,"[(C0109, 0.9759), (C0139, 0.9689), (C0098, 0.9..."
8,C0009,"[(C0121, 0.9862), (C0010, 0.9751), (C0060, 0.9..."
9,C0010,"[(C0199, 0.994), (C0111, 0.9801), (C0009, 0.97..."


In [9]:
# One [customer_id, lookalikes] row per target customer, in insertion order
formatted_lookalikes = list(map(list, lookalike_results.items()))

# Two unnamed columns: the customer ID and its list of (lookalike, score) pairs
formatted_lookalike_df = pd.DataFrame(formatted_lookalikes)

# Write the rows as-is: no header line and no index column
formatted_lookalike_df.to_csv("Vishal_Singh_Lookalike.csv", index=False, header=False)
