In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Lookalike model: build a per-customer feature profile from the transaction
# history, standardise it, and rank customers by cosine similarity.

# Load datasets
# NOTE(review): absolute, machine-specific path; point this at the folder that
# holds the three CSV files before re-running on another machine.
folder_path = r"C:/Users/lenovo/Desktop/EDA_folder"

# Loading CSV files
customers = pd.read_csv(f"{folder_path}/Customers.csv")
products = pd.read_csv(f"{folder_path}/Products.csv")
transactions = pd.read_csv(f"{folder_path}/Transactions.csv")

# Merge transactions with product details (left join keeps every transaction,
# even if its ProductID is missing from the product table).
transactions = transactions.merge(products, on="ProductID", how="left")


def _most_common_value(values):
    """Return the most frequent non-null value of a Series, or NaN if there is none."""
    modes = values.mode()  # computed once; on ties the first mode returned is used
    return modes.iloc[0] if not modes.empty else np.nan


# Aggregate transaction data for each customer
customer_profile = transactions.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    num_transactions=("TransactionID", "count"),
    avg_transaction_value=("TotalValue", "mean"),
    unique_products=("ProductID", "nunique"),
    most_common_category=("Category", _most_common_value),
    total_quantity=("Quantity", "sum"),
    avg_quantity=("Quantity", "mean"),
    max_transaction_value=("TotalValue", "max"),
    min_transaction_value=("TotalValue", "min"),
    std_transaction_value=("TotalValue", "std")
).reset_index()

# Merge customer profile with customer information. The left join keeps
# customers without any transaction; their profile columns are NaN.
data = customers.merge(customer_profile, on="CustomerID", how="left")

# The similarity matrix is indexed by CustomerID and looked up by label below,
# so duplicated IDs would make the lookup ambiguous. Fail early with a clear message.
if not data["CustomerID"].is_unique:
    raise ValueError("CustomerID values must be unique to build the similarity matrix")

# Encode categorical variables
region_dummies = pd.get_dummies(data["Region"], prefix="Region")
category_dummies = pd.get_dummies(data["most_common_category"], prefix="Category")
data = pd.concat([data, region_dummies, category_dummies], axis=1)

# Select relevant features for similarity calculation
features = ["total_spent", "num_transactions", "avg_transaction_value", "unique_products" , "total_quantity", "avg_quantity",
    "max_transaction_value", "min_transaction_value", "std_transaction_value"] + list(region_dummies.columns) + list(category_dummies.columns)

# Fill NaNs with 0 and scale data. NaNs come from customers with no
# transactions and from the sample std of a single transaction.
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features].fillna(0))

# Compute similarity matrix (rows and columns are both labelled by CustomerID)
similarity_matrix = cosine_similarity(data[features])
similarity_df = pd.DataFrame(similarity_matrix, index=data["CustomerID"], columns=data["CustomerID"])

# Find top 3 lookalikes for each of the first 20 customers
lookalike_dict = {}
target_customers = data["CustomerID"].head(20)
for cust_id in target_customers:
    # Remove the customer itself by label instead of assuming it sorts first:
    # a customer with an identical feature vector also scores 1.0 and could
    # otherwise push the target into its own lookalike list.
    similar_customers = similarity_df.loc[cust_id].drop(cust_id).nlargest(3)
    # Cast to a builtin float so the printed/CSV value is a plain number
    # regardless of how the installed NumPy renders its scalar types.
    lookalike_dict[cust_id] = [(cust, round(float(score), 4)) for cust, score in similar_customers.items()]

# Save lookalike results to CSV: one row per target customer, one
# "(customer_id, score)" tuple per column, no header row.
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index')
lookalike_df.to_csv("Lookalike.csv", header=False)

# Display the first few results
print("\nTop 3 Lookalikes for Each of the First 20 Customers with Similarity Scores:")
for cust_id, lookalikes in lookalike_dict.items():
    print(f"Customer {cust_id}: {lookalikes}")



Top 3 Lookalikes for Each of the First 20 Customers with Similarity Scores:
Customer C0001: [('C0190', 0.9473), ('C0181', 0.9047), ('C0192', 0.8912)]
Customer C0002: [('C0056', 0.9328), ('C0088', 0.8888), ('C0106', 0.8111)]
Customer C0003: [('C0052', 0.8199), ('C0195', 0.7344), ('C0163', 0.731)]
Customer C0004: [('C0155', 0.8466), ('C0087', 0.8405), ('C0153', 0.7401)]
Customer C0005: [('C0146', 0.9622), ('C0186', 0.9569), ('C0007', 0.9174)]
Customer C0006: [('C0168', 0.9672), ('C0171', 0.8985), ('C0011', 0.8593)]
Customer C0007: [('C0115', 0.9271), ('C0005', 0.9174), ('C0146', 0.8675)]
Customer C0008: [('C0065', 0.8233), ('C0059', 0.7823), ('C0156', 0.6849)]
Customer C0009: [('C0103', 0.9108), ('C0198', 0.8412), ('C0083', 0.8155)]
Customer C0010: [('C0111', 0.9486), ('C0062', 0.9104), ('C0061', 0.757)]
Customer C0011: [('C0191', 0.8961), ('C0187', 0.8798), ('C0137', 0.8763)]
Customer C0012: [('C0113', 0.9506), ('C0104', 0.8959), ('C0163', 0.8732)]
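Customer C0013: [('C0099', 0.9628), ('C0108', 0.9243), ('C0107', 0.7525)]
Customer C0014: [('C0060', 0.9697), ('C0097', 0.8152), ('C0128', 0.7995)]
Customer C0015: [('C0036', 0.9283), ('C0131', 0.8736), ('C0125', 0.8301)]
Customer C0016: [('C0042', 0.8881), ('C0067', 0.8542), ('C0183', 0.8509)]
Customer C0017: [('C0075', 0.9535), ('C0041', 0.8437), ('C0023', 0.7917)]
Customer C0018: [('C0117', 0.8217), ('C0046', 0.7843), ('C0122', 0.6473)]
Customer C0019: [('C0121', 0.9177), ('C0119', 0.7009), ('C0132', 0.6147)]
Customer C0020: [('C0050', 0.92), ('C0120', 0.778), ('C0185', 0.7627)]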

In [8]:
import pandas as pd

# Top 3 lookalikes for each customer, keyed by customer ID.
# Every entry is a list of (lookalike customer ID, similarity score) tuples.
top_matches = {
    'C0001': [('C0190', 0.9473), ('C0181', 0.9047), ('C0192', 0.8912)],
    'C0002': [('C0056', 0.9328), ('C0088', 0.8888), ('C0106', 0.8111)],
    'C0003': [('C0052', 0.8199), ('C0195', 0.7344), ('C0163', 0.731)],
    'C0004': [('C0155', 0.8466), ('C0087', 0.8405), ('C0153', 0.7401)],
    'C0005': [('C0146', 0.9622), ('C0186', 0.9569), ('C0007', 0.9174)],
    'C0006': [('C0168', 0.9672), ('C0171', 0.8985), ('C0011', 0.8593)],
    'C0007': [('C0115', 0.9271), ('C0005', 0.9174), ('C0146', 0.8675)],
    'C0008': [('C0065', 0.8233), ('C0059', 0.7823), ('C0156', 0.6849)],
    'C0009': [('C0103', 0.9108), ('C0198', 0.8412), ('C0083', 0.8155)],
    'C0010': [('C0111', 0.9486), ('C0062', 0.9104), ('C0061', 0.757)],
    'C0011': [('C0191', 0.8961), ('C0187', 0.8798), ('C0137', 0.8763)],
    'C0012': [('C0113', 0.9506), ('C0104', 0.8959), ('C0163', 0.8732)],
    'C0013': [('C0099', 0.9628), ('C0108', 0.9243), ('C0107', 0.7525)],
    'C0014': [('C0060', 0.9697), ('C0097', 0.8152), ('C0128', 0.7995)],
    'C0015': [('C0036', 0.9283), ('C0131', 0.8736), ('C0125', 0.8301)],
    'C0016': [('C0042', 0.8881), ('C0067', 0.8542), ('C0183', 0.8509)],
    'C0017': [('C0075', 0.9535), ('C0041', 0.8437), ('C0023', 0.7917)],
    'C0018': [('C0117', 0.8217), ('C0046', 0.7843), ('C0122', 0.6473)],
    'C0019': [('C0121', 0.9177), ('C0119', 0.7009), ('C0132', 0.6147)],
    'C0020': [('C0050', 0.92), ('C0120', 0.778), ('C0185', 0.7627)],
}

# Split the mapping into two parallel columns; dicts preserve insertion
# order, so the IDs and their match lists stay aligned row by row.
lookalike_data = {
    'cust_id': list(top_matches.keys()),
    'lookalikes': list(top_matches.values()),
}

# One row per customer: the ID plus its list of (lookalike, score) tuples.
lookalike_df = pd.DataFrame(lookalike_data)

# Write the table to disk with a header row and without the positional index.
lookalike_df.to_csv('Sanya_Arora_Lookalike.csv', index=False)
