In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Read the three raw tables from disk
customers = pd.read_csv('D:/DSA/o/Customers.csv')
products = pd.read_csv('D:/DSA/o/Products.csv')
transactions = pd.read_csv('D:/DSA/o/Transactions.csv')

# Both tables carry a 'Price' column; give each one a distinct name so the
# merge below does not produce ambiguous suffixed columns
transactions = transactions.rename(columns={'Price': 'TransactionPrice'})
products = products.rename(columns={'Price': 'ProductPrice'})

# Attach product details to every transaction, then attach customer details
transactions = pd.merge(transactions, products, on='ProductID', how='left')
customer_transactions = pd.merge(
    transactions, customers, on='CustomerID', how='left'
)

# Feature engineering: collapse the transaction rows into one row per customer
customer_profiles = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),        # total spending
        Quantity=('Quantity', 'sum'),            # total number of items bought
        ProductPrice=('ProductPrice', 'mean'),   # average price of purchased products
    )
    .reset_index()
)

# Put the three features on a common scale before comparing customers
scaler = StandardScaler()
numeric_features = customer_profiles[['TotalValue', 'Quantity', 'ProductPrice']]
numeric_features_scaled = scaler.fit_transform(numeric_features)

# Pairwise cosine similarity between every pair of customer profiles
similarity_matrix = cosine_similarity(numeric_features_scaled)

# Label rows and columns with customer IDs so scores can be looked up by ID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_profiles['CustomerID'],
    columns=customer_profiles['CustomerID'],
)

# Define a function to get top similar customers for a given customer
def get_top_similar(customer_id, similarity_df, num_results=3):
    """Return the customers most similar to ``customer_id``.

    Args:
        customer_id: Customer label to look up in ``similarity_df``.
        similarity_df: DataFrame of pairwise similarity scores whose index
            and columns are both customer IDs.
        num_results: Maximum number of lookalikes to return.

    Returns:
        A list of ``(other_customer_id, score)`` tuples ordered from most to
        least similar, with each score rounded to 3 decimal places. The list
        is empty when ``customer_id`` is not present in the index.
    """
    if customer_id not in similarity_df.index:
        return []
    # Remove the customer's own entry by label rather than by position:
    # another customer with an identical profile also scores 1.0, so the
    # customer is not guaranteed to sort first.
    others = similarity_df[customer_id].drop(labels=customer_id, errors="ignore")
    # A stable sort keeps tied scores in their original index order, which
    # makes the result deterministic.
    top_similar = others.sort_values(ascending=False, kind="mergesort").iloc[:num_results]
    # Cast to a plain float so the scores serialize with json.dumps
    # regardless of the underlying numpy dtype.
    return [(cust_id, round(float(score), 3)) for cust_id, score in top_similar.items()]

# Restrict the report to the first 20 customers listed in the Customers dataset
first_20_customers = customers['CustomerID'].head(20)

# For each of those customers, collect the top lookalikes as a list of
# {"cust_id", "score"} records
lookalike_map = {
    source_id: [
        {"cust_id": match_id, "score": match_score}
        for match_id, match_score in get_top_similar(source_id, similarity_df)
    ]
    for source_id in first_20_customers
}

# Flatten the map into a two-column table; each lookalike list is stored as a
# JSON string so it fits in a single CSV cell
lookalike_map_df = pd.DataFrame(
    [(source_id, json.dumps(matches)) for source_id, matches in lookalike_map.items()],
    columns=["CustomerID", "Lookalikes"],
)

# Write the table to disk
lookalike_map_file_path = 'Lookalike_Map.csv'
lookalike_map_df.to_csv(lookalike_map_file_path, index=False)

# Report where the file was written
print(f"Lookalike map saved to {lookalike_map_file_path}")

# Show the first few rows as a quick sanity check
print(lookalike_map_df.head())


Lookalike map saved to Lookalike_Map.csv
  CustomerID                                         Lookalikes
0      C0001  [{"cust_id": "C0103", "score": 0.998}, {"cust_...
1      C0002  [{"cust_id": "C0029", "score": 1.0}, {"cust_id...
2      C0003  [{"cust_id": "C0111", "score": 0.998}, {"cust_...
3      C0004  [{"cust_id": "C0165", "score": 0.998}, {"cust_...
4      C0005  [{"cust_id": "C0167", "score": 1.0}, {"cust_id...
