In [32]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Read the three source tables from the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Enrich every transaction with its customer's and its product's attributes
# (default inner joins: rows without a match in either table are dropped)
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)



In [33]:
# Feature Engineering: collapse the transaction rows into one profile row per customer
transactions_by_customer = merged_data.groupby('CustomerID')
customer_profiles = transactions_by_customer.agg(
    total_spending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    num_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    # mode() can return several tied categories; [0] keeps the first of them
    favorite_category=pd.NamedAgg(
        column='Category',
        aggfunc=lambda categories: categories.mode()[0],
    ),
).reset_index()



In [34]:
# Attach the customer table's columns to each aggregated profile
customer_profiles = customer_profiles.merge(customers, on='CustomerID')

# Expand 'Region' into one indicator column per distinct region
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])

# Rescale the spending/count features to the [0, 1] range so that no single
# feature dominates the Euclidean distances purely because of its units
numerical_features = ['total_spending', 'avg_transaction_value', 'num_transactions']
scaler = MinMaxScaler().fit(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaler.transform(customer_profiles[numerical_features])

# Feature matrix for clustering: everything except identifiers and free-text columns
identifier_columns = ['CustomerID', 'CustomerName', 'SignupDate', 'favorite_category']
X = customer_profiles.drop(columns=identifier_columns)


In [35]:

# Perform K-Means Clustering
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it unset makes the cluster labels depend on the
# installed library version even though random_state is fixed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)  # Adjust number of clusters as needed
customer_profiles['Cluster'] = kmeans.fit_predict(X)



In [36]:
# Find the closest look-alikes of each target customer inside that customer's own cluster.
#
# Result: lookalike_map maps CustomerID -> [(neighbor CustomerID, similarity), ...],
# nearest neighbor first, with similarity = 1 / (1 + Euclidean distance), so an
# identical profile scores 1.0 and the score decays towards 0 as distance grows.
target_customers = {f'C{n:04d}' for n in range(1, 21)}  # Only the first 20 customers (C0001..C0020)
n_lookalikes = 3
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate', 'favorite_category', 'Cluster']

lookalike_map = {}
for cluster in customer_profiles['Cluster'].unique():
    cluster_data = customer_profiles[customer_profiles['Cluster'] == cluster]
    cluster_ids = cluster_data['CustomerID'].values
    # Force a float matrix: the one-hot columns may be bool/uint8 depending on the pandas version
    cluster_features = cluster_data.drop(columns=non_feature_columns).to_numpy(dtype=float)

    # A cluster may contain fewer than n_lookalikes other customers; requesting more
    # neighbors than exist makes NearestNeighbors raise ValueError.
    k = min(n_lookalikes, len(cluster_ids) - 1)
    if k < 1:
        # Singleton cluster: there is nobody to compare against
        for customer_id in cluster_ids:
            if customer_id in target_customers:
                lookalike_map[customer_id] = []
        continue

    # Fit Nearest Neighbors model
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    nn.fit(cluster_features)

    # Calling kneighbors() without a query returns the neighbors of every fitted
    # point and never counts a point as its own neighbor. This stays correct even
    # when several customers have identical feature vectors (distance 0 ties),
    # where "drop the first result" could discard a real neighbor instead of self.
    distances, indices = nn.kneighbors()
    for i, customer_id in enumerate(cluster_ids):
        if customer_id not in target_customers:
            continue
        neighbor_ids = [cluster_ids[j] for j in indices[i]]
        neighbor_scores = [1 / (1 + d) for d in distances[i]]  # Convert distance to similarity score
        lookalike_map[customer_id] = list(zip(neighbor_ids, neighbor_scores))


In [37]:

# Order the look-alike entries by CustomerID
sorted_lookalike_map = dict(sorted(lookalike_map.items(), key=lambda entry: entry[0]))

# One row per customer, one column per look-alike; each cell holds an (id, score) tuple
lookalike_columns = ['Lookalike1', 'Lookalike2', 'Lookalike3']
lookalike_df = pd.DataFrame.from_dict(sorted_lookalike_map, orient='index')
lookalike_df.columns = lookalike_columns

# Write the table to disk, exposing the index as the 'CustomerID' column
lookalike_df.to_csv('Rahul_Kannan_Lookalike.csv', index=True, index_label='CustomerID')

# Echo the look-alikes for each customer, in CustomerID order
for source_id, matches in sorted_lookalike_map.items():
    print(f"{source_id}: {matches}")

C0001: [('C0137', 0.9959126526309704), ('C0152', 0.9941795833573235), ('C0107', 0.9520858336674961)]
C0002: [('C0142', 0.9227381553764861), ('C0088', 0.9026520852250701), ('C0134', 0.8784613267418189)]
C0003: [('C0133', 0.9658399822913127), ('C0052', 0.94983219593737), ('C0191', 0.8908775388091351)]
C0004: [('C0113', 0.930282741787746), ('C0102', 0.9030591301959036), ('C0108', 0.8989429590094472)]
C0005: [('C0159', 0.9892476724430751), ('C0186', 0.9279666659374441), ('C0146', 0.8963227171790878)]
C0006: [('C0158', 0.8978962008633448), ('C0171', 0.8816734557255188), ('C0187', 0.8681787559992562)]
C0007: [('C0140', 0.8792447396662206), ('C0092', 0.8788930693880573), ('C0193', 0.8783911724209452)]
C0008: [('C0109', 0.8599775351124415), ('C0139', 0.8270859090113001), ('C0098', 0.8146925791092872)]
C0009: [('C0121', 0.9011458327180681), ('C0010', 0.8586921839815093), ('C0198', 0.8563385169549942)]
C0010: [('C0199', 0.944967079656729), ('C0111', 0.9036531286381025), ('C0103', 0.8830625758311

The Lookalike Model successfully segments customers with K-Means clustering and then, within each cluster, identifies each customer's most similar peers using a Euclidean nearest-neighbor search, scoring similarity as 1 / (1 + distance). The methodology provides a scalable and data-driven approach to enhance customer engagement, optimize marketing campaigns, and drive business growth. Future improvements can include incorporating deep-learning-based similarity models or additional customer behavioral attributes for enhanced accuracy.