In [1]:
#to find lookalikes, i thought of two approaches. one is overkill and
#other is normal.
#the overkill one first: Facebook AI Similarity Search
#uses an Index datastructure to find similar ones, need to store vector embeddings #
# / feature vectors in our case 
#for that purpose.
import pandas as pd
import numpy as np
import faiss
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

customers_df=pd.read_csv('Customers Data.csv')
transactions_df=pd.read_csv('Transactions.csv')
products_df=pd.read_csv('Products.csv')
def create_balanced_features(customers_df, transactions_df, products_df):
    #create balanced feature vectors for customer similarity
    
    # convert dates
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
    current_date = transactions_df['TransactionDate'].max()
    
    # 1. Recency Frequency Monetary Score (40% weight)
    rfm = transactions_df.groupby('CustomerID').agg({
        'TransactionDate': lambda x: (current_date - x.max()).days,  
        'TransactionID': 'count',                                    
        'TotalValue': 'sum'                                         
    })
    rfm.columns = ['Recency', 'Frequency', 'Monetary']
    
    # Convert to scores by equally dividing into quintiles and explicitly convert to numeric
    rfm['RecencyScore'] = pd.qcut(rfm['Recency'].rank(method='first'), 
                                 q=5, labels=[5, 4, 3, 2, 1]).astype(float)
    # here what we did is that we used qcut to equally divide into quintiles of 5 groups, 5=very recent=better
    rfm['FrequencyScore'] = pd.qcut(rfm['Frequency'].rank(method='first'), 
                                   q=5, labels=[1, 2, 3, 4, 5]).astype(float)
    rfm['MonetaryScore'] = pd.qcut(rfm['Monetary'].rank(method='first'), 
                                  q=5, labels=[1, 2, 3, 4, 5]).astype(float)
    
    # Order Statistics (20% weight)
    order_stats = transactions_df.groupby('CustomerID').agg({
        'TotalValue': ['mean', 'std'],
        'Quantity': ['mean', 'std']
    }).fillna(0)
    order_stats.columns = ['AvgOrderValue', 'StdOrderValue', 
                          'AvgQuantity', 'StdQuantity']
    
    # Category Preferences (30% weight)
    enriched_transactions = transactions_df.merge(products_df, on='ProductID')
    category_pivot = pd.crosstab(
        enriched_transactions['CustomerID'], 
        enriched_transactions['Category'],
        values=enriched_transactions['TotalValue'],
        aggfunc='sum',
        normalize='index'
    ).fillna(0)
    
    # Purchase Timing (10% weight)
    timing_features = transactions_df.groupby('CustomerID').agg({
        'TransactionDate': [
            lambda x: x.dt.hour.mean(),     # Average purchase hour
            lambda x: x.dt.dayofweek.mean(), # Average day of week
            lambda x: x.dt.dayofweek.std(),  # Variation in purchase day
            lambda x: x.diff().dt.days.mean() # Average days between purchases
        ]
    }).fillna(0)
    timing_features.columns = ['AvgHour', 'AvgDayOfWeek', 
                             'DayVariation', 'PurchaseInterval']
    
    # Multiply features by their weights
    rfm_weighted = rfm[['RecencyScore', 'FrequencyScore', 'MonetaryScore']].multiply(0.4)
    order_stats_weighted = order_stats.multiply(0.2)
    category_pivot_weighted = category_pivot.multiply(0.3)
    timing_features_weighted = timing_features.multiply(0.1)
    
    # Combine all features
    customer_features = pd.concat([
        rfm_weighted,
        order_stats_weighted,
        category_pivot_weighted,
        timing_features_weighted
    ], axis=1).fillna(0)
    
    return customer_features

def find_balanced_lookalikes(customer_features, target_customers, k=3):
    # Find lookalike customers using FAISS
    
    # Scale features
    scaler = MinMaxScaler()
    normalized_features = scaler.fit_transform(customer_features.values.astype('float32'))
    normalized_features = np.ascontiguousarray(normalized_features)
    
    # Build FAISS index
    dimension = normalized_features.shape[1]
    faiss.normalize_L2(normalized_features)
    index = faiss.IndexFlatIP(dimension)
    index.add(normalized_features)
    
    customer_ids = customer_features.index.tolist()
    results = {}
    
    for target_id in target_customers:
            target_idx = customer_ids.index(target_id)
            query_vector = np.ascontiguousarray(normalized_features[target_idx:target_idx+1])
            
            # find similar ones
            D, I = index.search(query_vector, k + 1)
            
            # results
            recommendations = []
            for sim, idx in zip(D[0], I[0]):
                if customer_ids[idx] != target_id:
                    # Transform similarity score
                    similarity_score = 1 / (1 + np.exp(-5 * (sim - 0.5)))
                    recommendations.append(f"{customer_ids[idx]}:{similarity_score:.3f}")
                if len(recommendations) == k:
                    break
                    
            results[target_id] = recommendations
            
        
            
    return results

customer_features = create_balanced_features(customers_df, transactions_df, products_df)
#  target customers 1 to 20
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]
# Find lookalikes
print("finding lookalike customers")
results = find_balanced_lookalikes(customer_features, target_customers)
# Create output DataFrame
final_df = pd.DataFrame({
        'cust_id': list(results.keys()),
        'lookalikes': [','.join(v) for v in results.values()]
    })
    
# save to CSV
output_filename = 'FAISS_FirstName_LastName_Lookalike.csv'
final_df.to_csv(output_filename, index=False)
print(f"\nResults saved to {output_filename}")
    

print("\nLookalike Results for Customers C0001-C0020:")
print(final_df.to_string())

finding lookalike customers

Results saved to FAISS_FirstName_LastName_Lookalike.csv

Lookalike Results for Customers C0001-C0020:
   cust_id                           lookalikes
0    C0001  C0069:0.911,C0183:0.899,C0023:0.895
1    C0002  C0159:0.904,C0178:0.903,C0134:0.900
2    C0003  C0007:0.900,C0166:0.885,C0005:0.884
3    C0004  C0075:0.921,C0122:0.920,C0028:0.917
4    C0005  C0031:0.899,C0166:0.897,C0085:0.886
5    C0006  C0135:0.911,C0079:0.909,C0196:0.905
6    C0007  C0003:0.900,C0166:0.893,C0085:0.891
7    C0008  C0098:0.916,C0162:0.914,C0175:0.913
8    C0009  C0032:0.858,C0198:0.833,C0088:0.829
9    C0010  C0111:0.909,C0132:0.908,C0030:0.905
10   C0011  C0056:0.916,C0188:0.914,C0099:0.913
11   C0012  C0113:0.916,C0162:0.914,C0195:0.914
12   C0013  C0046:0.917,C0053:0.917,C0016:0.915
13   C0014  C0060:0.889,C0080:0.886,C0110:0.878
14   C0015  C0071:0.889,C0052:0.853,C0025:0.843
15   C0016  C0013:0.915,C0064:0.912,C0053:0.911
16   C0017  C0156:0.912,C0101:0.912,C0113:0.912
17   

In [2]:
# now the classical approach
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
 # Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]
similarity_matrix = 1 - cdist(scaled_features, scaled_features, metric='cosine')
# Find similar customers using cosine similarity"""
# Create DataFrame for easier lookup
similarity_df = pd.DataFrame(
        similarity_matrix,
        index=customer_features.index,
        columns=customer_features.index
    )    
results = {}
for target_id in target_customers:
        # Get similarities for target customer
        customer_similarities = similarity_df[target_id].sort_values(ascending=False)
        similar_customers = customer_similarities[customer_similarities.index != target_id][:5] # no.of recommendations is now set to 5
        recommendations = [f"{idx}:{score:.3f}" for idx, score in similar_customers.items()]
        results[target_id] = recommendations
# output DataFrame
final_df = pd.DataFrame({
        'cust_id': list(results.keys()),
        'lookalikes': [','.join(v) for v in results.values()]
    })
    
# save results
output_filename = 'Approach_Classic_FirstName_LastName_Lookalike.csv'
final_df.to_csv(output_filename, index=False)
print(f"\nResults saved to {output_filename}")
# Display results
print("\nLookalike Results for Customers C0001-C0020:")
print(final_df.to_string())


Results saved to Approach_Classic_FirstName_LastName_Lookalike.csv

Lookalike Results for Customers C0001-C0020:
   cust_id                                                   lookalikes
0    C0001  C0005:0.744,C0069:0.676,C0020:0.656,C0091:0.560,C0181:0.506
1    C0002  C0106:0.641,C0060:0.629,C0151:0.617,C0077:0.585,C0086:0.570
2    C0003  C0144:0.752,C0007:0.646,C0091:0.608,C0005:0.586,C0045:0.544
3    C0004  C0075:0.887,C0122:0.813,C0028:0.770,C0065:0.727,C0162:0.700
4    C0005  C0001:0.744,C0128:0.646,C0094:0.633,C0031:0.627,C0199:0.618
5    C0006  C0135:0.708,C0079:0.700,C0170:0.613,C0149:0.586,C0129:0.574
6    C0007  C0166:0.676,C0003:0.646,C0112:0.622,C0120:0.609,C0005:0.585
7    C0008  C0098:0.851,C0162:0.726,C0175:0.679,C0145:0.656,C0194:0.648
8    C0009  C0058:0.762,C0032:0.751,C0150:0.709,C0097:0.694,C0119:0.675
9    C0010  C0132:0.774,C0027:0.771,C0111:0.764,C0198:0.734,C0030:0.704
10   C0011  C0099:0.815,C0188:0.754,C0101:0.710,C0165:0.620,C0056:0.613
11   C0012  C0113:0.80

### cdist calculates pairwise distances between all customers
### metric='cosine' uses cosine distance
### 1 - cdist converts distance to similarity (higher means more similar)
### For each target customer:

Gets their similarity scores with all other customers
Sorts by similarity (highest first)
Removes the customer themselves
Takes top 5 most similar
Formats as "CustomerID:SimilarityScore"
