In [1]:
# Mount Google Drive into the Colab VM; the data-loading cell reads its CSVs
# from a folder under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load Data

In [4]:
# Drive folder that holds the three assignment CSV files.
path = "/content/drive/MyDrive/Assignment/"

# Each table is read into its own DataFrame; the reads are independent.
customers = pd.read_csv(path + 'Customers.csv')
products = pd.read_csv(path + 'Products.csv')
transactions = pd.read_csv(path + 'Transactions.csv')

# Feature Engineering

In [5]:
# Attach product attributes to every transaction row.
# Both tables carry a Price column, so pandas suffixes them Price_x / Price_y.
trans_prod = transactions.merge(products, on='ProductID')
trans_prod.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [23]:
# Per-customer purchase statistics: total/average spend, total/average
# quantity, and number of transactions.
purchase_aggregations = {
    'TotalValue': ['sum', 'mean'],
    'Quantity': ['sum', 'mean'],
    'TransactionID': 'count',
}
customer_features = (
    trans_prod
    .groupby('CustomerID')
    .agg(purchase_aggregations)
    .reset_index()
)
customer_features.head()

Unnamed: 0_level_0,CustomerID,TotalValue,TotalValue,Quantity,Quantity,TransactionID
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean,count
0,C0001,3354.52,670.904,12,2.4,5
1,C0002,1862.74,465.685,10,2.5,4
2,C0003,2725.38,681.345,14,3.5,4
3,C0004,5354.88,669.36,23,2.875,8
4,C0005,2034.24,678.08,7,2.333333,3


In [24]:
# Category preferences: total spend per customer in each product category,
# with 0 where the customer bought nothing in that category.
category_pivot = trans_prod.pivot_table(
    values='TotalValue',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
)

category_pivot.head()

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,114.6,0.0,2827.3,412.62
C0002,0.0,1025.46,0.0,837.28
C0003,0.0,122.36,1385.2,1217.82
C0004,1888.48,0.0,1355.74,2110.66
C0005,0.0,0.0,1180.38,853.86


In [25]:
# Merge all features

# Flatten the two-level (column, statistic) header produced by .agg() into
# single names such as 'TotalValue_sum'. Labels that are already plain strings
# are left untouched: joining a string would interleave '_' between its
# characters and corrupt the names if this cell were run a second time.
customer_features.columns = [
    '_'.join(col).strip('_') if isinstance(col, tuple) else col
    for col in customer_features.columns
]

# Drop category columns left over from an earlier run of this cell before
# merging, so a re-run refreshes them instead of creating *_x / *_y duplicates.
customer_features = (
    customer_features
    .drop(columns=category_pivot.columns, errors='ignore')
    .merge(category_pivot, on='CustomerID')
)

customer_features.head()

Unnamed: 0,CustomerID,TotalValue_sum,TotalValue_mean,Quantity_sum,Quantity_mean,TransactionID_count,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,670.904,12,2.4,5,114.6,0.0,2827.3,412.62
1,C0002,1862.74,465.685,10,2.5,4,0.0,1025.46,0.0,837.28
2,C0003,2725.38,681.345,14,3.5,4,0.0,122.36,1385.2,1217.82
3,C0004,5354.88,669.36,23,2.875,8,1888.48,0.0,1355.74,2110.66
4,C0005,2034.24,678.08,7,2.333333,3,0.0,0.0,1180.38,853.86


In [26]:
# Add region encoding
# One indicator column per region, named 'Region_<value>'.
region_dummies = pd.get_dummies(customers['Region'], prefix='Region')
region_features = pd.concat([customers['CustomerID'], region_dummies], axis=1)

# Remove region columns left over from an earlier run of this cell first.
# Without this, a re-run silently adds Region_*_x / Region_*_y copies, which
# would count the region twice in the similarity features.
customer_features = (
    customer_features
    .drop(columns=region_dummies.columns, errors='ignore')
    .merge(region_features, on='CustomerID')
)
customer_features.head()

Unnamed: 0,CustomerID,TotalValue_sum,TotalValue_mean,Quantity_sum,Quantity_mean,TransactionID_count,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,670.904,12,2.4,5,114.6,0.0,2827.3,412.62,False,False,False,True
1,C0002,1862.74,465.685,10,2.5,4,0.0,1025.46,0.0,837.28,True,False,False,False
2,C0003,2725.38,681.345,14,3.5,4,0.0,122.36,1385.2,1217.82,False,False,False,True
3,C0004,5354.88,669.36,23,2.875,8,1888.48,0.0,1355.74,2110.66,False,False,False,True
4,C0005,2034.24,678.08,7,2.333333,3,0.0,0.0,1180.38,853.86,True,False,False,False


# Similarity Calculation

In [27]:
def calculate_similarity(customer_features):
    """Compute pairwise cosine similarity between customers.

    Every column other than ``CustomerID`` is used as a feature. Features are
    passed through ``StandardScaler`` before the cosine similarity is taken.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        One row per customer, with a ``CustomerID`` column plus feature columns.

    Returns
    -------
    tuple
        ``(similarity_matrix, customer_ids)``; row and column ``i`` of the
        matrix correspond to ``customer_ids[i]``.
    """
    ids = customer_features['CustomerID'].values
    feature_frame = customer_features.drop(columns='CustomerID')

    # Scale first, then compare every customer against every other customer.
    standardized = StandardScaler().fit_transform(feature_frame)
    return cosine_similarity(standardized), ids

# Get Top Lookalikes

In [28]:
def get_top_lookalikes(customer_id, similarity_matrix, customer_ids, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : str
        ID of the customer to find lookalikes for.
    similarity_matrix : array-like
        Square matrix where entry ``[i, j]`` is the similarity between
        ``customer_ids[i]`` and ``customer_ids[j]``.
    customer_ids : array-like
        Customer IDs in the same order as the matrix rows.
    n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar; never contains ``customer_id``.

    Raises
    ------
    IndexError
        If ``customer_id`` is not present in ``customer_ids``.
    """
    customer_ids = np.asarray(customer_ids)
    matches = np.flatnonzero(customer_ids == customer_id)
    if matches.size == 0:
        raise IndexError(f'CustomerID {customer_id!r} not found in customer_ids')
    customer_idx = matches[0]
    customer_similarities = np.asarray(similarity_matrix[customer_idx])

    # Rank everyone from most to least similar, then remove the customer by
    # index. Slicing off position 0 is not safe: with ties or floating-point
    # round-off another customer can outrank the self-similarity, which would
    # return the customer as their own lookalike and drop a genuine match.
    ranked = customer_similarities.argsort()[::-1]
    ranked = ranked[ranked != customer_idx][:max(n, 0)]

    return [(customer_ids[idx], customer_similarities[idx]) for idx in ranked]

# Output Of Lookalikes


In [37]:
def save_lookalikes(features=None, n_customers=20, top_n=3, output_path='Lookalike.csv'):
    """Write the top lookalikes for customers C0001..C<n_customers> to a CSV.

    Parameters
    ----------
    features : pandas.DataFrame, optional
        Feature table with a ``CustomerID`` column. Defaults to the
        notebook-level ``customer_features`` frame.
    n_customers : int, default 20
        Lookalikes are generated for IDs ``C0001`` through ``C<n_customers>``.
    top_n : int, default 3
        Number of lookalikes requested per customer.
    output_path : str, default 'Lookalike.csv'
        Destination of the CSV file.

    The file has one row per (customer, lookalike) pair with the columns
    ``CustomerID``, ``SimilarCustomerID``, ``SimilarityScore`` (rounded to
    four decimals) and ``Rank`` (1 = most similar).
    """
    if features is None:
        features = customer_features

    # Calculate similarity
    similarity_matrix, customer_ids = calculate_similarity(features)
    known_ids = set(customer_ids)

    columns = ['CustomerID', 'SimilarCustomerID', 'SimilarityScore', 'Rank']
    output_data = []
    for i in range(n_customers):
        customer_id = f'C{i+1:04d}'
        if customer_id not in known_ids:
            # A customer absent from the feature table cannot be scored;
            # skip it with a warning instead of failing the whole export.
            warnings.warn(f'{customer_id} has no feature row; skipped')
            continue

        lookalikes = get_top_lookalikes(
            customer_id, similarity_matrix, customer_ids, n=top_n
        )
        for rank, (similar_cust, score) in enumerate(lookalikes, 1):
            output_data.append({
                'CustomerID': customer_id,
                'SimilarCustomerID': similar_cust,
                'SimilarityScore': round(float(score), 4),
                'Rank': rank
            })

    # Passing explicit columns keeps the header row even when no rows were
    # produced, so the file can always be read back.
    pd.DataFrame(output_data, columns=columns).to_csv(output_path, index=False)

In [38]:
# Generate lookalikes for customers C0001-C0020 and write them to Lookalike.csv.
save_lookalikes()

In [39]:
# Read the exported file back to check its contents.
look_alike_df = pd.read_csv('Lookalike.csv')
look_alike_df.head()

Unnamed: 0,CustomerID,SimilarCustomerID,SimilarityScore,Rank
0,C0001,C0181,0.8807,1
1,C0001,C0120,0.8416,2
2,C0001,C0192,0.776,3
3,C0002,C0159,0.9188,1
4,C0002,C0106,0.8999,2
