In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
from warnings import filterwarnings
filterwarnings("ignore")

In [13]:
data_customers = pd.read_csv(r'C:\Users\vinay\Ecommerce-transactions\Customers.csv')
data_products = pd.read_csv(r'C:\Users\vinay\Ecommerce-transactions\Products.csv')
data_transactions = pd.read_csv(r'C:\Users\vinay\Ecommerce-transactions\Transactions.csv')

In [14]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

In [15]:
transaction_features = data_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'TransactionID': 'count'
}).rename(columns={
    'TotalValue': 'TotalSpending',
    'Quantity': 'TotalQuantity',
    'TransactionID': 'TotalTransactions'
}).reset_index()

customer_profiles = data_customers.merge(transaction_features, on='CustomerID', how='left')
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=True)
customer_profiles.fillna(0, inplace=True)

In [16]:
features = customer_profiles.drop(columns=['CustomerID', 'SignupDate', 'CustomerName'])

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(features_scaled, customer_profiles['CustomerID'], test_size=0.2, random_state=42)

knn = NearestNeighbors(n_neighbors=4, metric='euclidean')
knn.fit(X_train)

In [22]:
def find_similar_customers(customer_id, n_neighbors=3):
    try:
        customer_idx = customer_profiles[customer_profiles['CustomerID'] == customer_id].index[0]
    except IndexError:
        return []
    customer_vector = features_scaled[customer_idx].reshape(1, -1)
    distances, indices = knn.kneighbors(customer_vector, n_neighbors=n_neighbors + 1)
    similar_customers = []
    for idx, dist in zip(indices[0][1:], distances[0][1:]):
        similar_customer_id = customer_profiles.iloc[idx]['CustomerID']
        similar_customers.append((similar_customer_id, round(1 - dist, 2)))
    return similar_customers

In [24]:
print("Feature matrix statistics:")
print(pd.DataFrame(features_scaled).describe())


Feature matrix statistics:
                  0             1             2             3             4  \
count  2.000000e+02  2.000000e+02  2.000000e+02  2.000000e+02  2.000000e+02   
mean   1.687539e-16 -9.103829e-17 -8.881784e-18 -4.440892e-17 -8.881784e-17   
std    1.002509e+00  1.002509e+00  1.002509e+00  1.002509e+00  1.002509e+00   
min   -1.875173e+00 -2.050620e+00 -2.256468e+00 -5.773503e-01 -5.465357e-01   
25%   -7.133518e-01 -7.573634e-01 -9.025874e-01 -5.773503e-01 -5.465357e-01   
50%   -1.698851e-01 -1.107351e-01  0.000000e+00 -5.773503e-01 -5.465357e-01   
75%    7.144371e-01  6.975503e-01  4.512937e-01  0.000000e+00 -5.465357e-01   
max    3.926416e+00  3.122407e+00  2.707762e+00  1.732051e+00  1.829707e+00   

                  5  
count  2.000000e+02  
mean   7.105427e-17  
std    1.002509e+00  
min   -6.468692e-01  
25%   -6.468692e-01  
50%   -6.468692e-01  
75%    1.545908e+00  
max    1.545908e+00  


In [25]:
test_customer_id = y_test.iloc[0]
test_customer_vector = X_test[0]
similar_customers = find_similar_customers_from_vector(test_customer_vector)
print(f"Predicted similar customers for {test_customer_id}: {similar_customers}")


Predicted similar customers for C0096: [('C0033', 0.32), ('C0024', 0.04), ('C0123', 0.04)]


In [26]:
lookalike_map = {}

for idx, customer_id in enumerate(customer_profiles['CustomerID'][:20]):
    similar_customers = find_similar_customers(customer_id, n_neighbors=3)
    lookalike_map[customer_id] = similar_customers

lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Top3Lookalikes': [lookalike_map[cust_id] for cust_id in lookalike_map]
})

lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike file for the first 20 customers has been saved as 'Lookalike.csv'.")

Lookalike file for the first 20 customers has been saved as 'Lookalike.csv'.


In [27]:
def find_similar_customers_for_user(customer_id, n_neighbors=3):
    try:
        customer_idx = customer_profiles[customer_profiles['CustomerID'] == customer_id].index[0]
    except IndexError:
        print(f"Error: Customer ID {customer_id} not found in the dataset.")
        return []
    
    customer_vector = features_scaled[customer_idx].reshape(1, -1)
    distances, indices = knn.kneighbors(customer_vector, n_neighbors=n_neighbors + 1)
    similar_customers = []
    for idx, dist in zip(indices[0][1:], distances[0][1:]): 
        similar_customer_id = customer_profiles.iloc[idx]['CustomerID']
        similar_customers.append((similar_customer_id, round(1 - dist, 2))) 
    return similar_customers


while True:
    print("\n--- Lookalike Model ---")
    user_input_customer_id = input("Enter Customer ID to find similar customers: ")
    similar_customers = find_similar_customers_for_user(user_input_customer_id)
    if similar_customers:
        print(f"Top 3 similar customers for {user_input_customer_id}: {similar_customers}")
    else:
        print(f"No similar customers found for Customer ID: {user_input_customer_id}")
    
    # Ask the user whether to continue or stop
    continue_choice = input("Do you want to search for another customer? (yes/no): ").strip().lower()
    if continue_choice == 'no':
        print("👍🏻")
        break
    elif continue_choice != 'yes':
        print("Invalid input. Exiting the program.")
        break



--- Lookalike Model ---


Enter Customer ID to find similar customers:  C0005


Top 3 similar customers for C0005: [('C0103', 0.85), ('C0058', 0.66), ('C0027', 0.48)]


Do you want to search for another customer? (yes/no):  no


👍🏻


In [28]:
def evaluate_model(X_test, y_test, n_neighbors=3):
    total_precision = 0
    for i, customer_vector in enumerate(X_test):
        predicted_similar = find_similar_customers_from_vector(customer_vector, n_neighbors)
        predicted_ids = [x[0] for x in predicted_similar]
        if y_test.iloc[i] in predicted_ids:
            total_precision += 1
    precision_at_k = total_precision / len(X_test)
    return precision_at_k


In [29]:
print("Feature matrix statistics:")
print(pd.DataFrame(features_scaled).describe())

Feature matrix statistics:
                  0             1             2             3             4  \
count  2.000000e+02  2.000000e+02  2.000000e+02  2.000000e+02  2.000000e+02   
mean   1.687539e-16 -9.103829e-17 -8.881784e-18 -4.440892e-17 -8.881784e-17   
std    1.002509e+00  1.002509e+00  1.002509e+00  1.002509e+00  1.002509e+00   
min   -1.875173e+00 -2.050620e+00 -2.256468e+00 -5.773503e-01 -5.465357e-01   
25%   -7.133518e-01 -7.573634e-01 -9.025874e-01 -5.773503e-01 -5.465357e-01   
50%   -1.698851e-01 -1.107351e-01  0.000000e+00 -5.773503e-01 -5.465357e-01   
75%    7.144371e-01  6.975503e-01  4.512937e-01  0.000000e+00 -5.465357e-01   
max    3.926416e+00  3.122407e+00  2.707762e+00  1.732051e+00  1.829707e+00   

                  5  
count  2.000000e+02  
mean   7.105427e-17  
std    1.002509e+00  
min   -6.468692e-01  
25%   -6.468692e-01  
50%   -6.468692e-01  
75%    1.545908e+00  
max    1.545908e+00  
