<a href="https://colab.research.google.com/github/Pradeep24032004/Data-Science-Assignment-eCommerce-Transactions-Dataset/blob/main/Maggala_Pradeep_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")


In [3]:
# Preprocessing Customers dataset
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['SignupYear'] = customers['SignupDate'].dt.year

# Encode the 'Region' column
label_encoder = LabelEncoder()
customers['RegionEncoded'] = label_encoder.fit_transform(customers['Region'])

In [4]:
# Merge Transactions with Products
txns_with_products = transactions.merge(products, on='ProductID', how='left')

# Alternative approach: Calculate AvgPrice based on TotalValue / Quantity
txns_with_products['CalculatedPrice'] = txns_with_products['TotalValue'] / txns_with_products['Quantity']
txns_with_products['CalculatedPrice'].fillna(0, inplace=True)  # Handle division by zero or missing data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  txns_with_products['CalculatedPrice'].fillna(0, inplace=True)  # Handle division by zero or missing data


In [5]:
# Aggregate transaction data per customer
txn_summary = txns_with_products.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'CalculatedPrice': 'mean',  # Use calculated price instead of missing Price
    'Category': lambda x: x.mode()[0] if not x.mode().empty else None
}).rename(columns={
    'TransactionID': 'TransactionCount',
    'CalculatedPrice': 'AvgPrice'
}).reset_index()


In [6]:
# Encode categorical columns
txn_summary['CategoryEncoded'] = label_encoder.fit_transform(txn_summary['Category'].astype(str))

# Merge Customers and Transaction Summary
customer_data = pd.merge(customers, txn_summary, on='CustomerID', how='left')
customer_data.fillna(0, inplace=True)


In [7]:
# Feature engineering for similarity
def feature_engineering(df):
    features = ['RegionEncoded', 'SignupYear', 'TransactionCount', 'Quantity', 'TotalValue', 'AvgPrice', 'CategoryEncoded']
    return df[features]

features = feature_engineering(customer_data)

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Compute similarity matrix
similarity_matrix = cosine_similarity(features_scaled)

In [8]:
# Function to find top N similar customers
def find_top_similar(customers_df, global_customers, global_similarity_matrix, top_n=3):
    customer_ids = customers_df['CustomerID'].values
    global_ids = global_customers['CustomerID'].values

    # Map indices from subset to global similarity matrix
    subset_indices = [np.where(global_ids == cust_id)[0][0] for cust_id in customer_ids]
    subset_similarity_matrix = global_similarity_matrix[subset_indices, :][:, subset_indices]

    lookalike_map = {}
    for idx, cust_id in enumerate(customer_ids):
        sim_scores = list(enumerate(subset_similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        top_similar = [(customer_ids[i], score) for i, score in sim_scores[1:top_n + 1]]
        lookalike_map[cust_id] = top_similar

    return lookalike_map


In [9]:
# Get top 3 similar customers for the first 20 customers
customer_subset = customers[customers['CustomerID'].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)])]
lookalike_map = find_top_similar(customer_subset, customers, similarity_matrix)

In [10]:
# Save Lookalike map to CSV
lookalike_list = []
for cust_id, similars in lookalike_map.items():
    for similar_cust_id, score in similars:
        lookalike_list.append({"CustomerID": cust_id, "SimilarCustomerID": similar_cust_id, "Score": score})

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [11]:
# Print top 3 similar customers for the first 20 customers
print(lookalike_df)

   CustomerID SimilarCustomerID     Score
0       C0001             C0011  0.617869
1       C0001             C0004  0.361737
2       C0001             C0003  0.192198
3       C0002             C0010  0.868055
4       C0002             C0005  0.788503
5       C0002             C0007  0.593362
6       C0003             C0012  0.618847
7       C0003             C0014  0.553283
8       C0003             C0008  0.421682
9       C0004             C0017  0.733299
10      C0004             C0011  0.665745
11      C0004             C0013  0.506975
12      C0005             C0007  0.941816
13      C0005             C0002  0.788503
14      C0005             C0009  0.687967
15      C0006             C0016  0.735793
16      C0006             C0018  0.656978
17      C0006             C0013  0.630326
18      C0007             C0005  0.941816
19      C0007             C0009  0.607235
20      C0007             C0002  0.593362
21      C0008             C0012  0.833466
22      C0008             C0013  0