# Ecommerce Transactions Lookalike Model 

## Aim :
- Build a Lookalike Model that takes a user's information as input and recommends 3 similar customers based on their profile and transaction history. The model should:
1. Use both customer and product information.
2. Assign a similarity score to each recommended customer.

### Data Description
#### 1. Customers.csv
- CustomerID: Unique identifier for each customer.
- CustomerName: Name of the customer.
- Region: Continent where the customer resides.
- SignupDate: Date when the customer signed up.
#### 2. Products.csv
- ProductID: Unique identifier for each product.
- ProductName: Name of the product.
- Category: Product category.
- Price: Product price in USD.
#### 3. Transactions.csv
- TransactionID: Unique identifier for each transaction.
- CustomerID: ID of the customer who made the transaction.
- ProductID: ID of the product sold.
- TransactionDate: Date of the transaction.
- Quantity: Quantity of the product purchased.
- TotalValue: Total value of the transaction.
- Price: Price of the product sold.

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the customer table from its CSV file
customer_df = pd.read_csv(filepath_or_buffer="Customers.csv")

In [3]:
customer_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Load the product table from its CSV file
product_df = pd.read_csv(filepath_or_buffer="Products.csv")

In [5]:
product_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [6]:
# Load the transaction table from its CSV file
transaction_df = pd.read_csv(filepath_or_buffer="Transactions.csv")

In [7]:
transaction_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [8]:
# Join every transaction to its customer row (on CustomerID) and then to its
# product row (on ProductID). Inner joins keep only transactions that match
# in both lookups. Both transactions and products carry a Price column, so
# the default suffixes produce Price_x (transaction) and Price_y (product).
merged_all_df = (
    transaction_df
    .merge(customer_df, on='CustomerID', how='inner')
    .merge(product_df, on='ProductID', how='inner')
)

In [9]:
merged_all_df.shape

(1000, 13)

In [10]:
merged_all_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [12]:
# Extract features to build the lookalike model
# Aggregate transaction data per customer to calculate:
# total_spend: Total spending by the customer.
# transaction_count: Number of transactions by the customer.
# avg_transaction_value: Average value per transaction.
# most_purchased_category: The category the customer buys most often.
# Add customer demographics (Region, SignupYear).

In [13]:
def _most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    return values.value_counts().idxmax()


# Collapse the merged transactions to one row per customer: total spend,
# number of transactions, mean transaction value and most-bought category.
transactions_by_customer = merged_all_df.groupby('CustomerID')
customer_features = transactions_by_customer.agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    most_purchased_category=('Category', _most_frequent),
)
# Turn the CustomerID group key back into an ordinary column
customer_features = customer_features.reset_index()

In [14]:
customer_features.head()

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,most_purchased_category
0,C0001,3354.52,5,670.904,Electronics
1,C0002,1862.74,4,465.685,Home Decor
2,C0003,2725.38,4,681.345,Home Decor
3,C0004,5354.88,8,669.36,Home Decor
4,C0005,2034.24,3,678.08,Electronics


In [16]:
# Extract the customer details from the customer data.
# Take an explicit copy: a later cell adds a column to customer_details, and
# assigning onto a bare column selection of customer_df makes pandas emit a
# SettingWithCopyWarning.
customer_details = customer_df[['CustomerID', 'Region', 'SignupDate']].copy()

In [17]:
customer_details.head()

Unnamed: 0,CustomerID,Region,SignupDate
0,C0001,South America,2022-07-10
1,C0002,Asia,2022-02-13
2,C0003,South America,2024-03-07
3,C0004,South America,2022-10-09
4,C0005,Asia,2022-08-15


In [19]:
# Extract the signup year from the signup date.
# The date is parsed and the year stored as an integer (not the text "2022"),
# so the column is genuinely numeric when it is standardized later.
# assign() builds a new frame instead of writing into a slice of customer_df,
# which avoids pandas' SettingWithCopyWarning.
# NOTE: the column name keeps its original spelling 'SigupYear' because the
# later cells reference it by that name.
customer_details = customer_details.assign(
    SigupYear=pd.to_datetime(customer_details['SignupDate']).dt.year
)

In [20]:
customer_details.head()

Unnamed: 0,CustomerID,Region,SignupDate,SigupYear
0,C0001,South America,2022-07-10,2022
1,C0002,Asia,2022-02-13,2022
2,C0003,South America,2024-03-07,2024
3,C0004,South America,2022-10-09,2022
4,C0005,Asia,2022-08-15,2022


In [22]:
# Attach Region and SigupYear to the per-customer aggregates, matching rows
# on CustomerID (inner join)
customer_features = customer_features.merge(
    customer_details[['CustomerID', 'Region', 'SigupYear']],
    on='CustomerID',
    how='inner',
)

In [23]:
customer_features.head()

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,most_purchased_category,Region,SigupYear
0,C0001,3354.52,5,670.904,Electronics,South America,2022
1,C0002,1862.74,4,465.685,Home Decor,Asia,2022
2,C0003,2725.38,4,681.345,Home Decor,South America,2024
3,C0004,5354.88,8,669.36,Home Decor,South America,2022
4,C0005,2034.24,3,678.08,Electronics,Asia,2022


In [24]:
# One-hot encode the two categorical columns into integer 0/1 indicator
# columns named Region_<value> and Category_<value>
encoded_features = pd.get_dummies(
    customer_features[['Region', 'most_purchased_category']],
    prefix={'Region': 'Region', 'most_purchased_category': 'Category'},
).astype(int)

In [25]:
encoded_features.head()

Unnamed: 0,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,0,0,0,1,0,0,1,0
1,1,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,1
3,0,0,0,1,0,0,0,1
4,1,0,0,0,0,0,1,0


In [26]:
# Put the ID and numeric columns side by side with the indicator columns
final_features = pd.concat(
    [
        customer_features[
            ['CustomerID', 'total_spend', 'transaction_count', 'avg_transaction_value', 'SigupYear']
        ],
        encoded_features,
    ],
    axis='columns',
)

In [27]:
final_features

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,SigupYear,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,5,670.904000,2022,0,0,0,1,0,0,1,0
1,C0002,1862.74,4,465.685000,2022,1,0,0,0,0,0,0,1
2,C0003,2725.38,4,681.345000,2024,0,0,0,1,0,0,0,1
3,C0004,5354.88,8,669.360000,2022,0,0,0,1,0,0,0,1
4,C0005,2034.24,3,678.080000,2022,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,4,1245.720000,2022,0,1,0,0,0,0,0,1
195,C0197,1928.65,3,642.883333,2023,0,1,0,0,0,0,1,0
196,C0198,931.83,2,465.915000,2022,0,1,0,0,0,1,0,0
197,C0199,1979.28,4,494.820000,2022,0,1,0,0,0,0,1,0


In [29]:
# Same feature table with the signup-year column removed
final_features_without_year = final_features.drop(columns=["SigupYear"])

In [30]:
final_features_without_year.head()

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,5,670.904,0,0,0,1,0,0,1,0
1,C0002,1862.74,4,465.685,1,0,0,0,0,0,0,1
2,C0003,2725.38,4,681.345,0,0,0,1,0,0,0,1
3,C0004,5354.88,8,669.36,0,0,0,1,0,0,0,1
4,C0005,2034.24,3,678.08,1,0,0,0,0,0,1,0


In [39]:

def recommendations(customer_id, feature_matrix, top_n):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    The columns ``total_spend``, ``transaction_count``,
    ``avg_transaction_value`` and ``SigupYear`` are standardized with
    ``StandardScaler``; every other column except ``CustomerID`` is used
    as-is. Similarity is the cosine similarity between the input customer's
    feature vector and each customer's feature vector.

    Parameters
    ----------
    customer_id
        Value looked up in the ``CustomerID`` column.
    feature_matrix : pandas.DataFrame
        One row per customer, containing ``CustomerID`` and the four numeric
        columns listed above. The frame is NOT modified by this function.
    top_n : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``feature_matrix`` (original, unscaled values) for all other
        customers, with an added ``Similarity_score`` column, sorted by that
        score in descending order and truncated to ``top_n`` rows.

    Raises
    ------
    ValueError
        If ``customer_id`` does not match exactly one row.
    """
    numerical_features = ['total_spend', 'transaction_count', 'avg_transaction_value', 'SigupYear']
    score_column = 'Similarity_score'

    # A score column left behind by an earlier run is an output, not a
    # feature: drop it so it cannot leak into the similarity computation.
    # drop() returns a new frame, so the caller's data is never touched.
    base = feature_matrix.drop(columns=[score_column], errors='ignore')

    # Fail early with a readable message instead of an opaque reshape error
    is_input_customer = base['CustomerID'] == customer_id
    match_count = int(is_input_customer.sum())
    if match_count != 1:
        raise ValueError(
            f"Expected exactly one row with CustomerID {customer_id!r}, found {match_count}"
        )

    # Standardize the numeric columns on a working copy
    feature_matrix_scaled = base.copy()
    feature_matrix_scaled[numerical_features] = StandardScaler().fit_transform(
        base[numerical_features]
    )

    # CustomerID is an identifier, not a feature
    feature_data = feature_matrix_scaled.drop('CustomerID', axis=1)

    # Cosine similarity of the input customer against every customer
    input_customer_vector = feature_data[is_input_customer].values.reshape(1, -1)
    similarity_scores = cosine_similarity(input_customer_vector, feature_data.values).flatten()

    # Report scores next to the original (unscaled) values, on a new frame
    scored = base.assign(Similarity_score=similarity_scores)

    # Rank everyone except the input customer, most similar first
    ranked = scored[~is_input_customer].sort_values(by=score_column, ascending=False)
    return ranked.head(top_n)


In [40]:
recommendations("C0001",final_features,5)

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,SigupYear,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Similarity_score
117,C0118,3434.77,6,572.461667,2022,0,0,0,1,0,0,1,0,0.951762
190,C0192,2072.72,4,518.18,2022,0,0,0,1,0,0,1,0,0.880162
111,C0112,1959.51,3,653.17,2022,0,0,0,1,0,0,1,0,0.852013
182,C0184,3393.18,7,484.74,2022,0,0,0,1,0,0,1,0,0.848439
151,C0152,3385.86,5,677.172,2022,0,0,0,1,0,0,0,1,0.724274


In [56]:

def get_customer_recommendations(target_customer_id, customer_features, top_n_recommendations):
    """Rank the customers most similar to ``target_customer_id``.

    ``total_spend``, ``transaction_count`` and ``avg_transaction_value`` are
    standardized with ``StandardScaler``; all remaining columns other than
    ``CustomerID`` enter the comparison unchanged. Each customer is scored by
    cosine similarity against the target customer's feature vector.

    Parameters
    ----------
    target_customer_id
        Value looked up in the ``CustomerID`` column.
    customer_features : pandas.DataFrame
        One row per customer with ``CustomerID`` and the three numeric
        columns above. A copy is scaled; the argument itself is left alone.
    top_n_recommendations : int
        Number of rows to keep from the top of the ranking.

    Returns
    -------
    pandas.DataFrame
        The other customers' rows, holding the *standardized* numeric values
        plus a ``Similarity_score`` column, sorted by score descending.
    """
    numeric_cols = ['total_spend', 'transaction_count', 'avg_transaction_value']

    # Scale on a copy so the caller's frame keeps its raw values
    scored = customer_features.copy()
    scored[numeric_cols] = StandardScaler().fit_transform(customer_features[numeric_cols])

    # Feature vectors: everything except the identifier column
    vectors = scored.drop('CustomerID', axis=1)
    target_vector = vectors[scored['CustomerID'] == target_customer_id].values.reshape(1, -1)

    # One similarity value per customer row, in row order
    scored['Similarity_score'] = cosine_similarity(target_vector, vectors.values).flatten()

    # Leave the target out of its own ranking, best match first
    others = scored[scored['CustomerID'] != target_customer_id]
    ranked = others.sort_values(by='Similarity_score', ascending=False)
    return ranked.head(top_n_recommendations)



In [57]:
get_customer_recommendations("C0001",final_features_without_year ,5)

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_transaction_value,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Similarity_score
188,C0190,-0.264922,-0.011458,-0.383319,0,0,0,1,0,0,1,0,0.968215
47,C0048,0.209853,-0.011458,0.34806,0,0,0,1,0,0,1,0,0.941072
117,C0118,-0.017803,0.444578,-0.48504,0,0,0,1,0,0,1,0,0.918219
38,C0039,0.42246,0.444578,0.080138,0,0,0,1,0,0,1,0,0.897366
90,C0091,-0.180329,0.444578,-0.693681,0,0,0,1,0,0,1,0,0.880428


In [68]:

def top_recommendations(target_customer_id, customer_features, top_n):
    """Return the ``top_n`` lookalikes of a customer as ``{CustomerID: score}``.

    ``total_spend``, ``transaction_count`` and ``avg_transaction_value`` are
    standardized with ``StandardScaler``; every other column except
    ``CustomerID`` is used unchanged. Customers are scored by cosine
    similarity against the target customer's feature vector.

    Parameters
    ----------
    target_customer_id
        Value looked up in the ``CustomerID`` column.
    customer_features : pandas.DataFrame
        One row per customer with ``CustomerID`` and the three numeric
        columns above. Not modified.
    top_n : int
        Maximum number of lookalikes to return. If fewer other customers
        exist, all of them are returned; zero or a negative value yields an
        empty dict.

    Returns
    -------
    dict
        Maps ``CustomerID`` to its similarity score (built-in ``float``),
        inserted from most to least similar.

    Raises
    ------
    ValueError
        If ``target_customer_id`` does not match exactly one row.
    """
    num_columns = ['total_spend', 'transaction_count', 'avg_transaction_value']

    # Fail early with a readable message instead of an opaque reshape error
    is_target = customer_features['CustomerID'] == target_customer_id
    match_count = int(is_target.sum())
    if match_count != 1:
        raise ValueError(
            f"Expected exactly one row with CustomerID {target_customer_id!r}, found {match_count}"
        )

    # Standardize the numeric columns on a copy of the input
    scaled_features = customer_features.copy()
    scaled_features[num_columns] = StandardScaler().fit_transform(customer_features[num_columns])

    # Feature vectors: everything except the identifier column
    df = scaled_features.drop('CustomerID', axis=1)
    target_customer_vector = df[is_target].values.reshape(1, -1)
    similarity_scores = cosine_similarity(target_customer_vector, df.values).flatten()

    scaled_features['Similarity_score'] = similarity_scores

    # Rank everyone except the target, most similar first
    recommendations_df = scaled_features[~is_target].sort_values(
        by='Similarity_score', ascending=False
    )

    # head() simply stops at the last row when top_n exceeds the number of
    # candidates (positional .iloc[i] would raise IndexError). The max()
    # guard keeps a negative top_n meaning "nothing", since head(-k) would
    # otherwise return all but the last k rows.
    top_rows = recommendations_df.head(max(top_n, 0))

    # Convert to built-in float so str()/repr() of the result stays a plain
    # number regardless of how the installed NumPy prints its scalars.
    return {
        cust_id: float(score)
        for cust_id, score in zip(top_rows['CustomerID'], top_rows['Similarity_score'])
    }


In [69]:
top_recommendations("C0001",final_features_without_year,3)

{'C0190': 0.968215451295126,
 'C0048': 0.9410720811249147,
 'C0118': 0.9182186575953006}

In [74]:
# Top 3 lookalikes with their similarity scores for the first 20 customers
# in the feature table (CustomerID: C0001 - C0020).
# .iloc[:20] takes at most 20 IDs, so a shorter table does not raise
# IndexError the way positional .iloc[i] inside range(20) would.
lookalike_dictionary = {
    cust_id: top_recommendations(cust_id, final_features_without_year, 3)
    for cust_id in final_features_without_year['CustomerID'].iloc[:20]
}


In [75]:
lookalike_dictionary

{'C0001': {'C0190': 0.968215451295126,
  'C0048': 0.9410720811249147,
  'C0118': 0.9182186575953006},
 'C0002': {'C0128': 0.9405449688340127,
  'C0097': 0.8988071235864499,
  'C0106': 0.897417999902512},
 'C0003': {'C0133': 0.9925083336710878,
  'C0052': 0.9847977904024425,
  'C0152': 0.9262643125643457},
 'C0004': {'C0113': 0.9828597350955222,
  'C0012': 0.9711039819467209,
  'C0104': 0.9431261033586318},
 'C0005': {'C0186': 0.9787905419345102,
  'C0146': 0.9598508342465527,
  'C0007': 0.9047531872453538},
 'C0006': {'C0168': 0.9732537429499297,
  'C0171': 0.9513382554181676,
  'C0187': 0.9447449444739173},
 'C0007': {'C0115': 0.934222671257929,
  'C0005': 0.904753187245354,
  'C0146': 0.8431738734340793},
 'C0008': {'C0194': 0.9134849904802553,
  'C0024': 0.9122619900683164,
  'C0109': 0.8700104206236067},
 'C0009': {'C0010': 0.9760669630706748,
  'C0198': 0.9520351377564205,
  'C0062': 0.930816960231575},
 'C0010': {'C0009': 0.9760669630706748,
  'C0111': 0.9708504756032534,
  'C006

In [76]:
# Flatten the nested dictionary into one record per customer; the inner
# {lookalike_id: score} mapping is stored as its string representation
lookalike_data = [
    {"cust_id": customer, "Lookalikes": str(matches)}
    for customer, matches in lookalike_dictionary.items()
]

lookalike_df = pd.DataFrame(lookalike_data)

In [77]:
lookalike_df.iloc[0]

cust_id                                                   C0001
Lookalikes    {'C0190': 0.968215451295126, 'C0048': 0.941072...
Name: 0, dtype: object

In [78]:
# Write the lookalike table to disk, leaving out the DataFrame index
lookalike_df.to_csv(path_or_buf='PrakashNaidu_Talatam_Lookalike.csv', index=False)