# Build the Lookalike Model

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## 1. Data Preparation:
- Aggregate Transaction Data to derive new features
- Merge this data products dataset also

In [12]:
#loading the data
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [13]:
final_data = transactions.merge(customers , on="CustomerID").merge(products , on='ProductID')

In [14]:
final_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


<h3 style="color:green; font-weight:bolder; text-align:center;">Extracting the required features like total spent value ,unique products bought , average product price , numbers of transactions</h3>

In [15]:
customer_features = final_data.groupby('CustomerID').agg(
    total_spend=("TotalValue" ,'sum'),
    unique_products = ("ProductID" , "nunique"),
    avg_product_price = ("Price_x" , "mean"),
    transaction_count = ("TransactionID" ,'count')
).reset_index()

In [16]:
customer_features.head()

Unnamed: 0,CustomerID,total_spend,unique_products,avg_product_price,transaction_count
0,C0001,3354.52,5,278.334,5
1,C0002,1862.74,4,208.92,4
2,C0003,2725.38,4,195.7075,4
3,C0004,5354.88,8,240.63625,8
4,C0005,2034.24,3,291.603333,3


<h3 style="color:green; font-weight:bolder; text-align:center;">Merging Region with customer features to get the similarity between customers from a specific region</h1>

In [17]:
customer_features= customer_features.merge(customers[["CustomerID" , "Region"]] , on="CustomerID")

In [18]:
customer_features.head()

Unnamed: 0,CustomerID,total_spend,unique_products,avg_product_price,transaction_count,Region
0,C0001,3354.52,5,278.334,5,South America
1,C0002,1862.74,4,208.92,4,Asia
2,C0003,2725.38,4,195.7075,4,South America
3,C0004,5354.88,8,240.63625,8,South America
4,C0005,2034.24,3,291.603333,3,Asia


<h3 style="color:green; font-weight:bolder; text-align:center;">Data Scaling</h3>

In [19]:
customer_features = pd.get_dummies(customer_features , columns=['Region'] , drop_first=True)

In [20]:
scaler = StandardScaler()
numerical_cols = ["total_spend" , "unique_products" , "avg_product_price" , "transaction_count"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

<h3 style="color:green; font-weight:bolder; text-align:center;"> Computing Similarities </h3>

In [21]:
similarity_matrix =cosine_similarity(customer_features.drop('CustomerID' , axis=1))
similarity_df = pd.DataFrame(similarity_matrix , index=customer_features['CustomerID'] , columns=customer_features["CustomerID"])

### Function to get top 3 similar customers

In [22]:
def get_top_similar_three(customers ,similarity_data):
    lookalikes = {}
    for customer_id in customers:
        similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]
        lookalikes[customer_id] = list(zip(similar_customers.index , similar_customers.values))
    return lookalikes


### Getting IDS of first 20 customers and using the function for finding similar ones

In [23]:
customer_ids = customer_features['CustomerID'].head(20)

In [28]:
lookalikes = get_top_similar_three(customer_ids , similarity_df)

In [29]:
lookalikes_df = pd.DataFrame({"cust_id" : lookalikes.keys() , "lookalikes" : lookalikes.values()})

In [30]:
lookalikes_df.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0137, 0.9896858063304705), (C0152, 0.986758..."
1,C0002,"[(C0043, 0.9846244763893289), (C0142, 0.977531..."
2,C0003,"[(C0025, 0.9568228412937883), (C0071, 0.935543..."
3,C0004,"[(C0108, 0.9840450015081225), (C0113, 0.971927..."
4,C0005,"[(C0128, 0.9948531908772384), (C0123, 0.992854..."
