# Lookalike Model

## Task 2
Build a Lookalike Model that takes a user's information as input and recommends 3 similar customers based on their profile and transaction history.

To do:
- Load the data
- Explore the data
- Data Preprocessing and Data Cleaning
- Create Model

In [33]:
#importing the necessary libraries 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [34]:
#loading the data
# Single place to repoint the notebook at a different copy of the dataset.
DATA_DIR = '/kaggle/input/zeotapdata'
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [35]:
# Preview the first rows of the customers table to see its columns and value formats.
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [36]:
# Column dtypes and non-null counts for the customers table.
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [37]:
# Summary statistics for customers (count / unique / top / freq, as every column is object-typed).
customers.describe()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
count,200,200,200,200
unique,200,200,4,179
top,C0001,Lawrence Carroll,South America,2024-11-11
freq,1,1,59,3


In [38]:
# Share of missing values per customers column, largest first.
# Despite the variable name these are fractions in [0, 1], not percentages.
customer_missing_percentages = (
    customers.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(customers))
)
customer_missing_percentages

CustomerID      0.0
CustomerName    0.0
Region          0.0
SignupDate      0.0
dtype: float64

In [39]:
# Preview the first rows of the products table.
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [40]:
# Column dtypes and non-null counts for the products table.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB


In [41]:
# Share of missing values per products column, largest first.
# Despite the variable name these are fractions in [0, 1], not percentages.
product_missing_percentages = (
    products.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(products))
)
product_missing_percentages

ProductID      0.0
ProductName    0.0
Category       0.0
Price          0.0
dtype: float64

In [42]:
# Number of distinct ProductID values, reported as a one-element shape tuple.
products['ProductID'].unique().shape

(100,)

In [43]:
# Summary statistics for the numeric column(s) of the products table.
products.describe()

Unnamed: 0,Price
count,100.0
mean,267.5517
std,143.219383
min,16.08
25%,147.7675
50%,292.875
75%,397.09
max,497.76


In [44]:
# Preview the first rows of the transactions table.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,8/25/24 12:38,1,300.68,300.68
1,T00112,C0146,P067,5/27/24 22:23,1,300.68,300.68
2,T00166,C0127,P067,4/25/24 7:38,1,300.68,300.68
3,T00272,C0087,P067,3/26/24 22:55,2,601.36,300.68
4,T00363,C0070,P067,3/21/24 15:10,3,902.04,300.68


In [45]:
# Column dtypes and non-null counts for the transactions table.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


In [46]:
# Share of missing values per transactions column, largest first.
# Despite the variable name these are fractions in [0, 1], not percentages.
transaction_missing_percentages = (
    transactions.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(transactions))
)
transaction_missing_percentages

TransactionID      0.0
CustomerID         0.0
ProductID          0.0
TransactionDate    0.0
Quantity           0.0
TotalValue         0.0
Price              0.0
dtype: float64

In [47]:
# Summary statistics for the numeric columns of the transactions table.
transactions.describe()

Unnamed: 0,Quantity,TotalValue,Price
count,1000.0,1000.0,1000.0
mean,2.537,689.99556,272.55407
std,1.117981,493.144478,140.73639
min,1.0,16.08,16.08
25%,2.0,295.295,147.95
50%,3.0,588.88,299.93
75%,4.0,1011.66,404.4
max,4.0,1991.04,497.76


In [48]:
# Attach product attributes to every transaction; the left join keeps all transactions.
# Both tables carry a Price column, so pandas suffixes them as Price_x / Price_y.
df = transactions.merge(products, how='left', on='ProductID')


In [49]:
# Attach the customer profile columns to each transaction row (left join on CustomerID).
df = df.merge(customers, how='left', on='CustomerID')


In [50]:
# One row per customer: spend, activity and region summarised from the merged transactions.
customer_features = (
    df.groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        num_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        num_unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
        # Region comes from the per-customer profile, so the first value stands for the group.
        regions=pd.NamedAgg(column='Region', aggfunc='first'),
    )
    .reset_index()
)

In [51]:
# Encoding: OneHotEncode the 'Region' column
encoder = OneHotEncoder(sparse=False)

In [52]:
# Applying encoding to the 'regions' column.
# Note: this cell drops 'regions' at the end, so re-running it requires
# re-running the aggregation cell first.
encoded_regions = encoder.fit_transform(customer_features[['regions']])
# Reuse customer_features' index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would mis-pair rows (or add NaN rows) if customer_features
# were ever filtered or re-indexed upstream.
encoded_regions_df = pd.DataFrame(
    encoded_regions,
    columns=encoder.get_feature_names_out(['regions']),
    index=customer_features.index,
)
customer_features = pd.concat([customer_features, encoded_regions_df], axis=1)
customer_features = customer_features.drop(columns=['regions'])



In [53]:
# Standardize the numeric features with StandardScaler; the one-hot region columns are left as-is.
numeric_columns = ['total_spent', 'num_transactions', 'num_unique_products']
scaler = StandardScaler()
customer_features[numeric_columns] = scaler.fit_transform(customer_features[numeric_columns])

In [54]:
# Function to compute similarity based on customer profile
def recommend_similar_customers(customer_id, customer_features, top_n=3):
    """Return the customers whose feature vectors are most similar to one customer.

    Similarity is the cosine similarity between the target customer's row and
    every row of ``customer_features``, computed over all columns except
    ``CustomerID``.

    Parameters
    ----------
    customer_id : str
        Value to look up in the ``CustomerID`` column.
    customer_features : pandas.DataFrame
        One row per customer: a ``CustomerID`` column plus numeric feature
        columns. Any index is accepted.
    top_n : int, default 3
        Maximum number of look-alike customers to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``SimilarityScore``, sorted by score in
        descending order, with the target customer excluded.

    Raises
    ------
    ValueError
        If ``customer_id`` does not occur in ``customer_features``.
    """
    is_target = customer_features['CustomerID'] == customer_id
    if not is_target.any():
        # Without this guard an empty profile reaches cosine_similarity and
        # fails with an error that never mentions the offending ID.
        raise ValueError(f"CustomerID {customer_id!r} not found in customer_features")

    feature_matrix = customer_features.drop('CustomerID', axis=1)
    # head(1): keep exactly one query row even if the ID appears more than once.
    customer_profile = feature_matrix[is_target].head(1)

    # Shape (1, n_customers): the target compared against every customer.
    similarities = cosine_similarity(customer_profile, feature_matrix)

    # Attach IDs positionally (to_numpy) rather than by index label, so a
    # filtered or re-indexed customer_features cannot mis-assign or NaN the IDs.
    similarity_scores = pd.DataFrame({
        'SimilarityScore': similarities[0],
        'CustomerID': customer_features['CustomerID'].to_numpy(),
    })

    # Drop the target itself, then keep the best-scoring customers.
    others = similarity_scores[similarity_scores['CustomerID'] != customer_id]
    top_similar_customers = others.sort_values(by='SimilarityScore', ascending=False).head(top_n)

    return top_similar_customers[['CustomerID', 'SimilarityScore']]

In [55]:
#Input customer profile for recommendation
# Strip stray whitespace so an entry such as " C0001 " still matches the stored IDs.
customer_id = input("Enter the Customer id : ex: CXXXX").strip()

# Fail early with a readable message: customer_features only contains customers
# that appear in the transactions table, and an unknown ID would otherwise
# surface as an opaque error inside the similarity computation.
if not customer_features['CustomerID'].eq(customer_id).any():
    raise ValueError(f"Unknown CustomerID {customer_id!r}: no features are available for it")

top_similar_customers = recommend_similar_customers(customer_id, customer_features)

# Merging top_similar_customers with customer data to get the customer details
recommended_customers = pd.merge(top_similar_customers, customers, on='CustomerID', how='left')

# Display the recommended customers
print(recommended_customers[['CustomerID', 'CustomerName', 'Region', 'SignupDate', 'SimilarityScore']])


Enter the Customer id : ex: CXXXX C0001


  CustomerID    CustomerName         Region  SignupDate  SimilarityScore
0      C0137  Robert Gardner  South America  2024-04-09         0.999929
1      C0152    Justin Evans  South America  2022-04-19         0.999854
2      C0107   Dana Cantrell  South America  2023-02-07         0.989231
