# Task 2: Lookalike Model

Building a Lookalike Model that takes a user's information as input and recommends 3 similar
customers based on their **profile** and **transaction history**

### Data Pre-processing

Importing all the dependencies

In [1]:
#Computing
import pandas as pd
import numpy as np
import seaborn as sns

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

Reading all the given `.csv` files using the pandas `read_csv` function

In [2]:
# Load the three source tables. The paths are listed in the same order as the
# names on the left-hand side, so each name receives its matching file.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    )
)

### Feature Engineering

Let's summarize how these features were extracted:

- **`MostBoughtCategory`** and **`SecondMostBoughtCategory`** : First group the data by `CustomerID` and `Category`, summing the `Quantity` for each category. Then sort the categories in descending order of quantity for each customer, and pick the top category (*most bought*) and the runner-up (*second most bought*) for each customer.

- **`MostBoughtProduct`** and **`SecondMostBoughtProduct`** : Group the data by `CustomerID` and `ProductName`, summing the `Quantity` for each product,
then sort the products in descending order of quantity for each customer,
and pick the top product (*most bought*) and the runner-up (*second most bought*) for each customer.

- **`total_transactions`** : Aggregates the data by `CustomerID` to count the total number of transactions (`TransactionID`) for each customer.

- **`avg_transaction_value`** : Aggregates the data by `CustomerID` to calculate the average value (`TotalValue`) of their transactions.


In [3]:

def _pick_rank(ranked, position, item_col, feature_name):
    """Return one row per customer: the item at `position` in that customer's ranking.

    Parameters
    ----------
    ranked : DataFrame with 'CustomerID' and `item_col` columns, already sorted
        so that each customer's rows appear best-first.
    position : 0 for the top item, 1 for the runner-up, and so on.
    item_col : name of the column holding the item ('Category' or 'ProductName').
    feature_name : name given to `item_col` in the returned frame.

    Customers with fewer than `position + 1` distinct items are absent from the
    result; the left merges below turn that absence into NaN.
    """
    # cumcount() numbers the rows of each customer 0, 1, 2, ... in frame order.
    # Filtering on it avoids depending on GroupBy.nth's return shape, which
    # changed in pandas 2.0.
    within_customer_position = ranked.groupby('CustomerID').cumcount()
    chosen = ranked.loc[within_customer_position == position, ['CustomerID', item_col]]
    return chosen.rename(columns={item_col: feature_name}).reset_index(drop=True)


# Join every transaction with its product and customer attributes.
transactions_products = pd.merge(transactions, products, on='ProductID', how='inner')
merged_data = pd.merge(transactions_products, customers, on='CustomerID', how='inner')

# Drop incomplete rows by reassignment rather than in place.
merged_data = merged_data.dropna()

# Per-customer quantity totals, ranked best-first within each customer.
category_counts = merged_data.groupby(['CustomerID', 'Category'], as_index=False)['Quantity'].sum()
category_ranks = category_counts.sort_values(['CustomerID', 'Quantity'], ascending=[True, False])
most_bought_category = _pick_rank(category_ranks, 0, 'Category', 'MostBoughtCategory')
second_most_bought_category = _pick_rank(category_ranks, 1, 'Category', 'SecondMostBoughtCategory')

product_counts = merged_data.groupby(['CustomerID', 'ProductName'], as_index=False)['Quantity'].sum()
product_ranks = product_counts.sort_values(['CustomerID', 'Quantity'], ascending=[True, False])
most_bought_product = _pick_rank(product_ranks, 0, 'ProductName', 'MostBoughtProduct')
second_most_bought_product = _pick_rank(product_ranks, 1, 'ProductName', 'SecondMostBoughtProduct')

# Transaction count and mean transaction value per customer.
customer_transactions = merged_data.groupby('CustomerID', as_index=False).agg(
    total_transactions=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean')
)

# Start from the customer profile and left-join each feature table, so that
# customers without any transactions are kept (with NaN features).
final_df = customers[['CustomerID', 'Region', 'SignupDate']].copy()
for feature_frame in (
    most_bought_category,
    second_most_bought_category,
    most_bought_product,
    second_most_bought_product,
    customer_transactions,
):
    final_df = final_df.merge(feature_frame, on='CustomerID', how='left')


print(final_df.head())

  CustomerID         Region  SignupDate MostBoughtCategory  \
0      C0001  South America  2022-07-10        Electronics   
1      C0002           Asia  2022-02-13         Home Decor   
2      C0003  South America  2024-03-07         Home Decor   
3      C0004  South America  2022-10-09         Home Decor   
4      C0005           Asia  2022-08-15        Electronics   

  SecondMostBoughtCategory        MostBoughtProduct   SecondMostBoughtProduct  \
0               Home Decor       HomeSense Wall Art        TechPro Headphones   
1                 Clothing            BookWorld Rug    BookWorld Cookware Set   
2                 Clothing    ActiveWear Smartwatch        ActiveWear T-Shirt   
3                    Books    ActiveWear Smartwatch               TechPro Rug   
4               Home Decor  ActiveWear Cookware Set  ComfortLiving Headphones   

   total_transactions  avg_transaction_value  
0                 5.0                670.904  
1                 4.0                465.685  

Final Dataframe for making Lookalike Model

In [4]:
# Display the assembled per-customer feature table built in the previous cell.
final_df

Unnamed: 0,CustomerID,Region,SignupDate,MostBoughtCategory,SecondMostBoughtCategory,MostBoughtProduct,SecondMostBoughtProduct,total_transactions,avg_transaction_value
0,C0001,South America,2022-07-10,Electronics,Home Decor,HomeSense Wall Art,TechPro Headphones,5.0,670.904000
1,C0002,Asia,2022-02-13,Home Decor,Clothing,BookWorld Rug,BookWorld Cookware Set,4.0,465.685000
2,C0003,South America,2024-03-07,Home Decor,Clothing,ActiveWear Smartwatch,ActiveWear T-Shirt,4.0,681.345000
3,C0004,South America,2022-10-09,Home Decor,Books,ActiveWear Smartwatch,TechPro Rug,8.0,669.360000
4,C0005,Asia,2022-08-15,Electronics,Home Decor,ActiveWear Cookware Set,ComfortLiving Headphones,3.0,678.080000
...,...,...,...,...,...,...,...,...,...
195,C0196,Europe,2022-06-07,Home Decor,Clothing,ActiveWear Rug,ActiveWear Jacket,4.0,1245.720000
196,C0197,Europe,2023-03-21,Electronics,Home Decor,BookWorld Smartwatch,ActiveWear Wall Art,3.0,642.883333
197,C0198,Europe,2022-02-27,Clothing,Electronics,HomeSense Running Shoes,ComfortLiving Laptop,2.0,465.915000
198,C0199,Europe,2022-12-03,Home Decor,Electronics,HomeSense Wall Art,ActiveWear Rug,4.0,494.820000


### Importance of Each Feature:

Feature Importance:

- `MostBoughtCategory` and `SecondMostBoughtCategory`:
These features are critical for understanding customer preferences at a category level. They help identify which product categories a customer engages with most frequently and their secondary interests.

- `MostBoughtProduct` and `SecondMostBoughtProduct`:
These features provide detailed information about specific product preferences. Knowing a customer's most purchased products enables precise recommendations and cross-selling opportunities.

- `total_transactions`:
This feature reflects the engagement level of a customer with the platform. Customers with a higher number of transactions are typically more active and engaged, making them suitable candidates for loyalty programs or premium services.

- `avg_transaction_value`:
This feature highlights the spending behavior of a customer. It helps distinguish between high-value customers who make infrequent but large purchases and budget-conscious customers who make smaller, frequent purchases.

### Calculating Similarity Index:

> The similarity score is calculated between two customers based on various behavioral, transactional, and demographic features. The algorithm adds points for matching attributes and penalizes differences in spending behavior. Below are the key points of evaluation:

1. **Region Match**:
   - **Criteria**: If both customers belong to the same region.
   - **Points Awarded**: **+10**

2. **Signup Date Proximity**:
   - **Criteria**: If the signup dates of the two customers are within 30 days of each other.
   - **Points Awarded**: **+5**

3. **Most Bought Category Match**:
   - **Criteria**: If the most purchased category of both customers is the same.
   - **Points Awarded**: **+15**

4. **Second Most Bought Category Match**:
   - **Criteria**: If the second most purchased category of both customers is the same.
   - **Points Awarded**: **+10**

5. **Cross Match of Categories**:
   - **Criteria**:
     - If Customer A’s most bought category matches Customer B’s second most bought category.
     - If Customer A’s second most bought category matches Customer B’s most bought category.
   - **Points Awarded**: **+10** (for each instance)

6. **Most Bought Product Match**:
   - **Criteria**: If the most purchased product of both customers is the same.
   - **Points Awarded**: **+20**

7. **Second Most Bought Product Match**:
   - **Criteria**: If the second most purchased product of both customers is the same.
   - **Points Awarded**: **+10**

8. **Cross Match of Products**:
   - **Criteria**:
     - If Customer A’s most bought product matches Customer B’s second most bought product.
     - If Customer A’s second most bought product matches Customer B’s most bought product.
   - **Points Awarded**: **+10** (for each instance)

9. **Total Transactions Match**:
   - **Criteria**: If both customers have the same number of total transactions.
   - **Points Awarded**: **+5**

10. **Average Transaction Value Proximity**:
    - **Criteria**:
      - If the absolute difference in average transaction value is **≤100**.
      - If the difference is **>100**, a penalty is applied based on the difference.
    - **Points Awarded**:
      - **+15** for a difference **≤100**.
      - For differences **>100**, the points are `15 - (difference / 100)`, rounded to 1 decimal place.
      - Note that this value drops below zero once the difference exceeds 1500, so large spending gaps act as a penalty.





In [5]:
def calculate_similarity(customer_id_1, customer_id_2, df):
    """Score how alike two customers are; higher means more similar.

    Parameters
    ----------
    customer_id_1, customer_id_2 : values looked up in ``df['CustomerID']``.
    df : DataFrame with the columns 'CustomerID', 'Region', 'SignupDate',
        'MostBoughtCategory', 'SecondMostBoughtCategory', 'MostBoughtProduct',
        'SecondMostBoughtProduct', 'total_transactions' and
        'avg_transaction_value'.

    Returns
    -------
    int or float
        Sum of the points below. The result is an int unless the spend
        difference exceeds 100, in which case a float rounded to one decimal
        is added.

    Raises
    ------
    IndexError
        If either ID is not present in ``df`` (from ``.iloc[0]`` on an empty
        selection).

    Notes
    -----
    Missing values (NaN) never compare equal, so a customer without purchase
    history earns no points from the category, product or transaction-count
    rules. When the average transaction value of either customer is missing,
    the spend rule contributes nothing, which keeps the score a finite number
    that can be sorted reliably.
    """
    customer_1 = df[df['CustomerID'] == customer_id_1].iloc[0]
    customer_2 = df[df['CustomerID'] == customer_id_2].iloc[0]

    # (field of customer 1, field of customer 2, points awarded when equal)
    match_rules = (
        ('Region', 'Region', 10),
        ('MostBoughtCategory', 'MostBoughtCategory', 15),
        ('SecondMostBoughtCategory', 'SecondMostBoughtCategory', 10),
        # Cross matches: one customer's favourite is the other's runner-up.
        ('MostBoughtCategory', 'SecondMostBoughtCategory', 10),
        ('SecondMostBoughtCategory', 'MostBoughtCategory', 10),
        ('MostBoughtProduct', 'MostBoughtProduct', 20),
        ('SecondMostBoughtProduct', 'SecondMostBoughtProduct', 10),
        ('MostBoughtProduct', 'SecondMostBoughtProduct', 10),
        ('SecondMostBoughtProduct', 'MostBoughtProduct', 10),
        ('total_transactions', 'total_transactions', 5),
    )
    score = sum(
        points
        for field_1, field_2, points in match_rules
        if customer_1[field_1] == customer_2[field_2]
    )

    # Signed up within 30 days of each other.
    signup_diff = abs(pd.to_datetime(customer_1['SignupDate']) - pd.to_datetime(customer_2['SignupDate'])).days
    if signup_diff <= 30:
        score += 5

    avg_transaction_diff = abs(customer_1['avg_transaction_value'] - customer_2['avg_transaction_value'])
    if pd.isna(avg_transaction_diff):
        # At least one customer has no average spend. Adding round(NaN, 1)
        # would turn the whole score into NaN and make sorting by score
        # ill-defined, so this rule awards nothing instead.
        pass
    elif avg_transaction_diff <= 100:
        score += 15
    else:
        # Shrinks by 1 point per 100 of difference; negative beyond 1500.
        score += round(15 - (avg_transaction_diff / 100), 1)

    return score

> **Using the function above, compute the similarity scores for the first 20 customer IDs and store each customer's top 3 lookalikes in `Lookalike.csv`, as required by the assignment.**





In [6]:

# Only the first 20 customers need lookalikes; every customer is a candidate.
first_20_customers = final_df['CustomerID'].head(20).tolist()
all_customer_ids = final_df['CustomerID']

# For each target customer, score every other customer and keep the best three.
lookalike_results = {}
for target_id in first_20_customers:
    candidate_scores = [
        (candidate_id, calculate_similarity(target_id, candidate_id, final_df))
        for candidate_id in all_customer_ids
        if target_id != candidate_id
    ]
    # Stable sort, highest score first: ties keep their original frame order.
    candidate_scores.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[target_id] = candidate_scores[:3]

# Reshape the (id, score) tuples into the dict records used in the output map.
lookalike_map = {
    target_id: [
        {"cust_id": match_id, "score": match_score}
        for match_id, match_score in matches
    ]
    for target_id, matches in lookalike_results.items()
}


print(lookalike_map)

# One row per target customer; the lookalike list is stored as its string form.
lookalike_map_df = pd.DataFrame({
    "CustomerID": list(lookalike_map),
    "LookalikeList": [str(entries) for entries in lookalike_map.values()]
})

lookalike_map_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv has been created with the required Map format.")


{'C0001': [{'cust_id': 'C0113', 'score': 50}, {'cust_id': 'C0152', 'score': 50}, {'cust_id': 'C0085', 'score': 48.0}], 'C0002': [{'cust_id': 'C0178', 'score': 52.9}, {'cust_id': 'C0134', 'score': 50}, {'cust_id': 'C0159', 'score': 47.7}], 'C0003': [{'cust_id': 'C0004', 'score': 60}, {'cust_id': 'C0031', 'score': 58.2}, {'cust_id': 'C0133', 'score': 55}], 'C0004': [{'cust_id': 'C0003', 'score': 60}, {'cust_id': 'C0102', 'score': 55}, {'cust_id': 'C0108', 'score': 55}], 'C0005': [{'cust_id': 'C0055', 'score': 58.9}, {'cust_id': 'C0173', 'score': 55}, {'cust_id': 'C0007', 'score': 53.2}], 'C0006': [{'cust_id': 'C0094', 'score': 57.3}, {'cust_id': 'C0010', 'score': 53.7}, {'cust_id': 'C0013', 'score': 53.0}], 'C0007': [{'cust_id': 'C0005', 'score': 53.2}, {'cust_id': 'C0026', 'score': 50}, {'cust_id': 'C0045', 'score': 50}], 'C0008': [{'cust_id': 'C0144', 'score': 67.7}, {'cust_id': 'C0088', 'score': 50}, {'cust_id': 'C0134', 'score': 48.9}], 'C0009': [{'cust_id': 'C0156', 'score': 61.0}, 