<a href="https://colab.research.google.com/github/RChandana/ZeotapAssignment/blob/main/Lookalike_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing


In [1]:
import pandas as pd

In [9]:
# Read the three raw input tables (customers, products, transactions) from
# CSV files addressed by relative path.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [20]:
# Print the per-column count of missing values for each loaded table
# (customers first, then products, then transactions).
for table in (customers_df, products_df, transactions_df):
    print(table.isnull().sum())

CustomerID         0
CustomerName       0
Region             0
SignupDate         0
DaysSinceSignup    0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [21]:
# Drop fully duplicated rows from every table; each frame is modified in place.
for table in (customers_df, products_df, transactions_df):
    table.drop_duplicates(inplace=True)

In [22]:
# Enrich every transaction row with its customer's attributes and then with
# its product's attributes. Left joins keep all transaction rows even when a
# CustomerID or ProductID has no match in the lookup table.
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID", how="left")
    .merge(products_df, on="ProductID", how="left")
)

# Feature Engineering

In [23]:
# Convert SignupDate to a datetime and derive the account age in whole days.
# NOTE: the reference point is the date at execution time, so DaysSinceSignup
# (and anything computed from it) changes depending on when the notebook runs.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['DaysSinceSignup'] = (pd.to_datetime('today') - customers_df['SignupDate']).dt.days

# Merge 'DaysSinceSignup' back into merged_df.
# Any DaysSinceSignup column already present in merged_df (because this cell
# was run before, or because customers_df already carried the column when
# merged_df was built) is dropped first. Without this, the merge would emit
# DaysSinceSignup_x / DaysSinceSignup_y instead of a single column, making the
# cell non-idempotent.
merged_df = pd.merge(
    merged_df.drop(columns=['DaysSinceSignup'], errors='ignore'),
    customers_df[['CustomerID', 'DaysSinceSignup']],
    on="CustomerID",
    how="left",
)

In [24]:
# Collapse the transaction-level rows into one row per customer:
#   total_spent        - sum of TotalValue
#   transaction_count  - number of non-null TransactionID values
#   unique_products    - number of distinct ProductID values
customer_transactions = (
    merged_df
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
    .reset_index()
)

# Handle Missing Values

In [29]:
# Check for missing values in the aggregated per-customer transaction metrics
print(customer_transactions.isnull().sum())

# Zero-filled copy of the aggregates (name kept so other cells that refer to
# it keep working).
customer_transactions_filled = customer_transactions.fillna(0)

# Build the full profile: one row per customer in customers_df, carrying
# DaysSinceSignup plus the aggregated transaction metrics.
customer_profile = pd.merge(
    customers_df[['CustomerID', 'DaysSinceSignup']],
    customer_transactions_filled,
    on="CustomerID",
    how="left",
)

# The left join above is what actually introduces missing values: a customer
# with no transactions has no row in the aggregates and therefore gets NaN for
# every metric. Filling has to happen after the merge to cover those rows.
# Only the transaction metrics are filled; DaysSinceSignup is left untouched.
customer_profile = customer_profile.fillna(
    {'total_spent': 0, 'transaction_count': 0, 'unique_products': 0}
)


CustomerID           0
total_spent          0
transaction_count    0
unique_products      0
dtype: int64


# Compute Cosine Similarity

In [31]:
# NOTE(review): `cosine_similarity`, `normalized_profile` and
# `customer_profile_filled` are not defined in the cells visible here —
# confirm an earlier cell provides them before running on a fresh kernel.

# Customer IDs label both axes of the pairwise similarity matrix
customer_ids = customer_profile_filled['CustomerID']

# Pairwise cosine similarity between the rows of the normalized profile
cosine_sim = cosine_similarity(normalized_profile)

# Wrap the raw matrix in a DataFrame so it can be looked up by CustomerID
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_ids, columns=customer_ids)

# Preview the first rows of the similarity matrix
print(cosine_sim_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.842496 -0.703567  0.373613  0.569021 -0.623930   
C0002       0.842496  1.000000 -0.222777 -0.151507  0.893859 -0.482070   
C0003      -0.703567 -0.222777  1.000000 -0.913036  0.177002  0.627609   
C0004       0.373613 -0.151507 -0.913036  1.000000 -0.550140 -0.561915   
C0005       0.569021  0.893859  0.177002 -0.550140  1.000000 -0.067286   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.656720 -0.130057 -0.033476  0.525564  ... -0.935250  0.707604   
C0002       0.897878 -0.503516  0.509486  0.898783  ... -0.682563  0.976436   
C0003       0.043814 -0.520601  0.717476  0.200394  ...  0.752813 -0.007392   
C0004      -0.441523  0.807513 -0.887509 -0.509644  ... -0.425705 -0.355521   
C0005  

# Lookalike Recommendations

In [32]:
def get_top_n_similar(customers, cosine_sim_matrix, top_n=3):
    """Return the `top_n` most similar other customers for each customer.

    Parameters
    ----------
    customers : iterable
        Customer IDs to produce recommendations for. Each ID must be a column
        label of `cosine_sim_matrix`.
    cosine_sim_matrix : pandas.DataFrame
        Similarity scores, indexed by customer ID on both rows and columns.
    top_n : int, default 3
        Maximum number of lookalikes to keep per customer.

    Returns
    -------
    dict
        Maps each customer ID to a pandas Series of similarity scores, indexed
        by the lookalike's customer ID and ordered from highest to lowest.
    """
    def _nearest(cust_id):
        # Rank every customer by similarity to `cust_id`, best match first
        ranked = cosine_sim_matrix[cust_id].sort_values(ascending=False)
        # A customer is not its own lookalike, so mask out its own entry
        is_other = ranked.index != cust_id
        return ranked[is_other].head(top_n)

    return {cust_id: _nearest(cust_id) for cust_id in customers}

In [33]:
top_lookalikes = get_top_n_similar(customer_profile_filled['CustomerID'], cosine_sim_df, top_n=3)

# Flatten the {customer -> Series of scores} mapping into one row per
# (customer, lookalike) pair so it can be written out as a flat table.
lookalike_list = [
    [cust_id, similar_cust_id, score]
    for cust_id, lookalikes in top_lookalikes.items()
    for similar_cust_id, score in lookalikes.items()
]

lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

In [34]:
# Write the recommendations to Lookalike.csv, leaving out the positional row index
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

# Show the first rows of the recommendation table
print(lookalike_df.head())

  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0152         0.999752
1      C0001               C0160         0.988904
2      C0001               C0134         0.977309
3      C0002               C0029         0.997114
4      C0002               C0166         0.989326
