<a href="https://colab.research.google.com/github/Rishiii5455/Zeotap-Intership-Assignment/blob/main/Riyanshu_Rai_lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Convert dates to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Check for missing values and duplicates
print("Missing values in Customers:\n", customers.isnull().sum())
print("Missing values in Products:\n", products.isnull().sum())
print("Missing values in Transactions:\n", transactions.isnull().sum())

# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Check the columns in merged_data
print("Columns in merged_data:", merged_data.columns)

# Prepare customer features
# Check if 'Price' exists in merged_data
if 'Price' in merged_data.columns:
    customer_features = merged_data.groupby('CustomerID').agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Price': 'mean'  # This line will only work if 'Price' exists
    }).reset_index()
else:
    print("Price column does not exist. Adjusting feature engineering.")
    customer_features = merged_data.groupby('CustomerID').agg({
        'TotalValue': 'sum',
        'Quantity': 'sum'
        # Exclude 'Price' if it doesn't exist
    }).reset_index()

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])  # Exclude 'CustomerID'

# Compute cosine similarity
similarities = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarities, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Get top 3 similar customers for the first 20
lookalikes = {}
for cust_id in customer_features['CustomerID'][:20]:
    similar_customers = similarity_df.loc[cust_id].sort_values(ascending=False)[1:4]  # Exclude self
    lookalikes[cust_id] = similar_customers.items()

# Save results
lookalikes_df = pd.DataFrame(lookalikes)
lookalikes_df.to_csv('Riyanshu_Rai_Lookalike.csv', index=False)
print("Lookalike results saved to 'Riyanshu_Rai_Lookalike.csv'")


Missing values in Customers:
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
Missing values in Products:
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
Missing values in Transactions:
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64
Columns in merged_data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
Price column does not exist. Adjusting feature engineering.
Lookalike results saved to 'Riyanshu_Rai_Lookalike.csv'
