<a href="https://colab.research.google.com/github/Upanshujha/Zeotap/blob/main/Upanshu_jha_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [2]:
# Load data
# The three input CSVs live in one directory; change DATA_DIR to run the
# notebook outside Colab (where uploads land in /content).
DATA_DIR = "/content"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [3]:
# Preprocess transactions: parse the TransactionDate column into datetimes
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

In [4]:
# Merge datasets: attach customer columns, then product columns, to every
# transaction row (default inner joins on the shared key columns).
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged = pd.merge(transactions_with_customers, products, on='ProductID')

In [5]:
# Feature engineering: collapse the merged transaction rows into one row per
# CustomerID with spend, frequency, category and recency aggregates.
customer_features = merged.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    purchase_frequency=('TransactionID', 'count'),
    # mode() may return several values when categories tie; keep the first.
    favorite_category=('Category', lambda x: x.mode()[0]),
    # Use the string alias instead of the builtin `max`: passing the builtin
    # callable emits a pandas FutureWarning and its meaning is slated to change.
    last_purchase=('TransactionDate', 'max')
).reset_index()

  customer_features = merged.groupby('CustomerID').agg(


In [6]:
# Add customer metadata
# Drop any metadata columns left over from a previous run of this cell before
# merging, so re-running it does not produce suffixed duplicates (Region_x /
# Region_y) that would break the later cells. On a fresh run nothing is
# dropped -- assumes the customers columns do not share names with the
# aggregated features; verify if the schema changes.
metadata_columns = [col for col in customers.columns if col != 'CustomerID']
customer_features = (
    customer_features
    .drop(columns=metadata_columns, errors='ignore')
    .merge(customers, on='CustomerID')
)

In [7]:
# Calculate tenure and recency
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])

# Measure both features against one fixed reference point taken from the data
# (the most recent purchase) instead of calling datetime.now() twice. The
# wall-clock version gave different day counts depending on when the notebook
# was run, because .dt.days floors each timedelta; a data-derived reference
# makes the features reproducible.
reference_date = customer_features['last_purchase'].max()

# Days between signup and the reference date.
customer_features['tenure'] = (reference_date - customer_features['SignupDate']).dt.days
# Days between the customer's latest purchase and the reference date.
customer_features['recency'] = (reference_date - customer_features['last_purchase']).dt.days

In [8]:
# Preprocessing pipeline: standardise the numeric features and one-hot encode
# the categorical ones, then fit and apply it to the customer feature table.
numeric_columns = ['total_spent', 'avg_transaction_value',
                   'purchase_frequency', 'tenure', 'recency']
categorical_columns = ['favorite_category', 'Region']

# Columns handed to the transformer, in the same order as before.
input_columns = ['total_spent', 'avg_transaction_value', 'purchase_frequency',
                 'favorite_category', 'Region', 'tenure', 'recency']

scaling_step = ('num', StandardScaler(), numeric_columns)
encoding_step = ('cat', OneHotEncoder(), categorical_columns)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

model_input = customer_features[input_columns]
processed_features = preprocessor.fit_transform(model_input)

In [9]:
# Calculate similarity matrix: pairwise cosine similarity between every pair
# of customer feature rows (row/column i corresponds to row i of the features).
similarity_matrix = cosine_similarity(processed_features, processed_features)

In [10]:
# Generate lookalike mappings
# For each target customer, collect the TOP_N other customers with the highest
# cosine similarity as (CustomerID, score) tuples.
TOP_N = 3
lookalike_map = []
target_customers = [f"C00{i:02d}" for i in range(1, 21)]

# Map each CustomerID to its row *position* once, up front. Rows of
# similarity_matrix are positional, so this avoids relying on the DataFrame
# index labels happening to equal positions, and avoids a full column scan per
# target. setdefault keeps the first occurrence if an ID were ever repeated.
customer_ids = customer_features['CustomerID'].tolist()
position_by_id = {}
for position, customer_id in enumerate(customer_ids):
    position_by_id.setdefault(customer_id, position)

for cust_id in target_customers:
    idx = position_by_id.get(cust_id)
    if idx is None:
        # Best effort: a target without any features is reported and skipped.
        print(f"Customer {cust_id} not found in features")
        continue

    # Exclude the customer itself by position. Slicing off the first sorted
    # entry is not safe: another customer with an equal (or, through rounding,
    # higher) score can sort ahead of the self-match, which would leave the
    # customer in its own lookalike list.
    scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:TOP_N]

    # Create list of (cust_id, score) tuples; cast to a plain float so the
    # stored values are not numpy scalars.
    lookalikes = [
        (customer_ids[i], round(float(score), 4))
        for i, score in sorted_scores
    ]

    lookalike_map.append({
        'CustomerID': cust_id,
        'Lookalikes': lookalikes
    })

In [11]:
# Convert the lookalike records to a DataFrame.
# Naming the columns explicitly keeps both columns present even when
# lookalike_map is empty, so later column access does not raise KeyError.
lookalike_df = pd.DataFrame(lookalike_map, columns=['CustomerID', 'Lookalikes'])

In [12]:
# Format the lookalikes column as required, then write the result to disk.
def _format_lookalikes(pairs):
    """Turn a list of (customer id, score) pairs into "id,score" strings."""
    formatted = []
    for cust_id, score in pairs:
        formatted.append(f"{cust_id},{score}")
    return formatted


lookalike_df['Lookalikes'] = lookalike_df['Lookalikes'].apply(_format_lookalikes)

lookalike_df.to_csv("Lookalike.csv", index=False)