<a href="https://colab.research.google.com/github/Nisjain120/Projects/blob/main/Nischay_Jain_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Load the three raw input tables from the Colab working directory.
customers_df = pd.read_csv('/content/Customers.csv')
products_df = pd.read_csv('/content/Products.csv')
transactions_df = pd.read_csv('/content/Transactions.csv')

# Enrich every transaction with the buyer's region/name and the product's
# category. Both joins are left joins, so a transaction row is kept even when
# its customer or product has no match in the lookup table.
customer_transactions = (
    transactions_df
    .merge(customers_df[['CustomerID', 'Region', 'CustomerName']], on='CustomerID', how='left')
    .merge(products_df[['ProductID', 'Category']], on='ProductID', how='left')
)

# Per-customer headline figures: total spend, number of distinct
# transactions, and the first region/name seen among the customer's rows.
customer_summary = customer_transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    regions=('Region', 'first'),
    customer_name=('CustomerName', 'first'),
).reset_index()

# Quantity purchased per (customer, category), pivoted so that each category
# becomes its own column; combinations that never occur are filled with 0.
category_summary = customer_transactions.groupby(['CustomerID', 'Category']).agg(
    total_quantity=('Quantity', 'sum')
).unstack().fillna(0)

# unstack() leaves two-level column labels ('total_quantity', <category>);
# keep only the category name.
category_summary.columns = [col[1] for col in category_summary.columns]

customer_profile = customer_summary.merge(category_summary, on='CustomerID', how='left')

# groupby drops rows whose Category is missing, so a customer whose
# transactions all lack a category has no row in category_summary and comes
# out of the left merge with NaN in every category column. Those NaNs would
# make the downstream cosine-similarity computation fail, so treat them as
# "bought nothing in this category". Only the category columns are filled;
# missing region/name values are left as they are.
customer_profile = customer_profile.fillna(
    {category: 0 for category in category_summary.columns}
)

# Standardise the numeric features (spend, transaction count, per-category
# quantities); the identifier and descriptive columns are excluded.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_profile.drop(columns=['CustomerID', 'regions', 'customer_name'])
)

# Pairwise cosine similarity between customers; row/column order follows the
# row order of customer_profile.
similarity_matrix = cosine_similarity(scaled_features)

# Maps each target customer ID to a list of up to three
# (lookalike customer ID, similarity score) pairs, best match first.
lookalikes = defaultdict(list)

# Only customers C0001 .. C0020 get lookalike recommendations.
customer_ids_range = [f'C{i:04}' for i in range(1, 21)]

for idx, customer_id in enumerate(customer_profile['CustomerID']):
    if customer_id not in customer_ids_range:
        continue

    sim_scores = similarity_matrix[idx]

    # Rank every *other* customer by similarity. The customer's own entry is
    # removed by position rather than by assuming it sorts first: a tie with
    # an identical profile, or a zero-norm feature row (self-similarity 0),
    # would otherwise let the customer appear as its own lookalike.
    sorted_similarities = sorted(
        ((i, score) for i, score in enumerate(sim_scores) if i != idx),
        key=lambda x: x[1],
        reverse=True,
    )

    # i is a row position in the similarity matrix, so look the ID up
    # positionally (.iloc) instead of by index label.
    top_3 = [
        (customer_profile['CustomerID'].iloc[i], score)
        for i, score in sorted_similarities[:3]
    ]
    lookalikes[customer_id] = top_3

# Flatten the {customer: [(lookalike, score), ...]} mapping into one
# [customer, lookalike, score] row per recommendation.
lookalike_list = []
for source_id, matches in lookalikes.items():
    lookalike_list.extend(
        [source_id, match_id, match_score] for match_id, match_score in matches
    )

# Persist the recommendations (without the DataFrame index) and preview them.
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)

print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0069         0.933006
1      C0001               C0026         0.904200
2      C0001               C0157         0.855824
3      C0002               C0178         0.956485
4      C0002               C0133         0.949490
