In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Read the three input tables from the current working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Enrich every transaction with its customer's columns, then its product's.
# Left joins keep each transaction row even when a lookup finds no match.
merged_df = transactions.merge(customers, how='left', on='CustomerID')
merged_df = merged_df.merge(products, how='left', on='ProductID')

# Feature Engineering
# Whole days between each row's SignupDate and the fixed 2025-01-01 reference.
reference_date = pd.to_datetime('2025-01-01')
signup_dates = pd.to_datetime(merged_df['SignupDate'])
merged_df['SignupDuration'] = (reference_date - signup_dates).dt.days

# Collapse the transaction-level rows into one feature row per customer.
transactions_by_customer = merged_df.groupby('CustomerID')
customer_features = transactions_by_customer.agg(
    # Region as found on the customer's first row in the group.
    Region=('Region', lambda values: values.iloc[0]),
    SignupDuration=('SignupDuration', 'max'),
    # Sum of TotalValue over the customer's transactions.
    TotalSpending=('TotalValue', 'sum'),
    # Number of non-null TransactionID entries.
    TotalTransactions=('TransactionID', 'count'),
    # Number of distinct Category values among the customer's rows.
    ProductDiversity=('Category', lambda values: len(set(values))),
)
customer_features = customer_features.reset_index()

# One-hot encode Region; the resulting Region_* columns are appended last.
customer_features = pd.get_dummies(customer_features, columns=['Region'])

print("\nRaw Feature Vectors:")
print(customer_features)

# Everything except the identifier is a model feature. Keeping this frame
# around lets the scaled output be labelled from the exact columns that were
# scaled, instead of relying on CustomerID sitting at column position 0.
feature_frame = customer_features.drop(columns=['CustomerID'])

# MinMaxScaler's default feature_range maps each column onto [0, 1], so
# large-magnitude columns do not dominate the similarity computed later.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Reuse the source frame's index so the CustomerID insert below (which aligns
# on index labels) cannot silently produce NaN if the index is ever non-default.
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=customer_features.index,
)
scaled_features_df.insert(0, 'CustomerID', customer_features['CustomerID'])

print("\nScaled Feature Vectors:")
print(scaled_features_df)

# Pairwise cosine similarity between the scaled feature rows; row/column i
# corresponds to customer_ids[i] because both come from customer_features.
similarity_matrix = cosine_similarity(scaled_features)

# Recommendations
TOP_N_LOOKALIKES = 3  # how many most-similar customers to keep per customer
customer_ids = customer_features['CustomerID'].values
lookalike_map = {}
for idx, cust_id in enumerate(customer_ids):
    # Pair every other customer with its similarity to cust_id.
    # Scores are cast to built-in float: the list is written to CSV via its
    # string form, and NumPy >= 2 renders np.float64 elements as
    # "np.float64(0.98...)", which would corrupt the exported scores.
    similarities = [
        (customer_ids[i], float(score))
        for i, score in enumerate(similarity_matrix[idx])
        if customer_ids[i] != cust_id
    ]
    # Stable descending sort: ties keep their original customer order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[cust_id] = similarities[:TOP_N_LOOKALIKES]

# One row per customer; 'lookalikes' holds the list of (CustomerID, score)
# tuples, which to_csv serialises using the list's string representation.
lookalike_df = pd.DataFrame([
    {'cust_id': cust, 'lookalikes': lookalikes} for cust, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv('Sahil_Agrawal_Lookalike.csv', index=False)

print("\nLookalike Map:")
print(lookalike_df)



Raw Feature Vectors:
    CustomerID  SignupDuration  TotalSpending  TotalTransactions  \
0        C0001             906        3354.52                  5   
1        C0002            1053        1862.74                  4   
2        C0003             300        2725.38                  4   
3        C0004             815        5354.88                  8   
4        C0005             870        2034.24                  3   
..         ...             ...            ...                ...   
194      C0196             939        4982.88                  4   
195      C0197             652        1928.65                  3   
196      C0198            1039         931.83                  2   
197      C0199             760        1979.28                  4   
198      C0200             570        4758.60                  5   

     ProductDiversity  Region_Asia  Region_Europe  Region_North America  \
0                   3        False          False                 False   
1          