In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [15]:
# Load datasets
# Read the three source CSVs (customers, products, transactions) in that order
# and bind each resulting DataFrame to its own name.
customers_df, products_df, transactions_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        r"./dataset/Customers.csv",
        r"./dataset/Products.csv",
        r"./dataset/Transactions.csv",
    )
]

In [16]:
# Convert dates to datetime
# Each (frame, column) pair names a date column that is parsed and written
# back to the same column of the same frame.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [17]:
# Feature Engineering
# Total spend per customer: sum TotalValue within each CustomerID group.
# The result is a Series indexed by CustomerID.
value_by_customer = transactions_df['TotalValue'].groupby(transactions_df['CustomerID'])
customer_spend = value_by_customer.sum()

In [18]:
# Total transactions per customer: count the non-null TransactionID values
# within each CustomerID group. The result is a Series indexed by CustomerID.
transaction_ids_by_customer = transactions_df['TransactionID'].groupby(transactions_df['CustomerID'])
customer_transactions = transaction_ids_by_customer.count()

In [19]:
# Average transaction value: total spend divided by transaction count,
# aligned element-wise on the CustomerID index of both Series.
avg_transaction_value = customer_spend.div(customer_transactions)

# Most purchased product categories
# Attach product attributes to every transaction (pandas' default inner join
# on ProductID), then total the Quantity per (CustomerID, Category) pair.
transactions_products = pd.merge(transactions_df, products_df, on='ProductID')
quantity_by_category = (
    transactions_products
    .groupby(['CustomerID', 'Category'])['Quantity']
    .sum()
)
# Pivot categories into columns; a customer/category pair with no purchases
# comes out of unstack() as NaN and is replaced by 0.
category_pref = quantity_by_category.unstack().fillna(0)

In [20]:
# Combine features into a single DataFrame
# All three Series come from the same CustomerID grouping of transactions_df,
# so they share one index; start from the spend column and attach the others.
features = (
    customer_spend
    .to_frame('TotalSpend')
    .assign(
        TotalTransactions=customer_transactions,
        AvgTransactionValue=avg_transaction_value,
    )
    .fillna(0)  # any undefined metric is treated as 0
)

In [21]:
# Assemble the full feature matrix: base spend metrics + per-category
# quantities + one-hot encoded region.
#
# Always start from the three base metric columns instead of whatever
# `features` currently holds. Joining onto the already-joined frame (i.e.
# re-running this cell) would otherwise raise
# "columns overlap but no suffix specified".
base_metric_columns = ['TotalSpend', 'TotalTransactions', 'AvgTransactionValue']

# Add customer profile features (e.g., Region)
# dtype=float keeps the indicator columns numeric even when the left join
# below introduces missing rows.
region_encoded = pd.get_dummies(
    customers_df.set_index('CustomerID')['Region'],
    prefix='Region',
    dtype=float,
)

# Both joins are left joins on the CustomerID index, so a customer absent from
# category_pref or from customers_df would get NaN in those columns.
# cosine_similarity rejects NaN input, so treat "no data" as 0.
features = (
    features[base_metric_columns]
    .join(category_pref)
    .join(region_encoded)
    .fillna(0)
)

In [22]:
# Normalize features
# Standardize every feature column with StandardScaler, then re-wrap the
# resulting array with the row and column labels of `features`.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
customer_index = features.index
features_normalized = pd.DataFrame(
    scaled_values,
    index=customer_index,
    columns=features.columns,
)

# Compute similarity matrix
# Pairwise cosine similarity between every pair of standardized feature rows.
similarity_matrix = cosine_similarity(features_normalized)

# Create a DataFrame for similarity scores
# Rows and columns both carry the index labels of `features`, so
# similarity_df[a][b] is the score between customers a and b.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_index,
    columns=customer_index,
)

In [23]:
# Generate Lookalike Recommendations
# Maps each target customer to a list of (other_customer_id, similarity_score)
# tuples, best match first.
N_TARGET_CUSTOMERS = 20  # how many customers (from the top of customers_df) get recommendations
TOP_K = 3                # how many lookalikes to keep per customer

lookalike_map = {}
for cust_id in customers_df['CustomerID'].iloc[:N_TARGET_CUSTOMERS]:
    if cust_id not in similarity_df.columns:
        # A customer with no transactions has no feature row and therefore no
        # similarity column; record an empty list instead of raising KeyError.
        lookalike_map[cust_id] = []
        continue

    # Remove the customer's own entry by label rather than assuming it sorts
    # first: on a tie with another customer, positional slicing could keep the
    # customer in their own lookalike list.
    top_matches = (
        similarity_df[cust_id]
        .drop(labels=cust_id)
        .sort_values(ascending=False)
        .head(TOP_K)
    )

    # Store plain Python floats so that a later str() of this list does not
    # render NumPy scalars as "np.float64(...)" (NumPy >= 2 repr).
    lookalike_map[cust_id] = [
        (other_id, float(score)) for other_id, score in top_matches.items()
    ]


In [24]:
# Save to Lookalike.csv
# One output row per customer in lookalike_map; the list of lookalikes is
# stored as its string representation in the 'Lookalikes' column.
lookalike_records = [
    (customer, str(matches)) for customer, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_records, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Saksham_Garg_Lookalike.csv', index=False)