# Task 2: Lookalike Model

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets

In [2]:
# Load the customer table. A raw string keeps the Windows backslashes literal; the
# non-raw form relied on invalid escape sequences (\D, \C), which Python warns about.
# NOTE(review): absolute local path; consider a configurable data directory.
customers = pd.read_csv(r'D:\Datasets\Customers.csv')

In [3]:
# Load the product table. A raw string keeps the Windows backslashes literal; the
# non-raw form relied on invalid escape sequences (\D, \P), which Python warns about.
# NOTE(review): absolute local path; consider a configurable data directory.
products = pd.read_csv(r'D:\Datasets\Products.csv')

In [4]:
# Load the transaction table. A raw string keeps the Windows backslashes literal; the
# non-raw form relied on invalid escape sequences (\D, \T), which Python warns about.
# NOTE(review): absolute local path; consider a configurable data directory.
transactions = pd.read_csv(r'D:\Datasets\Transactions.csv')

# Merge datasets

In [5]:
# Attach the customer columns to every transaction row; the left join keeps
# transactions even when no customer record matches.
data = transactions.merge(customers, how='left', on='CustomerID')

In [6]:
# Attach the product columns to every row; the left join keeps rows whose
# ProductID has no match in the product table.
data = data.merge(products, how='left', on='ProductID')

# ------------------------- Feature Engineering -------------------------

# Aggregate transaction features per customer

In [7]:
# Build one row per customer:
#   TotalValue -> sum and mean,
#   Quantity   -> sum,
#   Category   -> comma-separated string of the categories on the customer's rows.
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'Quantity': 'sum',
    # The left merge with products leaves Category as NaN for unmatched ProductIDs;
    # drop those before joining, because str.join raises TypeError on non-string items.
    'Category': lambda x: ','.join(x.dropna().astype(str))
}).reset_index()

# Rename columns

In [8]:
# Give the aggregated columns flat, descriptive names. Assignment is positional,
# so the order here must match the column order of the aggregation result.
flat_column_names = [
    'CustomerID',
    'TotalSpending',
    'AvgSpending',
    'TotalQuantity',
    'Categories',
]
customer_features.columns = flat_column_names

# Encode product categories into features

In [9]:
# Split the comma-separated category string into one 0/1 indicator column per category,
# then pass the indicators through the encoder.
# drop='if_binary' keeps a single output column for each two-valued indicator. Without it,
# every category yields both an "absent" and a "present" column, and the shared "absent"
# columns inflate the cosine similarity computed from these features.
category_encoder = OneHotEncoder(drop='if_binary')
categories_encoded = category_encoder.fit_transform(
    customer_features['Categories'].str.get_dummies(sep=',')
).toarray()

# Combine all features into a single DataFrame

In [10]:
# Place the customer ID and spend/quantity aggregates side by side with the
# encoded category columns (column-wise concatenation).
encoded_category_frame = pd.DataFrame(
    categories_encoded,
    columns=category_encoder.get_feature_names_out(),
)
aggregate_columns = ['CustomerID', 'TotalSpending', 'AvgSpending', 'TotalQuantity']
features = pd.concat(
    [customer_features[aggregate_columns], encoded_category_frame],
    axis=1,
)

# Normalize all feature columns (numeric aggregates and encoded categories) to the [0, 1] range

In [11]:
# Rescale every column after the leading CustomerID with MinMaxScaler, then rebuild
# a labelled frame indexed by customer ID.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features.iloc[:, 1:])
features_normalized = pd.DataFrame(
    data=features_scaled,
    index=features['CustomerID'],
    columns=features.columns[1:],
)

# ------------------------- Similarity Calculation -------------------------

# Compute pairwise cosine similarity

In [12]:
# Cosine similarity between every pair of rows (customers) of the scaled feature frame.
similarity_matrix = cosine_similarity(X=features_normalized)

# Create a DataFrame for similarity matrix

In [13]:
# Label both axes of the similarity matrix with customer IDs so scores can be looked up by ID.
customer_ids = features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# ------------------------- Recommendation for Top 3 Lookalikes -------------------------

In [14]:
# Maps a customer ID to its list of (lookalike customer ID, similarity score) pairs.
lookalikes = dict()

In [15]:
# For each of the first 20 customers, keep the three most similar *other* customers.
for customer_id in features['CustomerID'][:20]:
    # Remove the customer's own entry by label instead of assuming it sorts first:
    # a tie at the top score, or a self-similarity that is not the maximum (e.g. an
    # all-zero feature row), would otherwise leave the customer in their own list
    # and push out a genuine match.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Store plain Python floats so that converting the list to a string does not
    # embed NumPy scalar reprs such as "np.float64(...)" (NumPy >= 2).
    lookalikes[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Convert to a DataFrame for output

In [16]:
# One row per customer: the ID plus the string form of its (lookalike ID, score) list.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes),
    'Lookalikes': list(map(str, lookalikes.values())),
})

In [17]:
# Write the lookalike table to CSV in the working directory, without the index column.
lookalike_df.to_csv(path_or_buf='Satyajeet_Pawar_Lookalike.csv', index=False)

# ------------------------- Output -------------------------

In [18]:
# Confirmation message for the reader; the text is assembled from adjacent literals.
print(
    "Lookalike recommendations for the first 20 customers "
    "saved to 'Satyajeet_Pawar_Lookalike.csv'"
)

Lookalike recommendations for the first 20 customers saved to 'Satyajeet_Pawar_Lookalike.csv'
