#### importing necessary libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

#### loading the datasets

In [4]:
# Read the three input tables (customers, transactions, products) from the
# working directory, in that order.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

#### feature engineering
#### merging customer and transaction data to create customer profiles

In [6]:
# Attach the customer's Region and the product's Category to every transaction.
# Left joins keep each transaction row even when the lookup table has no match
# (the added column is then NaN for that row).
transactions = (
    transactions
    .merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')
    .merge(products[['ProductID', 'Category']], on='ProductID', how='left')
)

#### aggregate the transaction data by customer

In [8]:
def _most_common(values):
    """Return the most frequent non-null value in *values*, or NaN if none exists."""
    # Series.mode() ignores NaN by default, so a customer whose categories are
    # all missing yields an empty result; indexing it directly would raise.
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per customer: total and average spend, number of distinct
# transactions and products, and the category purchased most often
# (ties resolve to the first value returned by mode()).
customer_profile = transactions.groupby('CustomerID').agg(
    total_value=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    num_products=('ProductID', 'nunique'),
    avg_spending=('TotalValue', 'mean'),
    most_common_category=('Category', _most_common),
).reset_index()

#### adding demographic features like region and signup date
#### convert signup date to datetime

In [9]:
# Parse SignupDate into datetimes (in place), then expose the calendar year
# and month of signup as separate columns.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['signup_year'] = customers['SignupDate'].dt.year
customers['signup_month'] = customers['SignupDate'].dt.month

####  merging the demographic features with the transaction based features

In [10]:
# Left-join Region and the signup year/month onto the per-customer aggregates,
# keyed on CustomerID, so every aggregated customer row is kept.
customer_profile = customer_profile.merge(customers[['CustomerID', 'Region', 'signup_year', 'signup_month']], on='CustomerID', how='left')

#### standardize numeric features for similarity calculation

In [13]:
# Scaler that standardizes each column it is fitted on (default settings).
scaler = StandardScaler()

In [15]:
# Transaction-derived columns to standardize. The scaler is fitted on all rows
# of customer_profile and the columns are overwritten with the scaled values.
numeric_features=['total_value', 'num_transactions', 'num_products', 'avg_spending']
customer_profile[numeric_features] = scaler.fit_transform(customer_profile[numeric_features])

#### convert categorical features into one hot encoding

In [16]:
# Replace the Region column with indicator columns named 'Region_<value>'.
# drop_first=True omits the indicator for the first region level, so customers
# in that region have all Region_* indicators equal to zero.
# NOTE(review): dropping a level is usually done for regression models; confirm
# it is intended for a similarity computation.
customer_profile = pd.get_dummies(customer_profile, columns=['Region'], drop_first=True)

#### preparing the data for similarity calculation

In [17]:
# Feature matrix for the similarity step: the standardized numeric columns
# followed by every column whose name starts with 'Region'.
# most_common_category, signup_year and signup_month are not included.
feature_columns = [
    *numeric_features,
    *(name for name in customer_profile.columns if name.startswith('Region')),
]
X = customer_profile[feature_columns]

#### compute the cosine similarity between all customers

In [21]:
# Pairwise cosine similarity between all rows of X; entry [i, j] compares the
# customer in row i with the customer in row j.
cosine_sim = cosine_similarity(X)

#### function to get the top 3 most similar customers for a given customer

In [40]:
def get_top_3_similar_customer(customer_id, sim_matrix, customer_ids, top_n=3):
    """Return the customers most similar to *customer_id*, best match first.

    Args:
        customer_id: ID of the customer to find lookalikes for.
        sim_matrix: Square similarity matrix (indexable by row) whose row and
            column order matches *customer_ids*.
        customer_ids: List of customer IDs, one per row of *sim_matrix*.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of ``(other_customer_id, similarity_score)`` tuples sorted by
        descending score. It has fewer than *top_n* entries when fewer other
        customers exist.

    Raises:
        ValueError: If *customer_id* is not present in *customer_ids*.
    """
    idx = customer_ids.index(customer_id)
    # Exclude the customer by position rather than assuming its self-similarity
    # sorts first: a tie with another customer (or an all-zero row) would
    # otherwise let the customer appear as its own lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(sim_matrix[idx])
        if other_idx != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [(customer_ids[other_idx], score) for other_idx, score in candidates[:top_n]]

In [38]:
# Customer IDs in customer_profile row order, so list positions line up with
# the rows and columns of cosine_sim.
customer_ids = customer_profile['CustomerID'].tolist()
# Filled later: customer ID -> list of (lookalike ID, similarity score) pairs.
top_3_customers = {}

In [39]:
# Compute lookalikes only for the first 20 customers in customer_ids; the
# candidates are still drawn from all customers.
top_3_customers.update({
    source_id: get_top_3_similar_customer(source_id, cosine_sim, customer_ids)
    for source_id in customer_ids[:20]
})

#### preparing the output in the required format

In [42]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per
# (customer, lookalike) pair, then wrap the rows in a DataFrame.
lookalike_data = [
    [source_id, match_id, similarity]
    for source_id, matches in top_3_customers.items()
    for match_id, similarity in matches
]
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

#### saving the output to a csv file

In [43]:
# Write the lookalike table to lookalike.csv without the DataFrame index column.
lookalike_df.to_csv('lookalike.csv', index=False)

In [44]:
# Preview the first rows of the result (head() defaults to five rows).
print(lookalike_df.head())

  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0137         0.999762
1      C0001               C0152         0.999512
2      C0001               C0107         0.964257
3      C0002               C0043         0.987504
4      C0002               C0142         0.977492
