In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [None]:
# Load the raw input tables from the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
# `transactions` is used by later cells (CustomerID, ProductID, TransactionID,
# TotalValue) but was never loaded, so a fresh-kernel run failed with NameError.
# NOTE(review): the file name is assumed to follow the other two inputs — confirm.
transactions = pd.read_csv('Transactions.csv')


In [None]:
# Parse the signup column once, store it back, and derive the account age
# from the parsed values: whole elapsed days divided by 365.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['SignupDate'] = signup_dates
elapsed_since_signup = pd.to_datetime('today') - signup_dates
customers['YearsSinceSignup'] = elapsed_since_signup.dt.days / 365

In [None]:
# One-hot encode Region into a dense array, then wrap it in a DataFrame whose
# column labels are the feature names reported by the fitted encoder.
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customers[['Region']])
region_columns = encoder.get_feature_names_out(['Region'])
region_df = pd.DataFrame(data=region_encoded, columns=region_columns)

In [None]:
# Profile features: customer key and account age, placed side by side with
# the Region indicator columns (rows are matched on the index).
customer_features = pd.concat(
    [customers.loc[:, ['CustomerID', 'YearsSinceSignup']], region_df],
    axis=1,
)

In [None]:
# Per-customer spending summary: total value, number of transactions and
# mean transaction value, with CustomerID restored as a regular column.
transactions_by_customer = transactions.groupby('CustomerID')
transaction_data = transactions_by_customer.agg(
    TotalSpent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    NumTransactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    AvgTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
).reset_index()

In [None]:
# Join profile features with the spending summary on the customer key.
customer_data = customer_features.merge(transaction_data, on='CustomerID')

# Look up each transaction's product category and expand it into one
# indicator column per category (one row per transaction).
category_by_product = products.set_index('ProductID')['Category']
transaction_categories = transactions['ProductID'].map(category_by_product)
category_data = pd.get_dummies(transaction_categories)


In [None]:
# Tag every per-transaction category row with its customer so the indicators
# can be collapsed to one row per customer.
# NOTE: this cell overwrites `category_data`; re-run the cell that builds the
# per-transaction indicators before re-running this one.
category_data['CustomerID'] = transactions['CustomerID']
# True when the customer has at least one transaction in the category.
# The built-in GroupBy.any replaces a per-group Python lambda.
category_data = category_data.groupby('CustomerID').any().reset_index()

# Join (pandas default: inner) the customer features with the category flags.
full_data = pd.merge(customer_data, category_data, on='CustomerID')

# Standardize every feature column, then compute pairwise cosine similarity.
# Column labels are taken from the feature frame itself rather than from a
# positional slice that assumed CustomerID was the first column.
feature_frame = full_data.drop(columns='CustomerID')
scaler = StandardScaler()
normalized_data = pd.DataFrame(scaler.fit_transform(feature_frame), columns=feature_frame.columns)
cosine_sim = cosine_similarity(normalized_data)

In [45]:
def get_top_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the module-level ``full_data`` frame (one row per customer, with a
    ``CustomerID`` column) and the ``cosine_sim`` matrix whose row/column
    positions correspond to the row positions of ``full_data``.

    Args:
        customer_id: Identifier to look up in ``full_data['CustomerID']``.
        top_n: Maximum number of lookalikes to return (negative values are
            treated as 0).

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples ordered from most
        to least similar, never containing ``customer_id`` itself. An empty
        list is returned, after printing a warning, when the customer is not
        present in ``full_data``.
    """
    is_target = (full_data['CustomerID'] == customer_id).to_numpy()
    if not is_target.any():
        print(f"Warning: Customer ID {customer_id} not found in data.")
        return []

    # Row position (not index label) of the first match, so the lookup into
    # cosine_sim stays correct even if full_data does not have a 0..n-1 index.
    customer_pos = int(is_target.argmax())

    # Exclude the customer by position instead of assuming it sorts first:
    # a tied or duplicate customer can rank ahead of it, which used to make
    # the customer appear as its own lookalike.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[customer_pos])
        if pos != customer_pos
    ]
    # Stable descending sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    customer_ids = full_data['CustomerID'].to_numpy()
    return [(customer_ids[pos], score) for pos, score in candidates[:max(top_n, 0)]]

In [46]:
# Find lookalikes for the first twenty customers, C0001 through C0020.
target_customer_ids = [f'C{number:04d}' for number in range(1, 21)]
lookalikes = {
    customer_id: get_top_lookalikes(customer_id)
    for customer_id in target_customer_ids
}

# Flatten to one row per (customer, lookalike) pair; a customer with no
# lookalikes still gets a single placeholder row with NaN values.
lookalikes_data = []
for customer_id, matches in lookalikes.items():
    if not matches:
        lookalikes_data.append([customer_id, np.nan, np.nan])
        continue
    lookalikes_data.extend(
        [customer_id, lookalike_id, similarity_score]
        for lookalike_id, similarity_score in matches
    )

output_columns = ['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score']
lookalikes_df = pd.DataFrame(lookalikes_data, columns=output_columns)
# index=False keeps the DataFrame index out of the CSV.
lookalikes_df.to_csv('Lookalike.csv', index=False)

print(lookalikes['C0001'])

[('C0152', 0.9968236860714906), ('C0174', 0.9780163559440436), ('C0004', 0.8332140770852455)]
