In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:

# Read the three raw CSV files from the current working directory, in the
# order customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Build one wide table: first attach product columns to every transaction
# (join on ProductID), then attach customer columns (join on CustomerID).
# NOTE(review): both joins are inner joins, so rows without a matching key are
# dropped and customers with no transactions never reach `data` - confirm
# that this is intended.
transactions_products = transactions.merge(right=products, how='inner', on='ProductID')
data = transactions_products.merge(right=customers, how='inner', on='CustomerID')

In [4]:
# Feature engineering: collapse the merged table to one row per customer.
customer_features = (
    data
    .groupby('CustomerID')
    .agg(
        total_spent=('TotalValue', 'sum'),              # sum of TotalValue over the customer's rows
        total_transactions=('TransactionID', 'count'),  # number of non-null TransactionID values
        avg_transaction_value=('TotalValue', 'mean'),   # mean TotalValue per row
        product_diversity=('ProductID', 'nunique'),     # number of distinct ProductID values
        # Region is taken from the customer's first row.
        # NOTE(review): assumes Region is constant per customer - confirm.
        region=('Region', lambda regions: regions.iloc[0]),
    )
    .reset_index()  # turn the CustomerID group key back into a regular column
)


In [5]:
# One-hot encode the categorical `region` column. drop_first=True omits the
# indicator column of the first category.
# NOTE(review): this rebinds `customer_features`, so re-running only this cell
# fails because the `region` column has already been replaced.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['region'],
    drop_first=True,
)


In [6]:
# Standardise every feature column with StandardScaler so that no single
# feature dominates the cosine similarity computed afterwards.
# CustomerID is an identifier, not a feature: exclude it by name instead of
# relying on it being the first column (the original `iloc[:, 1:]` would
# silently scale the wrong columns if the column order ever changed).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))

In [7]:
# Pairwise cosine similarity between the scaled feature rows: entry [i, j]
# compares row i with row j, in the same row order as `customer_features`.
similarity_matrix = cosine_similarity(X=scaled_features)


In [8]:
# Top 3 lookalikes for each customer.
#
# lookalike_map: CustomerID -> list of up to three (other CustomerID, score)
# tuples, ordered from most to least similar.

# Plain list so that positions line up with the rows of `similarity_matrix`
# regardless of the DataFrame's index labels.
customer_ids = customer_features['CustomerID'].tolist()

lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # Exclude the customer itself by position. Skipping "whatever sorts first"
    # is wrong when another row has an equal (or rounding-higher) score, or
    # when the row is all zeros and its self-similarity is 0.
    candidates = [
        (j, score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # sorted() is stable, so tied scores keep ascending row order.
    top_similar = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    # Cast to built-in float so the scores serialise as plain numbers rather
    # than NumPy scalar reprs when the list is written out as text.
    lookalike_map[customer_id] = [
        (customer_ids[j], float(score)) for j, score in top_similar
    ]

In [9]:
# Create Lookalike.csv for customers C0001 to C0020.

# Build the target IDs once (the original rebuilt this list on every loop
# iteration); a set gives constant-time membership checks.
target_ids = {f'C{i:04d}' for i in range(1, 21)}

# One record per target customer, in the row order of `customer_features`.
lookalike_list = [
    {'cust_id': customer_id, 'lookalikes': lookalike_map[customer_id]}
    for customer_id in customer_features['CustomerID']
    if customer_id in target_ids
]

# Explicit columns keep the CSV header even if none of the target IDs are present.
lookalike_df = pd.DataFrame(lookalike_list, columns=['cust_id', 'lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

In [10]:
# Preview the first five rows of the exported lookalike table.
print(lookalike_df.head(n=5))

  cust_id                                         lookalikes
0   C0001  [(C0137, 0.9999196989131426), (C0152, 0.999836...
1   C0002  [(C0142, 0.976639238739811), (C0043, 0.9722585...
2   C0003  [(C0133, 0.9950073885620306), (C0052, 0.967021...
3   C0004  [(C0108, 0.9814012933872766), (C0113, 0.978655...
4   C0005  [(C0159, 0.9995139993894759), (C0123, 0.984492...
