In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


data = {
    'CustomerID': [1, 2, 3, 4, 5],
    'TotalValue': [500, 700, 300, 400, 600],
    'Quantity': [50, 70, 30, 40, 60]
}
transactions = pd.DataFrame(data)

# Save the simulated data (optional, for reference)
transactions.to_csv('../data/merged_dataset.csv', index=False)

# Load the dataset (if using an existing one, adjust the file path)
merged = transactions  # Use this line if loading directly: pd.read_csv('../data/merged_dataset.csv')

# Step 2: Aggregate Customer Transaction Data
customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

# Step 3: Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])

# Step 4: Compute Similarities
similarity_matrix = cosine_similarity(scaled_features)

# Step 5: Find Top 3 Lookalike Customers for Each Customer
similar_customers = {}
for idx, customer in enumerate(customer_features['CustomerID']):
    scores = list(enumerate(similarity_matrix[idx]))
    # Sort by similarity score, exclude the customer itself
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]
    similar_customers[customer] = [
        (customer_features['CustomerID'][s[0]], s[1]) for s in scores
    ]

# Step 6: Save Results in a DataFrame
lookalike_df = pd.DataFrame([
    (k, v[0][0], v[0][1], v[1][0], v[1][1], v[2][0], v[2][1]) 
    for k, v in similar_customers.items()
], columns=['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3'])

# Save the lookalike results to a CSV file
lookalike_df.to_csv('../outputs/Lookalike.csv', index=False)

# Step 7: Print Output for Validation
print("Lookalike Model Results:")
print(lookalike_df)


Lookalike Model Results:
   CustomerID  Lookalike1  Score1  Lookalike2  Score2  Lookalike3  Score3
0           1           2     0.0           3     0.0           4     0.0
1           2           5     1.0           1     0.0           3    -1.0
2           3           4     1.0           1     0.0           2    -1.0
3           4           4     1.0           1     0.0           2    -1.0
4           5           5     1.0           1     0.0           3    -1.0
