In [1]:
# FirstName_LastName_Lookalike.ipynb

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Load datasets from data/ directory
customers_df = pd.read_csv('../data/Customers.csv')
transactions_df = pd.read_csv('../data/Transactions.csv')

# Merge customers and transactions data
customer_transactions = pd.merge(transactions_df, customers_df, on='CustomerID')

# Aggregate transaction data for each customer
customer_summary = customer_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'TransactionID': 'count'  # number of transactions
}).reset_index()

# Merge with customer data
customer_profile = pd.merge(customers_df, customer_summary, on='CustomerID')

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(customer_profile[['TotalValue', 'Quantity']])

# Implement KNN for lookalike model
knn = NearestNeighbors(n_neighbors=3, metric='cosine')
knn.fit(X)

# Get lookalike for first 20 customers
distances, indices = knn.kneighbors(X[:20])

# Create Lookalike.csv and save it in the reports/ directory
lookalike_dict = {}
for i, customer_id in enumerate(customer_profile['CustomerID'][:20]):
    lookalike_dict[customer_id] = [(customer_profile['CustomerID'][j], 1 - distances[i][k]) for k, j in enumerate(indices[i])]

lookalike_df = pd.DataFrame(lookalike_dict.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('../reports/FirstName_LastName_Lookalike.csv', index=False)

print(lookalike_df.head())


  CustomerID                                         Lookalikes
0      C0001  [(C0001, 1.0), (C0085, 0.9999990504724361), (C...
1      C0002  [(C0002, 1.0), (C0157, 0.9999942410168485), (C...
2      C0003  [(C0003, 0.9999999999999999), (C0111, 0.994008...
3      C0004  [(C0004, 1.0), (C0162, 0.9999999965087093), (C...
4      C0005  [(C0005, 0.9999999999999999), (C0080, 0.999982...
