# In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the customer table and the transaction table from CSV files that are
# resolved relative to the current working directory (customers first).
customers, transactions = (
    pd.read_csv(csv_name) for csv_name in ("Customers.csv", "Transactions.csv")
)

# Feature Engineering
# Start from a copy of `customers` so the raw table is left untouched, then
# add an account-age column and per-customer transaction aggregates.
customer_features = customers.copy()

# Whole days elapsed between SignupDate and the moment this script runs, so
# the value (and everything derived from it) changes from run to run.
customer_features['SignupDuration'] = (
    pd.Timestamp.now() - pd.to_datetime(customer_features['SignupDate'])
).dt.days

# Per-customer aggregates: count of TransactionID values and sum of TotalValue.
# The left join keeps customers that have no transactions; their aggregate
# columns come back as NaN.
customer_features = customer_features.merge(
    transactions.groupby('CustomerID').agg(
        TotalTransactions=('TransactionID', 'count'),
        TotalSpent=('TotalValue', 'sum'),
    ),
    on='CustomerID',
    how='left',
).fillna(
    # Zero-fill ONLY the transaction aggregates. A blanket fillna(0) would also
    # overwrite missing values in unrelated columns (e.g. turn a missing Region
    # into the integer 0), silently corrupting the categorical features.
    {'TotalTransactions': 0, 'TotalSpent': 0}
)

# Encode and Scale
# Region is one-hot encoded and the numeric features are standardized; the
# encoded block comes first in the output, followed by the scaled block.
categorical_columns = ['Region']
numeric_columns = ['SignupDuration', 'TotalTransactions', 'TotalSpent']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ]
)

# Row i of X corresponds to row i of customer_features.
X = preprocessor.fit_transform(customer_features)

# Fit NearestNeighbors model
# Cosine distance over the preprocessed features. Four neighbours are requested
# per query so that three remain once the query customer itself is set aside.
model = NearestNeighbors(metric='cosine', n_neighbors=4).fit(X)

# Generate lookalikes for first 20 customers
# Result: {customer_id: [(lookalike_id, similarity), ...]} with up to three
# entries per customer, ordered from most to least similar.
lookalikes = {}

# Rows of X are positionally aligned with rows of customer_features, so the
# loop position doubles as the row position; no per-customer lookup is needed.
customer_ids = customer_features['CustomerID'].to_numpy()
query_ids = customer_ids[:20]

# One batched query for all requested customers instead of one call each.
distances, indices = model.kneighbors(X[:len(query_ids)])

for row_pos, cust_id in enumerate(query_ids):
    neighbor_pos = indices[row_pos]
    # Drop the query customer by position rather than assuming it is ranked
    # first: with tied distances (duplicate or collinear feature vectors) the
    # customer itself can appear anywhere among the returned neighbours.
    is_other = neighbor_pos != row_pos
    top_pos = neighbor_pos[is_other][:3]
    # Cosine similarity = 1 - cosine distance. tolist() yields plain Python
    # floats so the CSV shows 0.987 rather than a NumPy scalar repr.
    scores = (1 - distances[row_pos][is_other][:3]).round(3).tolist()
    lookalikes[cust_id] = list(zip(customer_ids[top_pos].tolist(), scores))

# Save to CSV
# One row per queried customer; the Lookalikes cell holds that customer's list
# of (id, score) pairs and is written out using its string form.
queried_ids = list(lookalikes.keys())
matches_per_customer = list(lookalikes.values())

lookalike_df = pd.DataFrame(
    {
        'CustomerID': queried_ids,
        'Lookalikes': matches_per_customer,
    }
)
lookalike_df.to_csv('Lookalike.csv', index=False)
