# In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Lookalike model: for each of the first NUM_TARGET_CUSTOMERS customers, find
# the NUM_LOOKALIKES other customers whose aggregated transaction behaviour is
# most similar (cosine similarity on standardized features), and write the
# result to 'Lookalike.csv'.

# Load data
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')  # not used below; kept loaded as before

# Parameters of the analysis (previously hard-coded inline)
FEATURE_COLUMNS = ['TotalValue', 'Quantity', 'Price']
NUM_TARGET_CUSTOMERS = 20
NUM_LOOKALIKES = 3

# Feature Engineering
customer_features = customers[['Region']]  # not used below; kept for later cells
transaction_summary = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price': 'mean'
}).reset_index()

# Merge customer data with transaction summary. The left join keeps customers
# that have no transactions; their feature columns come out as NaN.
merged_data = pd.merge(customers, transaction_summary, on='CustomerID', how='left')

# Replace infinities and NaN with 0, but only in the numeric feature columns.
# Cleaning the whole frame would also overwrite missing non-numeric fields
# (e.g. a missing Region) with the number 0.
merged_data[FEATURE_COLUMNS] = (
    merged_data[FEATURE_COLUMNS]
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

# Normalize numerical features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(merged_data[FEATURE_COLUMNS])

# Compute Cosine Similarity (square matrix, one row/column per customer)
cosine_sim = cosine_similarity(normalized_features)

# Pull the ID column out once instead of building a row Series per lookup.
customer_ids = merged_data['CustomerID'].tolist()

# Get the top lookalikes for the first NUM_TARGET_CUSTOMERS customers.
# Clamp to the number of rows so a small customer file does not raise IndexError.
lookalikes = {}
for i in range(min(NUM_TARGET_CUSTOMERS, len(customer_ids))):
    scores = cosine_sim[i]

    # Indices ordered by descending similarity; the stable sort keeps ties in
    # their original row order.
    ranked = (-scores).argsort(kind='stable')

    # Drop the customer's own row explicitly. It is not guaranteed to sort
    # first: another customer can tie with it, and an all-zero feature row has
    # a self-similarity of 0.
    ranked = ranked[ranked != i][:NUM_LOOKALIKES]

    # Save (lookalike CustomerID, similarity score) pairs keyed by CustomerID
    lookalikes[customer_ids[i]] = [(customer_ids[j], scores[j]) for j in ranked]

# Flatten the lookalike data into one [customer, lookalike, score] row per pair
flattened_lookalikes = [
    [customer_id, lookalike_id, score]
    for customer_id, lookalike_list in lookalikes.items()
    for lookalike_id, score in lookalike_list
]

# Create DataFrame for lookalikes
lookalike_df = pd.DataFrame(flattened_lookalikes, columns=['CustomerID', 'Lookalike CustomerID', 'Similarity Score'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)
