In [1]:
# Import Libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw tables (paths are relative to the notebook's directory)
transactions = pd.read_csv("../data/Transactions.csv")
customers = pd.read_csv("../data/Customers.csv")
products = pd.read_csv("../data/Products.csv")

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Show the merged column names so the suffixed price columns can be verified
print("Merged Data Columns:", data.columns)

# Both Transactions and Products carry a 'Price' column, so the merge renames
# them with pandas' default suffixes: 'Price_x' (left side, Transactions) and
# 'Price_y' (right side, Products). The product-side price is used here.
data['TotalTransactionValue'] = data['Price_y'] * data['Quantity']

# One row per customer: total spend, total units bought, and the mean
# product price across that customer's transactions.
customer_features = (
    data.groupby('CustomerID')
    .agg(
        TotalTransactionValue=('TotalTransactionValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price_y=('Price_y', 'mean'),
    )
    .reset_index()
)

# Standardize the numeric feature columns (everything after the leading
# CustomerID column) so that no single feature dominates the similarity.
scaler = StandardScaler()
feature_values = customer_features.iloc[:, 1:]
normalized_features = scaler.fit_transform(feature_values)

# Pairwise cosine similarity between customers, labelled on both axes by
# CustomerID so that similarity_df[a][b] is the score between a and b.
similarity_matrix = cosine_similarity(normalized_features)
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Get top-3 similar customers for the first 20 customers.
#
# The customer's own entry is removed by label before ranking. Skipping
# "position 0" after sorting is not safe: the self-match is not guaranteed to
# sort first when another customer ties at the top score, when floating-point
# rounding nudges another score above it, or when the customer's scaled
# feature vector is all zeros (its cosine self-similarity is then 0). In those
# cases the positional slice would list the customer as their own lookalike
# and silently drop a genuine match.
lookalikes = {}
for customer in customer_features['CustomerID'].iloc[:20]:
    # Scores against every *other* customer, indexed by their CustomerID
    candidate_scores = similarity_df[customer].drop(labels=customer)
    similar_customers = candidate_scores.sort_values(ascending=False).iloc[:3]
    # Store as [(similar_customer_id, similarity_score), ...], best match first
    lookalikes[customer] = list(zip(similar_customers.index, similar_customers.values))

# Flatten {customer: [(similar_id, score), ...]} into one row per
# (customer, similar customer) pair and write the result to CSV.
lookalike_rows = [
    (source_id, match_id, score)
    for source_id, matches in lookalikes.items()
    for match_id, score in matches
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
)
lookalike_df.to_csv("../outputs/Rudrakumar_Patel_Lookalike.csv", index=False)


Merged Data Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
