In [2]:
import os
import warnings
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [5]:
# Directory holding the three input CSV files. It defaults to the location the
# notebook was originally run from; set the LOOKALIKE_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("LOOKALIKE_DATA_DIR", r"C:\Users\VIVINA MUTH\Desktop"))

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
trending_products = pd.read_csv(DATA_DIR / "trending_products.csv")

In [13]:
# --- Preprocessing ---
# Collapse trending_products to one row of aggregate features per ProductID.
# NOTE(review): the output names read like spend / quantity / transaction
# counts, but the inputs are ReviewCount, Brand and ImageURL -- confirm these
# are the intended source columns. Summing 'Brand' only yields a number if
# that column is numeric.
# NOTE(review): rows are grouped by ProductID, while the mapping cell further
# down looks entries up by CustomerID -- verify the two key spaces overlap.
aggregations = {
    'TotalSpend': ('ReviewCount', 'sum'),
    'TotalQuantity': ('Brand', 'sum'),
    'NumTransactions': ('ImageURL', 'count'),
}
transaction_features = (
    trending_products
    .groupby('ProductID')
    .agg(**aggregations)
    .reset_index()
)

In [18]:
# Customer identifiers, in the same row order as `customers`
customer_ids = customers['CustomerID'].values

# Nested lookup of the aggregated features:
# {ProductID: {'TotalSpend': ..., 'TotalQuantity': ..., 'NumTransactions': ...}}
features_by_product = transaction_features.set_index('ProductID')
transaction_dict = features_by_product.to_dict(orient='index')

In [19]:
# Start every customer at zero for each aggregated feature; rows that find no
# match in the mapping step keep this value.
for column in ('TotalSpend', 'TotalQuantity', 'NumTransactions'):
    customers[column] = 0

In [20]:
# Copy the aggregated features onto the customer rows whose CustomerID is a key
# of transaction_dict. This is done column-wise instead of with per-row
# `.loc[idx, ...]` writes, so it no longer relies on the DataFrame index being
# the positions 0..n-1.
FEATURE_COLUMNS = ('TotalSpend', 'TotalQuantity', 'NumTransactions')

# True for rows that have an entry in the lookup; all other rows keep the value
# they already hold.
has_features = customers['CustomerID'].isin(list(transaction_dict))

if has_features.any():
    for feature in FEATURE_COLUMNS:
        per_key = {key: stats[feature] for key, stats in transaction_dict.items()}
        looked_up = customers['CustomerID'].map(per_key)
        customers[feature] = looked_up.where(has_features, customers[feature])
else:
    # transaction_dict is keyed by ProductID, so a total miss most likely means
    # the two ID spaces do not overlap. Surface it instead of silently leaving
    # every feature unchanged.
    warnings.warn(
        "No CustomerID matched a key of transaction_dict; "
        "TotalSpend, TotalQuantity and NumTransactions are left unchanged for every customer."
    )


In [22]:
# LabelEncoder turns each distinct label into an integer code
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Region column, then store the integer codes alongside it.
# NOTE(review): integer codes put an arbitrary order on what looks like a
# nominal category -- confirm that is acceptable for the similarity step.
encoder = LabelEncoder()
encoder.fit(customers['Region'])
customers['RegionEncoded'] = encoder.transform(customers['Region'])

In [23]:
# Columns that make up each customer's feature vector
features = [
    'TotalSpend',
    'TotalQuantity',
    'NumTransactions',
    'RegionEncoded',
]
# Feature matrix: all customer rows, restricted to the columns above
X = customers.loc[:, features]

In [24]:
# Standardize each feature column: learn the scaling parameters from X, then
# apply them to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [26]:
# --- Similarity Calculation ---
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i, j] compares customer row i with customer row j.
similarity_matrix = cosine_similarity(X=X_scaled)

In [27]:
# Label both axes of the similarity matrix with CustomerID so scores can be
# looked up by customer rather than by position.
customer_index = customers['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_index,
    columns=customer_index,
)

In [28]:
# --- Find Top 3 Lookalikes ---
# How many customers to score (taken from the top of `customers`) and how many
# neighbours to keep for each of them.
NUM_TARGET_CUSTOMERS = 20  # first 20 customers (C0001 - C0020)
TOP_K = 3

lookalikes = {}

for customer_id in customers['CustomerID'][:NUM_TARGET_CUSTOMERS]:
    # Drop the customer's own entry by label before ranking. Slicing [1:4] off
    # a sorted column assumes "self" sorts first, which fails when other
    # customers tie with it or when an all-zero feature row makes its
    # self-similarity 0.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id)
    top_matches = candidate_scores.nlargest(TOP_K)
    # Plain floats keep the stringified list independent of the NumPy scalar repr
    lookalikes[customer_id] = [
        (sim_id, float(score)) for sim_id, score in top_matches.items()
    ]

# One row per scored customer; the list of (CustomerID, score) pairs is stored
# as its string form.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes),
    'Lookalikes': [str(matches) for matches in lookalikes.values()],
})

In [29]:
# Write the lookalike table to the working directory, omitting the row index
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)
print("Lookalike.csv has been created!")

Lookalike.csv has been created!
