In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [4]:

# Step 1: Load the customer and transaction data
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Step 2: Feature Engineering - Combine customer profile with transaction history
# Merge customer data with transaction data (assuming 'CustomerID' is a common column)
data = pd.merge(customers, transactions, on="CustomerID")

# Step 3: Normalize numerical features (e.g., Age, Spending Amount)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data.select_dtypes(include=[np.number]))

# Step 4: Calculate Cosine Similarity
cosine_sim = cosine_similarity(scaled_data)

# Step 5: Get Top 3 Lookalikes for each of the first 20 customers
lookalikes = {}

# For the first 20 customers
for i in range(20):
    # Get the cosine similarity scores for customer i with all other customers
    similarity_scores = cosine_sim[i]
    
    # Sort the similarity scores in descending order and get the top 3 customers
    similar_customers = np.argsort(similarity_scores)[::-1][1:4]  # Exclude self (i=0)
    scores = similarity_scores[similar_customers]
    
    # Map the customer ID to a list of 3 most similar customers with scores
    lookalikes[data['CustomerID'][i]] = list(zip(data['CustomerID'][similar_customers], scores))

# Step 6: Create the Lookalike.csv file
lookalike_df = []
for cust_id, similar in lookalikes.items():
    for similar_cust, score in similar:
        lookalike_df.append([cust_id, similar_cust, score])

# Convert the list to a DataFrame
lookalike_df = pd.DataFrame(lookalike_df, columns=["CustomerID", "Lookalike_CustomerID", "Similarity_Score"])

# Save the DataFrame to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Output the first few results
print(lookalike_df.head())

  CustomerID Lookalike_CustomerID  Similarity_Score
0      C0001                C0001          1.000000
1      C0001                C0102          1.000000
2      C0001                C0165          1.000000
3      C0002                C0049          1.000000
4      C0002                C0043          0.999998
