In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the customer, product and transaction tables from the shared Drive folder
customers = pd.read_csv("/content/drive/My Drive/Zeotap/Customers.csv")
products = pd.read_csv("/content/drive/My Drive/Zeotap/Products.csv")
transactions = pd.read_csv("/content/drive/My Drive/Zeotap/Transactions.csv")

# Parse the date columns; values that do not match the given format become NaT
customers['SignupDate'] = pd.to_datetime(
    customers['SignupDate'], format='%d-%m-%Y', errors='coerce'
)
transactions['TransactionDate'] = pd.to_datetime(
    transactions['TransactionDate'], format='%d-%m-%Y %H:%M', errors='coerce'
)

# Left-join customer attributes, then product attributes, onto every transaction row
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID', how='left'),
    products,
    on='ProductID',
    how='left',
)

# Feature engineering: collapse the transaction rows into one profile row per customer
# 1. Spend, frequency and basket statistics plus the customer's region and top category
customer_profile = data.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='nunique'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    avg_quantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
    region=pd.NamedAgg(column='Region', aggfunc='first'),
    # Most frequently purchased category (first one when several tie)
    category_preference=pd.NamedAgg(
        column='Category', aggfunc=lambda categories: categories.mode()[0]
    ),
).reset_index()

# 2. One-hot encode the categorical columns (the first level of each is dropped)
customer_profile = pd.get_dummies(
    customer_profile,
    columns=['region', 'category_preference'],
    drop_first=True,
)

# 3. Standardise the numeric columns so no single scale dominates the similarity
numeric_columns = ['total_spent', 'transaction_count', 'avg_transaction_value', 'avg_quantity']
scaler = StandardScaler()
customer_profile[numeric_columns] = scaler.fit_transform(customer_profile[numeric_columns])

# Pairwise cosine similarity between customers, using every column except the ID
profile_features = customer_profile.drop(columns='CustomerID')
similarity_matrix = cosine_similarity(profile_features)

# Function to recommend similar customers
def recommend_similar_customers(customer_id, top_n=3, *, profiles=None, similarities=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    The queried customer is always excluded from the result, even when
    another customer ties with it on similarity or when its self-similarity
    is not the maximum (e.g. an all-zero feature vector).

    Args:
        customer_id: Value to look up in the ``CustomerID`` column.
        top_n: Maximum number of lookalikes to return. Fewer rows are
            returned when fewer other customers exist.
        profiles: DataFrame with a ``CustomerID`` column whose row order
            matches ``similarities``. Defaults to the module-level
            ``customer_profile``.
        similarities: Square pairwise similarity matrix aligned with
            ``profiles``. Defaults to the module-level ``similarity_matrix``.

    Returns:
        DataFrame with columns ``CustomerID`` and ``similarity_score``,
        ordered from most to least similar. Rows keep their index labels
        from ``profiles``.

    Raises:
        IndexError: If ``customer_id`` is not present in ``profiles``.
        ValueError: If ``top_n`` is negative.
    """
    # Local import: numpy is not among the imports at the top of this file.
    import numpy as np

    if profiles is None:
        profiles = customer_profile
    if similarities is None:
        similarities = similarity_matrix

    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Locate the customer by *position* so the lookup stays aligned with the
    # similarity matrix regardless of the frame's index labels.
    matches = np.flatnonzero((profiles['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        # IndexError is kept for compatibility with the exception the
        # previous ``.index[0]`` lookup raised for unknown IDs.
        raise IndexError(f"CustomerID {customer_id!r} not found in customer profiles")
    target_pos = matches[0]

    scores = np.asarray(similarities[target_pos], dtype=float)

    # Rank every customer from most to least similar (stable, so ties keep
    # row order), then drop the queried customer explicitly instead of
    # assuming it occupies the top slot.
    ranked = np.argsort(-scores, kind='stable')
    neighbours = ranked[ranked != target_pos][:top_n]

    similar_customers = profiles.iloc[neighbours][['CustomerID']].copy()
    similar_customers['similarity_score'] = scores[neighbours]
    return similar_customers


# Demo: look up the closest lookalikes for one customer
customer_id = 'C0199'  # Swap in any CustomerID present in the data
recommendations = recommend_similar_customers(customer_id=customer_id, top_n=3)

# Print the ranked lookalikes together with their similarity scores
print("Top 3 Similar Customers for CustomerID:", customer_id)
print(recommendations)

# Write the result table to disk without the DataFrame index column
recommendations.to_csv(path_or_buf='S_NITHISHKUMAR_Lookalike.csv', index=False)

# Let the user know where the output went
print("Recommendations saved to S_NITHISHKUMAR_Lookalike.csv")

Top 3 Similar Customers for CustomerID: C0199
    CustomerID  similarity_score
68       C0069          0.893838
118      C0119          0.825640
120      C0121          0.811931
Recommendations saved to S_NITHISHKUMAR_Lookalike.csv
