In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load Data
# Single place to change when the CSV files live somewhere else; every
# input path below is derived from it.
DATA_DIR = 'C:/Users/LOKESH/OneDrive/Desktop/DATA SCIENCE ASSESMENT'

# Raw input tables, read as-is from the data directory.
Cust_df = pd.read_csv(f'{DATA_DIR}/Customers.csv')
Prod_df = pd.read_csv(f'{DATA_DIR}/Products.csv')
Trans_df = pd.read_csv(f'{DATA_DIR}/Transactions.csv')


In [13]:
# Data preprocessing: attach product attributes, then customer attributes,
# to every transaction row (both joins use the default inner merge).
merged_df = (
    Trans_df
    .merge(Prod_df, on='ProductID')
    .merge(Cust_df, on='CustomerID')
)


In [14]:
# Reference year used to turn a signup date into customer tenure.
CURRENT_YEAR = 2025  # Assuming the current year is 2025

# 1. Extract customer profile features
Cust_df['SignupYear'] = pd.to_datetime(Cust_df['SignupDate']).dt.year
Cust_df['YearsSinceSignup'] = CURRENT_YEAR - Cust_df['SignupYear']

# 2. Transaction history features: total spending per category per customer.
#    Result has one row per CustomerID and one column per Category;
#    customer/category pairs with no transactions are filled with 0.
transaction_features = (
    merged_df
    .groupby(['CustomerID', 'Category'])['TotalValue']
    .sum()
    .unstack(fill_value=0)
)

# Normalize features (each category column is standardized independently)
scaler = StandardScaler()
transaction_features_scaled = pd.DataFrame(
    scaler.fit_transform(transaction_features),
    columns=transaction_features.columns,
    index=transaction_features.index,
)

# Merge customer profile and transaction features.
# NOTE: this is the default inner merge, so customers that do not appear in
# transaction_features are dropped here and will not have a row/column in
# similarity_df.
customer_profiles = pd.merge(
    Cust_df[['CustomerID', 'Region', 'YearsSinceSignup']],
    transaction_features_scaled,
    on='CustomerID',
)

# One-hot encode 'Region'. dtype=float keeps the feature matrix homogeneous:
# pandas >= 2.0 would otherwise emit bool columns next to the float features.
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], dtype=float)

# Create a feature matrix by combining customer profile and transaction history
features = customer_profiles.drop(columns=['CustomerID'])

# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(features)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_profiles['CustomerID'],
    columns=customer_profiles['CustomerID'],
)


In [15]:
# Function to get top N similar customers
def get_top_similar_customers(cust_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``cust_id``.

    Parameters
    ----------
    cust_id : hashable
        Column label of the query customer in the similarity matrix.
    n : int, default 3
        Number of lookalikes to return. Values below 1 give an empty Series.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer on both axes. When omitted,
        the module-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by the other customers' labels, sorted in
        descending order. The query customer itself is never included.

    Raises
    ------
    KeyError
        If ``cust_id`` is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = similarity_df
    if cust_id not in similarity.columns:
        raise KeyError(f"Customer {cust_id!r} is not present in the similarity matrix")

    # Remove the query customer by label rather than by position: when another
    # customer ties with the self-similarity score, the query customer is not
    # guaranteed to be the first entry after sorting.
    scores = similarity[cust_id].drop(labels=cust_id, errors='ignore')

    # Stable sort so tied scores keep their original (matrix) order.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    return ranked.iloc[:max(n, 0)]

In [16]:
# Top-3 lookalikes for the first twenty customers (C0001 .. C0020),
# keyed by the query customer's ID.
lookalike_recommendations = {
    cust_id: get_top_similar_customers(cust_id)
    for cust_id in (f'C{i:04d}' for i in range(1, 21))
}


In [17]:
# Flatten the per-customer results into export rows:
# one [cust_id, lookalike_id, score] entry per recommended lookalike.
lookalike_list = [
    [cust_id, similar_cust_id, score]
    for cust_id, similar_customers in lookalike_recommendations.items()
    for similar_cust_id, score in similar_customers.items()
]

export_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_list, columns=export_columns)

In [18]:
# Write the recommendations to disk (without the DataFrame index) and confirm.
output_path = 'C:/Users/LOKESH/OneDrive/Desktop/DATA SCIENCE ASSESMENT/Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

print("Lookalike model recommendations saved to 'Lookalike.csv'.")

Lookalike model recommendations saved to 'Lookalike.csv'.
