In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables from CSV files in the working directory.
# The files are read in the order listed and bound to the matching names.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)



# Preprocess Customer Data

In [3]:

# Parse SignupDate once. errors='coerce' turns unparseable values into NaT
# instead of raising; dayfirst=True reads ambiguous dates as day-first.
signup_dates = pd.to_datetime(customers['SignupDate'], errors='coerce', dayfirst=True)

# Replace the raw column with the parsed one and attach its calendar parts.
customers = customers.assign(
    SignupDate=signup_dates,
    SignupMonth=signup_dates.dt.month,
    SignupDay=signup_dates.dt.day,
    SignupYear=signup_dates.dt.year,
)



In [4]:
# One-hot encode Region; drop_first=True omits the first level so the
# remaining indicator columns are not perfectly collinear.
customers = pd.get_dummies(customers, columns=['Region'], drop_first=True)

# Left-join transactions to the customer key, then to each product's Category.
# NOTE(review): the first join selects only CustomerID from `customers`, so it
# contributes no new columns to `transactions`.
transactions = transactions.merge(customers[['CustomerID']], on='CustomerID', how='left')
transactions = transactions.merge(
    products[['ProductID', 'Category']], on='ProductID', how='left'
)

# Total quantity and total value purchased per (customer, product category).
customer_transactions = (
    transactions
    .groupby(['CustomerID', 'Category'])
    .agg(
        total_quantity=('Quantity', 'sum'),
        total_value=('TotalValue', 'sum'),
    )
    .reset_index()
)



# Pivot the aggregated data to create a customer profile matrix


In [6]:
# One row per customer, one column per (metric, Category) pair.
# Customer/category combinations with no purchases are filled with 0.
customer_profiles = pd.pivot_table(
    customer_transactions,
    index='CustomerID',
    columns='Category',
    values=['total_quantity', 'total_value'],
    aggfunc='sum',
    fill_value=0,
)


# Flatten multi-level columns created by pivot_table

In [8]:
# Collapse the two-level (metric, Category) column labels produced by
# pivot_table into single "<metric>_<Category>" strings.
# The MultiIndex guard makes this cell safe to re-run: once the columns are
# already flat strings, unconditionally indexing col[0]/col[1] would pick the
# first two characters of each name and silently corrupt every label.
if isinstance(customer_profiles.columns, pd.MultiIndex):
    customer_profiles.columns = [
        f'{metric}_{category}' for metric, category in customer_profiles.columns
    ]


# Handle missing values if any

In [10]:
# Replace any remaining NaN with 0. The pivot that builds customer_profiles
# already passes fill_value=0, so this is a defensive safeguard.
customer_profiles = customer_profiles.fillna(value=0)

# Compute cosine similarity between customers

In [12]:
# Pairwise cosine similarity between every pair of customer profile rows.
# Y=None compares the rows of X against themselves; the lookup code below
# relies on row/column i corresponding to customer_profiles.index[i].
similarity_matrix = cosine_similarity(X=customer_profiles, Y=None)

# Map each customer ID to its top 3 lookalikes and their similarity scores


In [14]:
# Build {customer_id: [(lookalike_id, score), ...]} for the first 20 rows of
# customer_profiles.
# NOTE(review): these rows are C0001..C0020 only if the index is sorted and
# every one of those customers has at least one transaction -- confirm.
lookalike_map = {}
for i, cust_id in enumerate(customer_profiles.index[:20]):
    similarity_scores = similarity_matrix[i]

    # Rank all customers from most to least similar.
    ranked_indices = np.argsort(similarity_scores)[::-1]

    # Remove the customer's own position explicitly. Slicing off the first
    # ranked entry is not safe: another customer with an identical (or
    # proportional) profile also scores 1.0 and may sort ahead of self, in
    # which case self would be reported as its own lookalike.
    top_indices = ranked_indices[ranked_indices != i][:3]

    top_customers = customer_profiles.index[top_indices]
    top_scores = similarity_scores[top_indices]

    # zip (rather than a fixed range(3)) tolerates fewer than 3 neighbours.
    lookalike_map[cust_id] = [
        (other_id, round(score, 4))
        for other_id, score in zip(top_customers, top_scores)
    ]


# Format the lookalike map into the desired format for CSV

In [16]:


# One [customer_id, "[id, score], [id, score], ..."] row per customer,
# ready to be turned into a two-column table.
lookalike_list = [
    [cust_id, ', '.join(f"[{cust}, {score}]" for cust, score in lookalikes)]
    for cust_id, lookalikes in lookalike_map.items()
]



# Create a DataFrame to save as CSV

In [18]:

# Assemble the two-column result table and write it out without the row index.
lookalike_df = pd.DataFrame(data=lookalike_list, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv(path_or_buf='SAGAR_CHAUDHARY_Lookalike.csv', index=False)

# Confirmation message for the notebook output.
print("Lookalike.csv has been created successfully!")


Lookalike.csv has been created successfully!
