In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory that holds the three input CSV files. It defaults to the location
# this notebook originally hard-coded; change this one constant to run the
# notebook against data stored elsewhere.
DATA_DIR = "/content"

# Raw inputs: one table each for customers, transactions and products.
customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")

**1. Converts the SignupDate column to a datetime format for easier calculations.**

**2. Calculates the number of days since the customer signed up by subtracting the signup date from the current date.**

**3. Encodes the Region column as numeric values using LabelEncoder, which assigns unique integers to each region e.g. Asia -> 0, Europe -> 1, etc.**

In [3]:
# Parse the signup dates so that date arithmetic is possible.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Whole days elapsed between each signup and the moment this cell runs.
time_since_signup = pd.Timestamp.now() - customers['SignupDate']
customers['SignupDays'] = time_since_signup.dt.days

# Replace each region label with an integer code.
region_encoder = LabelEncoder()
customers['Region'] = region_encoder.fit_transform(customers['Region'])

**Encodes the Category column in the products DataFrame into numeric values, enabling similarity computations based on product categories.**

In [4]:
# Swap the product category labels for integer codes.
category_encoder = LabelEncoder()
products['Category'] = category_encoder.fit_transform(products['Category'])

**Merges the transactions DataFrame with the products DataFrame on the ProductID column. Using how='left' ensures that every transaction record is preserved, even if its ProductID has no matching row in the products DataFrame.**

In [5]:
# Left join: every transaction row is kept, and the product columns are
# attached wherever the ProductID matches a row in `products`.
transactions = pd.merge(
    transactions,
    products,
    how='left',
    on='ProductID',
)

In [6]:
# Collapse the transaction log to one row of summary features per customer.
transaction_features = transactions.groupby('CustomerID').agg(
    # Sum of TotalValue over the customer's transactions.
    total_spent=('TotalValue', 'sum'),
    # Mean TotalValue per transaction.
    avg_spent=('TotalValue', 'mean'),
    # Number of distinct categories purchased. The built-in 'nunique'
    # aggregation gives the same counts as `lambda x: x.nunique()` without
    # invoking a Python function for every group.
    product_categories=('Category', 'nunique'),
    # Number of (non-null) transaction IDs.
    total_transactions=('TransactionID', 'count')
).reset_index()  # bring CustomerID back as a regular column for merging

**Merges the aggregated transaction features with the customer data on CustomerID, creating a comprehensive profile for each customer.**

In [7]:
# Attach the per-customer transaction aggregates to the customer table.
# The left join keeps customers that have no transactions; every missing
# value in the resulting frame is then replaced with 0.
customer_profiles = (
    customers
    .merge(transaction_features, how='left', on='CustomerID')
    .fillna(0)
)

**Drops the identifier and non-numeric columns (CustomerID, CustomerName and SignupDate) from the customer profiles, retaining only the numeric features used for the similarity calculations.**

In [8]:
# Identifier and date columns that must not enter the similarity computation.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
features = customer_profiles.drop(non_feature_columns, axis=1)

**Scales the remaining numeric features to have a mean of 0 and a standard deviation of 1. This ensures that features with different ranges (e.g., total spent vs. total transactions) carry equal weight.**

In [9]:
# Learn each column's mean and standard deviation, then standardize the
# same data with the fitted scaler.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

**Computes the cosine similarity between all pairs of customer profiles based on the scaled features. The result is a matrix where the entry (i, j) represents the similarity score between customer i and customer j.**

In [10]:
# Pairwise cosine similarity between all rows of the scaled feature matrix.
# Y=None tells scikit-learn to compare X against itself.
similarity_matrix = cosine_similarity(X=features_scaled, Y=None)

**Iterates through each of the first 20 customers:**

1. argsort()[-4:-1]: Finds the indices of the top 3 similar customers excluding the customer itself.

2. [::-1]: Reverses the order to show the most similar customer first.

3. lookalike_data: Stores the top 3 similar customers and their similarity scores for each customer excluding itself.

**Converts the lookalike_data dictionary into a DataFrame with two columns: CustomerID and Lookalikes.**

In [11]:
# Build the top-3 lookalike list for each of the first 20 customers and
# write the result to Lookalike.csv.
lookalike_data = {}
first_20_customers = customer_profiles.head(20)

# `idx` is the customer's row position, which is also its row in
# `similarity_matrix` (the matrix was built from the same row order).
for idx, customer_id in enumerate(first_20_customers['CustomerID']):
    # Rank every customer from most to least similar to this one.
    ranked_indices = similarity_matrix[idx].argsort()[::-1]
    # Exclude the customer itself by position instead of assuming its
    # self-similarity always sorts last: a tie with an identical profile (or
    # floating-point rounding) could otherwise leave the customer in its own
    # lookalike list and silently drop a genuine neighbour.
    similar_indices = [i for i in ranked_indices if i != idx][:3]
    similar_customers = [
        # Convert the score to a built-in float so the CSV holds plain
        # numbers rather than NumPy scalar reprs such as `np.float64(...)`.
        (customer_profiles['CustomerID'].iloc[i], float(similarity_matrix[idx][i]))
        for i in similar_indices
    ]
    lookalike_data[customer_id] = similar_customers

# One row per customer: the ID and its list of (lookalike ID, score) pairs.
lookalike_df = pd.DataFrame([
    {'CustomerID': cust, 'Lookalikes': lookalikes}
    for cust, lookalikes in lookalike_data.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv.")

Lookalike recommendations saved to Lookalike.csv.


In [16]:
# Reference lookalikes for four customers, specified by hand.
ground_truth = {
    'C0001': ['C0152', 'C0174', 'C0011'],
    'C0002': ['C0130', 'C0199', 'C0005'],
    'C0003': ['C0129', 'C0031', 'C0085'],
    'C0004': ['C0108', 'C0165', 'C0155'],
}

# Predicted lookalikes for the same customers, transcribed from Lookalike.csv.
predictions = {
    'C0001': ['C0152', 'C0174', 'C0011'],
    'C0002': ['C0134', 'C0199', 'C0005'],
    'C0003': ['C0129', 'C0031', 'C0085'],
    'C0004': ['C0108', 'C0155', 'C0155'],
}

# Every reference lookalike counts as one scoring opportunity.
total_predictions = sum(len(expected) for expected in ground_truth.values())

# A hit is a reference lookalike that also appears among that customer's
# predictions; order is ignored and duplicates collapse because sets are used.
# Customers missing from `predictions` contribute no hits.
correct_predictions = sum(
    len(set(expected).intersection(predictions.get(cust, [])))
    for cust, expected in ground_truth.items()
)

# Percentage of reference lookalikes recovered; 0 when there is nothing to score.
if total_predictions > 0:
    accuracy = correct_predictions / total_predictions * 100
else:
    accuracy = 0
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 83.33%
