In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load and Merge Datasets


# Read the three source tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [4]:
# Merge datasets for analysis
# Step 1: attach the customer columns to every transaction (join key: CustomerID).
transactions_with_customers = transactions.merge(customers, on="CustomerID")
# Step 2: attach the product columns to that result (join key: ProductID).
merged_data = transactions_with_customers.merge(products, on="ProductID")

In [5]:
# Feature Engineering


# Collapse the transaction-level rows into one feature row per customer.
# NOTE: the output columns keep the source column names, so after this step
# 'ProductID' and 'Category' hold distinct counts, not identifiers.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        Quantity=('Quantity', 'sum'),          # total quantity purchased
        TotalValue=('TotalValue', 'sum'),      # total revenue generated
        ProductID=('ProductID', 'nunique'),    # number of distinct products purchased
        Category=('Category', 'nunique'),      # number of distinct product categories
    )
    .reset_index()
)

In [6]:
# Add profile features from the customers dataset
# Only the Region attribute is taken from the customer table; the left join
# keeps every aggregated customer row.
customer_profiles = customers.loc[:, ['CustomerID', 'Region']]
customer_features = pd.merge(
    customer_features,
    customer_profiles,
    on="CustomerID",
    how="left",
)

In [7]:
# One-hot encode the 'Region' feature
# Build the indicator columns ('Region_<value>') separately, then swap them in
# for the original 'Region' column at the end of the frame.
region_dummies = pd.get_dummies(customer_features['Region'], prefix='Region')
customer_features = pd.concat(
    [customer_features.drop(columns=['Region']), region_dummies],
    axis=1,
)


In [8]:
# Normalize the Data


# Rescale the aggregated purchase metrics with a min-max scaler so they share
# a common range before similarity is computed.
scaler = MinMaxScaler()
numerical_columns = ['Quantity', 'TotalValue', 'ProductID', 'Category']
# Learn the per-column min/max first, then write the scaled values back.
scaler.fit(customer_features[numerical_columns])
customer_features[numerical_columns] = scaler.transform(customer_features[numerical_columns])

In [9]:
# Calculate Similarity Scores


# Compute the cosine similarity between customers
# Keep the IDs aside so that row/column positions of the similarity matrix can
# be mapped back to customers.
customer_ids = customer_features['CustomerID']
# Build an explicitly float matrix. The one-hot Region columns may be boolean
# (the default dummy dtype in recent pandas), and mixing them with the float
# metrics through `.values` would yield an object-dtype array. Casting here
# also fails fast, with a clear error, if a non-numeric column slipped in.
feature_matrix = customer_features.drop(columns=['CustomerID']).to_numpy(dtype=float)
# similarity_matrix[i, j] is the cosine similarity between customers i and j.
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
# Generate Lookalike Recommendations


# Function to get top 3 similar customers
def get_top_3_similar(customers_matrix, customer_index, ids=None, top_n=3):
    """Return the customers most similar to the one at ``customer_index``.

    Parameters
    ----------
    customers_matrix : 2-D array-like
        Similarity matrix; row ``i`` holds the scores of customer ``i``
        against every customer (same ordering on both axes).
    customer_index : int
        Row position of the customer to find lookalikes for.
    ids : sequence, optional
        Customer identifiers in matrix order. When omitted, the notebook-level
        ``customer_ids`` is used.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, similarity_score) tuples
        Ordered from most to least similar, at most ``top_n`` long, and never
        containing the queried customer itself.
    """
    if ids is None:
        ids = customer_ids
    # Positional lookup regardless of the index carried by a pandas Series.
    ids = np.asarray(ids, dtype=object)

    similarity_scores = np.asarray(customers_matrix[customer_index])
    if customer_index < 0:
        # Normalise negative positions so the self-exclusion below still matches.
        customer_index += similarity_scores.size

    # Stable sort keeps tied scores in their original order, making the result
    # deterministic.
    ranked = np.argsort(-similarity_scores, kind="stable")
    # Drop the customer explicitly instead of assuming it sorts first: another
    # customer with an identical feature vector ties with the self-score and
    # could otherwise push the customer itself into the result.
    similar_indices = ranked[ranked != customer_index][:top_n]
    return [(ids[i], similarity_scores[i]) for i in similar_indices]

In [11]:
# Create the Lookalike.csv for CustomerIDs C0001 to C0020
# Select the target customers by ID rather than by row position. The feature
# table is built from merged transactions, so a customer without transactions
# has no row in it; the first 20 rows are therefore not guaranteed to be
# C0001..C0020, and there may be fewer than 20 rows in total.
target_customer_ids = [f"C{number:04d}" for number in range(1, 21)]
position_by_customer_id = {cid: pos for pos, cid in enumerate(customer_ids)}

lookalike_results = {}
for customer_id in target_customer_ids:
    position = position_by_customer_id.get(customer_id)
    if position is None:
        # No feature row means no similarity scores for this customer.
        print(f"Skipping {customer_id}: no feature row available.")
        continue
    lookalike_results[customer_id] = get_top_3_similar(similarity_matrix, position)

In [12]:
# Convert results to DataFrame
# Flatten {customer: [(similar_id, score), ...]} into one row per
# (customer, lookalike) pair, preserving the dictionary's insertion order.
lookalike_rows = [
    (source_id, similar_id, score)
    for source_id, matches in lookalike_results.items()
    for similar_id, score in matches
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
)

In [13]:
# Save results to CSV
# Persist the recommendation table; the positional row index is not written.
lookalike_df.to_csv(
    path_or_buf="FirstName_LastName_Lookalike.csv",
    index=False,
)

In [14]:
# Output Results


# Print the complete table. Each customer contributes up to three rows, so
# limiting the display to 20 rows (`head(20)`) would stop partway through the
# customer list and contradict the heading below.
print("Lookalike Recommendations for the First 20 Customers:")
print(lookalike_df.to_string())
print("\nLookalike recommendations saved as 'FirstName_LastName_Lookalike.csv'.")

Lookalike Recommendations for the First 20 Customers:
   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0107         0.999837
1       C0001             C0174         0.999508
2       C0001             C0011         0.999469
3       C0002             C0142         0.999222
4       C0002             C0159         0.994031
5       C0002             C0186         0.992423
6       C0003             C0174         0.995462
7       C0003             C0001         0.994702
8       C0003             C0031         0.994469
9       C0004             C0099         0.996398
10      C0004             C0087         0.995150
11      C0004             C0013         0.993646
12      C0005             C0186         0.999737
13      C0005             C0159         0.999593
14      C0005             C0007         0.998621
15      C0006             C0158         0.997079
16      C0006             C0187         0.996405
17      C0006             C0011         0.996040
18      C0007  