In [5]:
import pandas as pd # Import the pandas library and assign it the alias 'pd'

# Assuming your data is in CSV files, replace 'transactions.csv', 'products.csv', and 'customers.csv'
# with the actual file names
transactions = pd.read_csv('Transactions.csv')  # Load transactions data
products = pd.read_csv('Products.csv')        # Load products data
customers = pd.read_csv('Customers.csv')      # Load customers data


# Merge transactions with products and customers to create a unified dataset
transactions_products = pd.merge(transactions, products, on="ProductID", how="left")
full_data = pd.merge(transactions_products, customers, on="CustomerID", how="left")

# Display the first few rows of the merged dataset
full_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [6]:
# Feature engineering: Aggregate transaction and product data by customer
customer_features = full_data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_transaction_value=("TotalValue", "mean"),
    avg_quantity=("Quantity", "mean"),
    unique_categories=("Category", "nunique"),
    unique_products=("ProductID", "nunique"),
    first_signup_date=("SignupDate", "min"),  # First signup date (earliest)
    region=("Region", "first")  # Region is consistent per customer
).reset_index()

# Encode region as categorical feature for similarity computation
customer_features["region_encoded"] = customer_features["region"].astype("category").cat.codes

# Display the customer features
customer_features.head()


Unnamed: 0,CustomerID,total_spent,total_transactions,avg_transaction_value,avg_quantity,unique_categories,unique_products,first_signup_date,region,region_encoded
0,C0001,3354.52,5,670.904,2.4,3,5,2022-07-10,South America,3
1,C0002,1862.74,4,465.685,2.5,2,4,2022-02-13,Asia,0
2,C0003,2725.38,4,681.345,3.5,3,4,2024-03-07,South America,3
3,C0004,5354.88,8,669.36,2.875,3,8,2022-10-09,South America,3
4,C0005,2034.24,3,678.08,2.333333,2,3,2022-08-15,Asia,0


In [11]:
#Compute similarity scores based on these features and Identify the top 3 lookalike customers for each of the first 20 customers (C0001–C0020). ​
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Standardize numeric features for similarity computation
numeric_features = ["total_spent", "total_transactions", "avg_transaction_value",
                    "avg_quantity", "unique_categories", "unique_products"]
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features[numeric_features])

# Add region as a categorical feature (already encoded)
features_combined = pd.DataFrame(customer_features_scaled, columns=numeric_features)
features_combined["region_encoded"] = customer_features["region_encoded"].values

# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(features_combined)

# Function to get top N similar customers
def get_top_similar(customer_index, similarity_matrix, top_n=3):
    similarity_scores = list(enumerate(similarity_matrix[customer_index]))
    # Sort by similarity score (descending), skip self-comparison (index = customer_index)
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    return sorted_scores[1:top_n+1]

# Get top 3 similar customers for the first 20 customers (C0001–C0020)
customer_ids = customer_features["CustomerID"]
lookalike_results = {}

for i in range(20):  # First 20 customers
    similar_customers = get_top_similar(i, similarity_matrix)
    lookalike_results[customer_ids[i]] = [
        {"cust_id": customer_ids[j], "score": round(score, 4)}
        for j, score in similar_customers
    ]

# Display the lookalike results
lookalike_results


{'C0001': [{'cust_id': 'C0107', 'score': 0.996},
  {'cust_id': 'C0174', 'score': 0.9884},
  {'cust_id': 'C0011', 'score': 0.9852}],
 'C0002': [{'cust_id': 'C0060', 'score': 0.9326},
  {'cust_id': 'C0142', 'score': 0.9306},
  {'cust_id': 'C0014', 'score': 0.8984}],
 'C0003': [{'cust_id': 'C0129', 'score': 0.9413},
  {'cust_id': 'C0091', 'score': 0.9331},
  {'cust_id': 'C0026', 'score': 0.9141}],
 'C0004': [{'cust_id': 'C0099', 'score': 0.9616},
  {'cust_id': 'C0155', 'score': 0.9572},
  {'cust_id': 'C0012', 'score': 0.9571}],
 'C0005': [{'cust_id': 'C0186', 'score': 0.9774},
  {'cust_id': 'C0123', 'score': 0.9723},
  {'cust_id': 'C0159', 'score': 0.9437}],
 'C0006': [{'cust_id': 'C0026', 'score': 0.9812},
  {'cust_id': 'C0079', 'score': 0.9797},
  {'cust_id': 'C0064', 'score': 0.956}],
 'C0007': [{'cust_id': 'C0080', 'score': 0.9912},
  {'cust_id': 'C0140', 'score': 0.9734},
  {'cust_id': 'C0115', 'score': 0.9303}],
 'C0008': [{'cust_id': 'C0098', 'score': 0.9429},
  {'cust_id': 'C0147'

In [9]:
# Convert the lookalike results into the required format and save as Lookalike.csv
lookalike_map = {"cust_id": [], "lookalikes": []}

for cust_id, lookalikes in lookalike_results.items():
    lookalike_map["cust_id"].append(cust_id)
    lookalike_map["lookalikes"].append(lookalikes)

# Create a DataFrame for the lookalike map
lookalike_df = pd.DataFrame(lookalike_map)

# Save to CSV
lookalike_csv_path = "Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

lookalike_csv_path


'Lookalike.csv'