In [8]:
import pandas as pd

# Source CSVs: one path per input table.
customers_file, products_file, transactions_file = (
    "/content/Customers.csv",
    "/content/Products.csv",
    "/content/Transactions.csv",
)

# Read each CSV into its own DataFrame, in the same order as the paths above.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (customers_file, products_file, transactions_file)
)

# Preview the first five rows of every table; the cell renders them as one tuple.
customers_df.head(), products_df.head(), transactions_df.head()


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [9]:
# Enrich every transaction with its product attributes (keyed on ProductID)
# and then with its customer attributes (keyed on CustomerID).
# Left joins keep each transaction row even when a lookup key has no match.
# Any non-key column name shared by both sides (Price, per the rendered
# output) comes back twice with pandas' default _x / _y suffixes.
transactions_merged = (
    transactions_df
    .merge(products_df, how="left", on="ProductID")
    .merge(customers_df, how="left", on="CustomerID")
)

# Preview the combined table.
transactions_merged.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [10]:
from datetime import datetime



In [11]:
# Parse the TransactionDate column into pandas datetimes so that date
# comparisons and subtraction work in the cells that follow.
transactions_merged = transactions_merged.assign(
    TransactionDate=pd.to_datetime(transactions_merged["TransactionDate"])
)

In [13]:
# Monetary feature: sum of TotalValue over all of a customer's transactions.
# Result is a Series indexed by CustomerID and named "TotalSpending".
customer_spending = (
    transactions_merged
    .groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpending")
)


In [14]:
# Frequency feature: number of non-null TransactionID values per customer.
# Result is a Series indexed by CustomerID and named "PurchaseFrequency".
customer_frequency = (
    transactions_merged
    .groupby("CustomerID")["TransactionID"]
    .count()
    .rename("PurchaseFrequency")
)

In [15]:
# Recency feature: whole days between each customer's latest purchase and the
# latest purchase in the entire dataset (so the most recent buyer scores 0).
# The reference point is the data's own maximum date, not today's date, which
# keeps the feature reproducible on re-runs.
most_recent_date = transactions_merged["TransactionDate"].max()

# Timestamp minus a datetime Series yields a timedelta Series; `.dt.days`
# extracts the day count in one vectorized step instead of a per-row lambda.
# Requires TransactionDate to already be datetime64 (converted in an earlier cell).
customer_recency = (
    most_recent_date
    - transactions_merged.groupby("CustomerID")["TransactionDate"].max()
).dt.days.rename("RecencyDays")

In [16]:
# Category preference: the category each customer bought from most often.
# Step 1 - one row per (CustomerID, Category) pair with its transaction count.
customer_category = (
    transactions_merged
    .groupby(["CustomerID", "Category"])
    .size()
    .reset_index(name="Count")
)

# Step 2 - for every customer keep the row holding the highest Count.
# idxmax returns the first matching row on ties, so the tie-break follows the
# row order of customer_category.
customer_top_category = (
    customer_category
    .loc[
        customer_category.groupby("CustomerID")["Count"].idxmax(),
        ["CustomerID", "Category"],
    ]
    .rename(columns={"Category": "TopCategory"})
)

In [17]:
# Assemble the per-customer profile: spending, frequency, recency and top
# category joined on CustomerID. merge() defaults to an inner join, so only
# customers present in every input survive.
customer_features = (
    customer_spending.to_frame()
    .merge(customer_frequency, on="CustomerID")
    .merge(customer_recency, on="CustomerID")
    .merge(customer_top_category, on="CustomerID")
)

In [18]:
# Preview the first five customer profiles.
customer_features.head(n=5)

Unnamed: 0,CustomerID,TotalSpending,PurchaseFrequency,RecencyDays,TopCategory
0,C0001,3354.52,5,55,Electronics
1,C0002,1862.74,4,25,Clothing
2,C0003,2725.38,4,125,Home Decor
3,C0004,5354.88,8,4,Books
4,C0005,2034.24,3,54,Electronics


In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Restrict the lookalike targets to IDs C0001 through C0020
# (the numeric part is zero-padded to four digits).
target_ids = [f"C{number:04d}" for number in range(1, 21)]
target_customers = customer_features[customer_features["CustomerID"].isin(target_ids)]

# Build the model-ready feature table: scaled numeric features plus a one-hot
# encoding of TopCategory, one row per customer.
#
# Both derived frames are given customer_features' own index. pd.concat(axis=1)
# aligns on index labels, so leaving them on a fresh 0..n-1 RangeIndex would
# silently misalign rows (and inject NaN) whenever customer_features is
# filtered, sorted or otherwise carries a non-default index.

# One-hot encode the TopCategory feature (dense output, one column per category).
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
category_encoded = encoder.fit_transform(customer_features[["TopCategory"]])
category_encoded_df = pd.DataFrame(
    category_encoded,
    columns=encoder.get_feature_names_out(["TopCategory"]),
    index=customer_features.index,
)

# Standardize numerical features so that no single scale dominates the similarity.
scaler = StandardScaler()
numeric_features = customer_features[["TotalSpending", "PurchaseFrequency", "RecencyDays"]]
normalized_numeric = scaler.fit_transform(numeric_features)
numeric_features_df = pd.DataFrame(
    normalized_numeric,
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Combine the ID column, normalized numerical and encoded categorical features.
final_features = pd.concat(
    [customer_features[["CustomerID"]], numeric_features_df, category_encoded_df],
    axis=1,
)

# Pairwise cosine similarity between all customers, computed on every feature
# column except the identifier.
feature_matrix = final_features.drop(columns="CustomerID")
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so a customer's scores can be looked up by ID.
id_labels = customer_features["CustomerID"]
similarity_df = pd.DataFrame(similarity_matrix, index=id_labels, columns=id_labels)

# Generate the top 3 similar customers for each target customer.
# lookalike_dict maps CustomerID -> [id1, score1, id2, score2, id3, score3].
lookalike_dict = {}
for cust_id in target_customers["CustomerID"]:
    # Remove the customer's own entry by label before ranking. Slicing off the
    # top-ranked row instead assumes self is always ranked first, which fails
    # when another customer ties at (or rounds above) the self-similarity: self
    # would then be reported as a lookalike and a genuine match dropped.
    similar_customers = similarity_df[cust_id].drop(labels=cust_id).nlargest(3)

    # Store lookalikes and scores separately.
    lookalike_ids = similar_customers.index.tolist()
    scores = similar_customers.tolist()

    # Interleave into id/score pairs; pad with (None, NaN) when fewer than
    # three other customers exist so that every row has six entries.
    row = [value for pair in zip(lookalike_ids, scores) for value in pair]
    row += [None, float("nan")] * (3 - len(lookalike_ids))

    # Add to dictionary.
    lookalike_dict[cust_id] = row

# Convert the dictionary to a DataFrame with CustomerID as a regular column.
# Passing the column names up front also yields a correctly shaped (empty)
# frame when there are no target customers.
lookalike_df = (
    pd.DataFrame.from_dict(
        lookalike_dict,
        orient="index",
        columns=["Lookalike1", "Score1", "Lookalike2", "Score2", "Lookalike3", "Score3"],
    )
    .rename_axis("CustomerID")
    .reset_index()
)

# Destination file for the exported lookalike recommendations.
lookalike_csv_path = "/mnt/data/Lookalike.csv"

# Make sure the parent folder is present before writing (no error if it already is).
output_dir = os.path.dirname(lookalike_csv_path)
os.makedirs(output_dir, exist_ok=True)

# Write the table without the positional row index.
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Preview the first five rows of the lookalike recommendations.
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0072,0.983341,C0048,0.910075,C0069,0.903093
1,C0002,C0010,0.988362,C0029,0.987049,C0083,0.966212
2,C0003,C0052,0.99219,C0025,0.942436,C0160,0.936688
3,C0004,C0101,0.999137,C0122,0.987135,C0017,0.984173
4,C0005,C0186,0.991882,C0140,0.969255,C0199,0.959462



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

