# In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
# Every path must start with the "C:/" drive root. "C:Users/..." (no slash
# after the colon) is a drive-relative path on Windows: it is resolved against
# the current directory of drive C: instead of the root, so the file is
# normally not found.
customers_file = "C:/Users/rajes/Downloads/Customers.csv"
products_file = "C:/Users/rajes/Downloads/Products.csv"
transactions_file = "C:/Users/rajes/Downloads/Transactions.csv"

# Read each CSV into its own DataFrame; the later steps join them on
# ProductID and CustomerID.
customers_df = pd.read_csv(customers_file)
products_df = pd.read_csv(products_file)
transactions_df = pd.read_csv(transactions_file)

# Step 1: Build one wide table of transactions.
# Left joins keep every transaction row and attach the matching product
# columns (via ProductID) and then the matching customer columns (via CustomerID).
transactions_with_products = pd.merge(
    transactions_df, products_df, how="left", on="ProductID"
)
data = pd.merge(
    transactions_with_products, customers_df, how="left", on="CustomerID"
)

# Step 2: Feature engineering
# Per-customer aggregates: total spend, total quantity, number of distinct
# products bought, and the customer's region.
spend_by_customer = data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_quantity=("Quantity", "sum"),
    unique_products=("ProductID", "nunique"),
    regions=("Region", "first"),  # assumes a single region per customer
)
customer_features = spend_by_customer.reset_index()

# Category preferences: number of transaction rows per (customer, category),
# pivoted so each category becomes a column; missing combinations count as 0.
category_counts = data.groupby(["CustomerID", "Category"]).size()
product_preferences = category_counts.unstack(fill_value=0).reset_index()

# One row per customer: the aggregates followed by the category-count columns.
customer_data = customer_features.merge(
    product_preferences, on="CustomerID", how="left"
)

# Step 3: Compute similarity using numerical features
# Drop the identifier and the non-numeric region column; everything left is a
# numeric feature.
# customer_data is the result of a left merge, so a customer with no row in the
# category-count table (e.g. none of their transactions has a known Category)
# gets NaN in every category column. StandardScaler passes NaN through and
# cosine_similarity rejects NaN input, so treat those as zero interactions.
numerical_features = customer_data.drop(columns=["CustomerID", "regions"]).fillna(0)

# Standardize each feature so large-valued columns (e.g. total spend) do not
# dominate the similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)

# Pairwise cosine similarity; row i / column j compares customer i with
# customer j, in the row order of customer_data.
similarity_matrix = cosine_similarity(scaled_features)

# Step 4: For every customer, keep the 3 most similar other customers
customer_ids = customer_data["CustomerID"].tolist()
similarity_dict = {}

for row_pos, (current_id, row) in enumerate(zip(customer_ids, similarity_matrix)):
    # Pair each score with its column position, leaving out the customer itself
    candidates = [
        (other_pos, score)
        for other_pos, score in enumerate(row)
        if other_pos != row_pos
    ]
    # Highest similarity first (stable sort, so ties keep their original order)
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Translate column positions back into customer IDs for the best three
    similarity_dict[current_id] = [
        (customer_ids[other_pos], score) for other_pos, score in candidates[:3]
    ]

# Step 5: Write the lookalike lists of the first 20 customers to a CSV file
first_twenty = customer_ids[:20]
lookalike_data = {
    "cust_id": list(first_twenty),
    # Each entry is that customer's list of (similar customer ID, score) pairs
    "lookalikes": [similarity_dict[cid] for cid in first_twenty],
}

lookalike_csv_path = "Rajesh_Balasingi_Lookalike.csv"
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv(lookalike_csv_path, index=False)

print("Lookalike model results saved to:", lookalike_csv_path)


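# Recorded cell output (NOTE: the path in this message does not match the
# absolute paths used above — presumably captured from an earlier run that
# used a relative path; confirm by re-running):
# FileNotFoundError: [Errno 2] No such file or directory: 'Customers.csv'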