In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def load_and_preprocess_data(transactions_path, customers_path):
    """
    Read the transactions and customers CSV files into DataFrames.

    The column names of both tables are printed for inspection. Returns the
    tuple ``(transactions, customers)``. If anything fails while reading or
    reporting, the error message is printed and the original exception is
    re-raised for the caller to handle.
    """
    try:
        # Transactions are read first, then customers.
        transactions, customers = (
            pd.read_csv(csv_path)
            for csv_path in (transactions_path, customers_path)
        )
        print("Data columns available:")
        print("Transactions columns:", transactions.columns.tolist())
        print("Customers columns:", customers.columns.tolist())
    except Exception as exc:
        print(f"Error loading data: {str(exc)}")
        raise
    else:
        return transactions, customers

def engineer_features(transactions, customers):
    """
    Build one numeric feature row per customer that has transactions.

    The result combines three sources:
      * per-customer sum / mean / std of ``TotalValue`` and ``Quantity``,
        plus the transaction count and number of distinct products;
      * the customer table, left-joined on ``CustomerID``;
      * one column per ``ProductID`` holding the share of that customer's
        transactions that involved the product.

    Missing values are replaced with 0 and every non-numeric column other
    than ``CustomerID`` is dropped. The final column list is printed and the
    feature DataFrame is returned.
    """
    # Per-customer spend and quantity statistics.
    spend_stats = transactions.groupby("CustomerID").agg(
        total_value=("TotalValue", "sum"),
        avg_value=("TotalValue", "mean"),
        std_value=("TotalValue", "std"),
        total_quantity=("Quantity", "sum"),
        avg_quantity=("Quantity", "mean"),
        std_quantity=("Quantity", "std"),
        transaction_count=("ProductID", "count"),
        unique_products=("ProductID", "nunique"),
    ).reset_index()

    # Customer x product transaction counts, normalised so each row sums to 1.
    pair_counts = transactions.groupby(["CustomerID", "ProductID"]).size()
    purchase_counts = pair_counts.unstack(fill_value=0)
    row_totals = purchase_counts.sum(axis=1)
    purchase_shares = purchase_counts.div(row_totals, axis=0)

    # Attach customer attributes, then the product shares when there are any.
    combined = spend_stats.merge(customers, on="CustomerID", how="left")
    if not purchase_shares.empty:
        combined = combined.merge(purchase_shares, on="CustomerID", how="left")

    # A single-transaction customer has an undefined std; it becomes 0 here.
    combined = combined.fillna(0)

    # Keep CustomerID as the only non-numeric column.
    droppable = [
        name
        for name in combined.select_dtypes(exclude=['number']).columns
        if name != 'CustomerID'
    ]
    combined = combined.drop(columns=droppable)

    print("Final features:", combined.columns.tolist())
    return combined

def build_lookalike_model(feature_matrix, n_neighbors=4, metric="cosine"):
    """
    Fit a nearest-neighbour index over the standardized customer features.

    ``feature_matrix`` must contain a ``CustomerID`` column; every other
    column is treated as a feature. The features are standardized with
    ``StandardScaler`` and a ``NearestNeighbors`` model configured with
    ``n_neighbors`` and ``metric`` is fitted on the result.

    Returns ``(knn, scaled_features, customer_ids)`` where ``customer_ids``
    is the ``CustomerID`` column in the same row order as ``scaled_features``.
    """
    # Split the identifier column from the feature columns.
    customer_ids = feature_matrix["CustomerID"]
    feature_columns = feature_matrix.drop(columns="CustomerID")

    # Put every feature on a comparable scale before measuring distance.
    scaled_features = StandardScaler().fit_transform(feature_columns)

    # fit() returns the estimator itself, so construction and fitting chain.
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(scaled_features)

    return knn, scaled_features, customer_ids

def generate_lookalikes(knn, scaled_features, customer_ids, start_id=1, end_id=20,
                        top_n=3):
    """
    Generate lookalike recommendations for a range of customers.

    Parameters:
        knn: fitted model exposing ``kneighbors(X)`` that returns
            ``(distances, indices)`` for the query rows.
        scaled_features: feature rows, positionally aligned with
            ``customer_ids``.
        customer_ids: pandas Series of IDs; the characters after the first
            one must parse as an integer (``int(cid[1:])``).
        start_id, end_id: inclusive bounds on that integer part selecting
            which customers to generate lookalikes for.
        top_n: maximum number of lookalikes kept per customer (default 3).

    Returns:
        dict mapping each selected customer ID to a list of
        ``(other_customer_id, similarity)`` tuples in the order the model
        returned them, where similarity is ``1 - distance`` rounded to six
        decimal places.
    """
    similarities = {}

    # Positions of the customers whose numeric ID part lies in the range.
    customer_range = [
        pos for pos, cid in enumerate(customer_ids)
        if start_id <= int(cid[1:]) <= end_id
    ]

    for pos in customer_range:
        distances, indices = knn.kneighbors([scaled_features[pos]])
        # Exclude the query customer by index, not by assuming it is the first
        # neighbour: with duplicate feature vectors (tied distances) another
        # customer can be ranked ahead of it, and slicing off position 0 would
        # then drop a real lookalike and report the customer as its own match.
        similar_customers = [
            (customer_ids.iloc[idx], round(float(1 - dist), 6))
            for idx, dist in zip(indices[0], distances[0])
            if idx != pos
        ]
        similarities[customer_ids.iloc[pos]] = similar_customers[:top_n]

    return similarities

def save_results(similarities, output_path):
    """
    Write the lookalike mapping to a CSV file and return it as a DataFrame.

    Each key of ``similarities`` becomes one row (the key is the row index)
    and the items of its list are spread across the columns. The index is
    written to the file as the first column.
    """
    result_frame = pd.DataFrame.from_dict(similarities, orient='index')
    result_frame.to_csv(output_path, index=True)
    return result_frame

def main():
    """
    Run the lookalike pipeline end to end.

    Loads the two CSV files, engineers the feature matrix, fits the
    nearest-neighbour model, generates lookalikes for customer numbers 1-20,
    writes them to ``Lookalike.csv`` and prints the first rows. Any exception
    raised along the way is caught and reported on stdout rather than
    propagated.
    """
    # Input and output locations.
    output_path = "Lookalike.csv"
    customers_path = r"C:\Users\hai\Downloads\Customers.csv"
    transactions_path = r"C:\Users\hai\Downloads\Transactions.csv"

    try:
        print("Loading data...")
        tx_frame, customer_frame = load_and_preprocess_data(transactions_path, customers_path)

        print("\nEngineering features...")
        features = engineer_features(tx_frame, customer_frame)

        print("\nBuilding model...")
        model, scaled, ids = build_lookalike_model(features)

        print("\nGenerating lookalikes...")
        matches = generate_lookalikes(model, scaled, ids, 1, 20)

        print("\nSaving results...")
        saved_frame = save_results(matches, output_path)

        print("\nTop 3 lookalikes for first 5 customers:")
        print(saved_frame.head())

    except Exception as exc:
        print(f"An error occurred: {str(exc)}")


if __name__ == "__main__":
    main()

Loading data...
Data columns available:
Transactions columns: ['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate', 'Quantity', 'TotalValue', 'Price']
Customers columns: ['CustomerID', 'CustomerName', 'Region', 'SignupDate']

Engineering features...
Final features: ['CustomerID', 'total_value', 'avg_value', 'std_value', 'total_quantity', 'avg_quantity', 'std_quantity', 'transaction_count', 'unique_products', 'P001', 'P002', 'P003', 'P004', 'P005', 'P006', 'P007', 'P008', 'P009', 'P010', 'P011', 'P012', 'P013', 'P014', 'P015', 'P016', 'P017', 'P018', 'P019', 'P020', 'P021', 'P022', 'P023', 'P024', 'P025', 'P026', 'P027', 'P028', 'P029', 'P030', 'P031', 'P032', 'P033', 'P034', 'P035', 'P036', 'P037', 'P038', 'P039', 'P040', 'P041', 'P042', 'P043', 'P044', 'P045', 'P046', 'P047', 'P048', 'P049', 'P050', 'P051', 'P052', 'P053', 'P054', 'P055', 'P056', 'P057', 'P058', 'P059', 'P060', 'P061', 'P062', 'P063', 'P064', 'P065', 'P066', 'P067', 'P068', 'P069', 'P070', 'P071', 'P072', 'P