In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

"""
This script loads customer, product, and transaction data, processes it, and builds a lookalike recommendation system
using a feature-based approach and cosine similarity. The goal is to find customers with similar purchasing behavior
and generate lookalike recommendations.

Classes and Functions:
----------------------
1. load_data():
    - Loads and preprocesses the Customers, Products, and Transactions datasets.
    - Cleans data by handling duplicates, missing values, and invalid entries.
    - Ensures data consistency across CustomerID and ProductID columns.
    - Returns:
        - customers_df: Processed customer dataset.
        - products_df: Processed product dataset.
        - transactions_df: Processed transaction dataset.

2. LookalikeModel:
    - A class for building and using the lookalike recommendation model.

    Methods:
    --------
    a) __init__(customers_df, products_df, transactions_df):
        - Initializes the LookalikeModel with cleaned datasets.

    b) prepare_features():
        - Prepares a feature matrix for customers using transaction statistics and spending patterns.
        - Features include:
            - Transaction statistics: Total value, mean value, count of transactions, total quantity, mean quantity.
            - Spending percentage by product category.
        - Standardizes the features using StandardScaler.

    c) find_lookalikes(customer_id, n_recommendations=3):
        - Finds similar customers based on cosine similarity of feature vectors.
        - Arguments:
            - customer_id (str): ID of the customer for whom lookalikes are needed.
            - n_recommendations (int): Number of similar customers to recommend.
        - Returns:
            - List of dictionaries with recommended customer IDs and similarity scores.

3. generate_lookalike_recommendations():
    - Main function to generate lookalike recommendations for multiple customers.
    - Loads data using `load_data()`.
    - Initializes the LookalikeModel and prepares the feature matrix.
    - Finds lookalikes for 20 customers (C0001 to C0020).
    - Saves the results in a CSV file ('Dev_Rathore_Lookalike.csv').
    - Returns:
        - A dictionary where each key is a customer ID and the value is a list of lookalike recommendations.

Modules Used:
-------------
- pandas: For data manipulation.
- numpy: For numerical operations.
- sklearn.preprocessing.StandardScaler: To standardize the feature matrix.
- sklearn.metrics.pairwise.cosine_similarity: To compute cosine similarity between customer feature vectors.

Notes:
------
- Ensure the datasets 'Customers.csv', 'Products.csv', and 'Transactions.csv' are available in the working directory.
- Handles invalid or missing data during preprocessing to avoid errors.
- Results are saved in a CSV file for further analysis or reporting.

Example:
--------
1. To load and preprocess data:
    customers_df, products_df, transactions_df = load_data()

2. To initialize and use the lookalike model:
    model = LookalikeModel(customers_df, products_df, transactions_df)
    model.prepare_features()
    recommendations = model.find_lookalikes('C0001', n_recommendations=5)

3. To generate recommendations for multiple customers and save results:
    recommendations = generate_lookalike_recommendations()
"""

def load_data():
    """Load and clean the customer, product, and transaction CSV files.

    Reads 'Customers.csv', 'Products.csv' and 'Transactions.csv' from the
    current working directory and applies the following clean-up:

    * Customers: exact duplicate rows dropped, 'SignupDate' parsed to
      datetime (unparseable values become NaT), missing 'Region' set to
      'Unknown'.
    * Products: exact duplicate rows dropped, missing 'Price' filled with the
      column median, missing 'Category' set to 'Other'.
    * Transactions: exact duplicate rows dropped; rows with an unparseable
      'TransactionDate' removed; non-numeric, missing or non-positive
      'Quantity' replaced by 1; missing or non-positive 'TotalValue'
      recomputed as Price * Quantity; rows still lacking a CustomerID,
      ProductID or TotalValue removed; rows referring to a customer or
      product absent from the other two files removed.

    Finally the 'CustomerID' / 'ProductID' columns of every frame are cast
    to ``str``.

    Returns:
        tuple: ``(customers_df, products_df, transactions_df)`` as cleaned
        pandas DataFrames.
    """
    customers_df = pd.read_csv('Customers.csv')
    products_df = pd.read_csv('Products.csv')
    transactions_df = pd.read_csv('Transactions.csv')

    # Every row-dropping step below is followed by .copy(): the columns are
    # assigned to afterwards, and writing into the result of a filter can
    # trigger SettingWithCopyWarning on pandas versions without copy-on-write.
    customers_df = customers_df.drop_duplicates().copy()
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'], errors='coerce')
    customers_df['Region'] = customers_df['Region'].fillna('Unknown')

    products_df = products_df.drop_duplicates().copy()
    products_df['Price'] = products_df['Price'].fillna(products_df['Price'].median())
    products_df['Category'] = products_df['Category'].fillna('Other')

    transactions_df = transactions_df.drop_duplicates().copy()
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'], errors='coerce')

    # A transaction without a usable date is discarded outright.
    transactions_df = transactions_df.dropna(subset=['TransactionDate']).copy()

    # Unusable quantities (non-numeric, missing, zero or negative) default to 1.
    transactions_df['Quantity'] = pd.to_numeric(transactions_df['Quantity'], errors='coerce')
    transactions_df['Quantity'] = transactions_df['Quantity'].fillna(1)
    transactions_df.loc[transactions_df['Quantity'] <= 0, 'Quantity'] = 1

    transactions_df['Price'] = pd.to_numeric(transactions_df['Price'], errors='coerce')
    transactions_df['TotalValue'] = pd.to_numeric(transactions_df['TotalValue'], errors='coerce')

    # Rebuild missing / non-positive totals from the row's own price and quantity.
    # If Price is missing too the total stays NaN and the row is dropped below.
    mask = (transactions_df['TotalValue'].isna()) | (transactions_df['TotalValue'] <= 0)
    transactions_df.loc[mask, 'TotalValue'] = transactions_df.loc[mask, 'Price'] * transactions_df.loc[mask, 'Quantity']

    transactions_df = transactions_df.dropna(subset=['CustomerID', 'ProductID', 'TotalValue']).copy()

    valid_customers = customers_df['CustomerID'].unique()
    valid_products = products_df['ProductID'].unique()

    # Keep only transactions whose customer and product both exist.
    transactions_df = transactions_df[
        transactions_df['CustomerID'].isin(valid_customers) &
        transactions_df['ProductID'].isin(valid_products)
    ].copy()

    # Normalise the key columns to strings in every frame that has them.
    for df in [customers_df, products_df, transactions_df]:
        if 'CustomerID' in df.columns:
            df['CustomerID'] = df['CustomerID'].astype(str)
        if 'ProductID' in df.columns:
            df['ProductID'] = df['ProductID'].astype(str)

    return customers_df, products_df, transactions_df



In [3]:

class LookalikeModel:
    """Customer lookalike model based on cosine similarity of spending features.

    Each customer with at least one usable transaction is described by
    transaction statistics (total / mean / count of 'TotalValue', total / mean
    of 'Quantity') plus the share of their spend in each product category.
    The features are standardized and compared with cosine similarity.
    """

    def __init__(self, customers_df, products_df, transactions_df):
        """Store the input frames; the feature matrix is built on demand.

        Args:
            customers_df: DataFrame with at least a 'CustomerID' column.
            products_df: DataFrame with at least 'ProductID' and 'Category'.
            transactions_df: DataFrame with at least 'CustomerID',
                'ProductID', 'TotalValue' and 'Quantity'.
        """
        self.customers_df = customers_df
        self.products_df = products_df
        self.transactions_df = transactions_df
        self.feature_matrix = None

    def prepare_features(self):
        """Build the standardized per-customer feature matrix.

        The result is stored in ``self.feature_matrix`` as a DataFrame indexed
        by CustomerID. Only transactions whose customer and product appear in
        the customer / product frames contribute.
        """
        # Join on the key columns only. Pulling in every customer/product
        # column would (a) multiply transaction rows when an ID appears more
        # than once in those frames, inflating the statistics, and (b) create
        # suffixed duplicates of columns such as 'Price' that exist on both sides.
        known_customers = self.customers_df[['CustomerID']].drop_duplicates()
        product_categories = self.products_df[['ProductID', 'Category']].drop_duplicates(subset='ProductID')

        merged_df = self.transactions_df.merge(known_customers, on='CustomerID')
        merged_df = merged_df.merge(product_categories, on='ProductID')

        transaction_stats = merged_df.groupby('CustomerID').agg({
            'TotalValue': ['sum', 'mean', 'count'],
            'Quantity': ['sum', 'mean'],
        }).fillna(0)

        # Flatten the (column, aggregate) MultiIndex into names like 'TotalValue_sum'.
        transaction_stats.columns = ['_'.join(col).strip() for col in transaction_stats.columns.values]

        category_pivot = pd.pivot_table(
            merged_df,
            values='TotalValue',
            index='CustomerID',
            columns='Category',
            aggfunc='sum',
            fill_value=0
        )
        category_totals = category_pivot.sum(axis=1)
        # A customer whose spend sums to zero would produce NaN/inf shares, and
        # cosine_similarity rejects non-finite input; give them all-zero shares.
        safe_totals = category_totals.where(category_totals != 0)
        category_percentages = category_pivot.div(safe_totals, axis=0).fillna(0)

        customer_features = pd.concat([
            transaction_stats,
            category_percentages
        ], axis=1)

        scaler = StandardScaler()
        self.feature_matrix = pd.DataFrame(
            scaler.fit_transform(customer_features),
            index=customer_features.index,
            columns=customer_features.columns
        )

    def find_lookalikes(self, customer_id, n_recommendations=3):
        """Return the customers most similar to ``customer_id``.

        Args:
            customer_id (str): ID of the customer to find lookalikes for.
            n_recommendations (int): Maximum number of lookalikes to return.
                Zero or a negative value yields an empty list.

        Returns:
            list[dict]: Up to ``n_recommendations`` entries, most similar
            first, each ``{'customer_id': ..., 'similarity_score': ...}``.
            The queried customer is never included.

        Raises:
            KeyError: If ``customer_id`` has no row in the feature matrix
                (for example because it has no usable transactions).
        """
        if self.feature_matrix is None:
            self.prepare_features()

        if customer_id not in self.feature_matrix.index:
            raise KeyError(
                f"Customer {customer_id!r} has no feature vector "
                "(unknown ID or no usable transactions)"
            )

        # Without this guard the '== n_recommendations' stop condition below is
        # never met for n <= 0 and every other customer would be returned.
        if n_recommendations <= 0:
            return []

        customer_vector = self.feature_matrix.loc[customer_id].values.reshape(1, -1)

        similarities = cosine_similarity(customer_vector, self.feature_matrix)

        # Row positions ordered from most to least similar.
        similar_indices = similarities[0].argsort()[::-1]
        similar_customers = []

        for idx in similar_indices:
            current_customer = self.feature_matrix.index[idx]
            if current_customer == customer_id:
                # Skip the query customer itself.
                continue

            similar_customers.append({
                'customer_id': current_customer,
                'similarity_score': float(similarities[0][idx])
            })

            if len(similar_customers) == n_recommendations:
                break

        return similar_customers

def generate_lookalike_recommendations(customer_ids=None, n_recommendations=3,
                                       output_path='Dev_Rathore_Lookalike.csv'):
    """Generate lookalike recommendations for a set of customers and save them.

    Loads the data with ``load_data()``, builds a ``LookalikeModel``, looks up
    the most similar customers for every requested ID and writes one CSV row
    per (source customer, rank) pair.

    Args:
        customer_ids: Iterable of customer IDs to process. Defaults to
            'C0001' through 'C0020'.
        n_recommendations (int): Number of lookalikes per customer.
        output_path (str): Destination CSV file.

    Returns:
        dict: Maps each requested customer ID to a list of
        ``{'customer_id': ..., 'score': ...}`` dicts, best match first.
        A customer that has no row in the feature matrix (for example no
        usable transactions) maps to an empty list and contributes no CSV rows.
    """
    customers_df, products_df, transactions_df = load_data()

    model = LookalikeModel(customers_df, products_df, transactions_df)
    # Build the features up front so we can tell which customers are scorable.
    model.prepare_features()

    if customer_ids is None:
        customer_ids = [f'C{i:04d}' for i in range(1, 21)]

    recommendations = {}
    results = []
    for customer_id in customer_ids:
        # A customer without a feature vector would make find_lookalikes raise
        # KeyError and abort the whole run; record an empty result instead.
        if customer_id in model.feature_matrix.index:
            lookalikes = model.find_lookalikes(customer_id, n_recommendations)
        else:
            lookalikes = []

        recommendations[customer_id] = [
            {'customer_id': rec['customer_id'], 'score': rec['similarity_score']}
            for rec in lookalikes
        ]

        for rank, rec in enumerate(recommendations[customer_id], 1):
            results.append({
                'source_customer': customer_id,
                'rank': rank,
                'recommended_customer': rec['customer_id'],
                'similarity_score': rec['score']
            })

    # Explicit columns keep the CSV header intact even when there are no rows.
    results_df = pd.DataFrame(
        results,
        columns=['source_customer', 'rank', 'recommended_customer', 'similarity_score']
    )
    results_df.to_csv(output_path, index=False)

    return recommendations

# Run the full pipeline when this cell executes: the call loads the input CSVs via
# load_data() and writes 'Dev_Rathore_Lookalike.csv'; the returned dict is kept for inspection.
lookalike_results = generate_lookalike_recommendations()
