In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import warnings
import gc
import os
from sklearn.preprocessing import KBinsDiscretizer


# Install a global "ignore" filter: every warning raised after this point is
# silenced, whatever its category or originating module.
warnings.filterwarnings('ignore')

def preprocess_data(df):
    """
    Clean, encode, de-skew and scale a DataFrame

    Steps:
    1. Remove duplicate rows
    2. Label-encode every object-dtype column
    3. Apply log1p to numeric columns whose absolute skewness exceeds 1
    4. Scale all numeric columns with RobustScaler

    The work is done on a copy, so the caller's DataFrame is not modified.

    Args:
    df (pd.DataFrame): Input data

    Returns:
    Tuple of (processed DataFrame, dict mapping each encoded column name
    to the LabelEncoder fitted on it)
    """
    # De-duplicate into a fresh frame instead of mutating the argument
    df = df.drop_duplicates().copy()

    # Categorical encoding (values are stringified first, so missing values
    # become their own class rather than raising)
    categorical_columns = df.select_dtypes(include=['object']).columns
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # The freshly encoded categorical columns are numeric now, so they are
    # included here as well
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if numeric_columns.empty:
        # Nothing to transform or scale
        return df, label_encoders

    # Detect skewness and transform if needed
    skewness = df[numeric_columns].skew()
    skewed_columns = skewness.index[skewness.abs() > 1]
    for col in skewed_columns:
        # log1p is only defined for values > -1; applying it blindly would
        # silently inject NaN/-inf into the data, so such columns are left
        # untouched and only scaled below
        if df[col].min() > -1:
            df[col] = np.log1p(df[col])

    # Outlier-tolerant scaling using RobustScaler
    scaler = RobustScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    return df, label_encoders

def balance_dataset(X, y):
    """
    Resample the given features and labels with SMOTE

    Args:
    X: Feature matrix passed to SMOTE.fit_resample
    y: Class labels passed to SMOTE.fit_resample

    Returns:
    Tuple of (resampled features, resampled labels)
    """
    # Fixed seed keeps the resampling reproducible between runs
    balanced_features, balanced_labels = SMOTE(random_state=42).fit_resample(X, y)
    return balanced_features, balanced_labels

def dimensionality_reduction(X, n_components=0.95):
    """
    Fit a PCA on X and return X projected onto its components

    Args:
    X: Feature matrix used both to fit the PCA and to be transformed
    n_components: Forwarded unchanged to PCA (default 0.95)

    Returns:
    Tuple of (transformed features, fitted PCA instance)
    """
    reducer = PCA(n_components=n_components)
    # The fitted reducer is returned too so callers can reuse or persist it
    return reducer.fit_transform(X), reducer

def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """
    Train SGDClassifier and provide detailed evaluation

    Prints a classification report for the test split and writes a
    confusion-matrix heatmap to output/confusion_matrix.png, creating the
    output directory when it does not exist yet.

    Args:
    X_train: Training features
    X_test: Test features
    y_train: Training labels
    y_test: Test labels

    Returns:
    The fitted SGDClassifier
    """
    model = SGDClassifier(
        loss='log_loss',
        penalty='l2',
        alpha=0.001,
        max_iter=1000,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Detailed model evaluation
    print("\nModel Performance Metrics:")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix Visualization
    cm = confusion_matrix(y_test, y_pred)
    output_path = 'output/confusion_matrix.png'
    # savefig does not create missing directories, so make sure it exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close(fig)

    return model

def convert_to_discrete_classes(y, n_bins=5):
    """
    Convert continuous target variable to discrete classes

    Args:
    y (array-like): Continuous target values (pandas Series, NumPy array,
        or a plain sequence such as a list)
    n_bins (int): Number of bins to discretize the target

    Returns:
    1-D integer array with the ordinal bin index of every input value
    """
    # Use KBinsDiscretizer to convert continuous values to discrete classes
    discretizer = KBinsDiscretizer(
        n_bins=n_bins,
        encode='ordinal',  # Output as integer classes
        strategy='quantile'  # Bin edges are placed at quantiles of y
    )

    # KBinsDiscretizer expects a 2-D input. np.asarray accepts Series,
    # ndarrays and plain sequences alike, so lists work too.
    y_column = np.asarray(y).reshape(-1, 1)

    # Fit and transform, then flatten back to one integer label per sample
    y_discrete = discretizer.fit_transform(y_column).ravel().astype(int)

    return y_discrete

def main():
    """
    Run the full pipeline: load, preprocess, split, reduce, balance, train, save

    Reads data/processed_amazon_purchase_data.csv, bins the 'Purchase' column
    into discrete classes, trains the classifier and stores the fitted model
    and PCA transformer under model/.
    """
    import joblib

    # Load data
    df = pd.read_csv('data/processed_amazon_purchase_data.csv')

    # Preprocess data (the fitted label encoders are not used further here)
    df, _label_encoders = preprocess_data(df)

    # Convert Purchase to discrete classes
    y = convert_to_discrete_classes(df['Purchase'])
    X = df.drop('Purchase', axis=1)

    # Split BEFORE fitting PCA so no information from the test rows leaks
    # into the learned components
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Dimensionality reduction: fit on the training split only, then apply
    # the same projection to the test split
    X_train, pca = dimensionality_reduction(X_train_raw)
    X_test = pca.transform(X_test_raw)

    # Balance dataset (training split only, the test split stays untouched)
    X_train_balanced, y_train_balanced = balance_dataset(X_train, y_train)

    # Train and evaluate model
    final_model = train_and_evaluate_model(X_train_balanced, X_test, y_train_balanced, y_test)

    # Save model and results; joblib.dump does not create missing directories
    os.makedirs('model', exist_ok=True)
    joblib.dump(final_model, 'model/sgd_classifier_model.joblib')
    joblib.dump(pca, 'model/pca_transformer.joblib')

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Model Performance Metrics:
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.56      0.48     22094
           1       0.43      0.50      0.46     22008
           2       0.00      0.00      0.00     22065
           3       0.42      0.51      0.46     22005
           4       0.56      0.72      0.63     21842

    accuracy                           0.46    110014
   macro avg       0.36      0.46      0.41    110014
weighted avg       0.36      0.46      0.40    110014

