In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
from tqdm import tqdm
import gc
import psutil
import os

# Suppress warnings
warnings.filterwarnings('ignore')

def display_memory_usage():
    process = psutil.Process(os.getpid())
    print(f"Current memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB")

def process_chunk(chunk, label_encoders=None, first_chunk=False):
    """Process a single chunk of data"""
    if first_chunk:
        label_encoders = {}
        categorical_columns = chunk.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            label_encoders[col] = LabelEncoder()
            chunk[col] = label_encoders[col].fit_transform(chunk[col].astype(str))
    else:
        categorical_columns = label_encoders.keys()
        for col in categorical_columns:
            chunk[col] = chunk[col].astype(str)
            chunk[col] = label_encoders[col].transform(chunk[col])

    # Convert to float32 for numeric columns
    numeric_columns = chunk.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        chunk[col] = chunk[col].astype(np.float32)

    return chunk, label_encoders

try:
    print("Starting analysis...")
    display_memory_usage()

    # Create directories
    os.makedirs('output', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('model', exist_ok=True)

    # Initialize variables
    chunk_size = 1000
    label_encoders = None
    first_chunk = True
    feature_names = None

    # Identify unique classes for the target variable
    print("Identifying unique classes across the dataset...")
    all_data = pd.read_csv('data/processed_amazon_purchase_data.csv', usecols=['Purchase'])
    unique_classes = np.unique(all_data['Purchase'].astype(np.float32))
    del all_data
    gc.collect()
    print(f"Unique classes identified: {unique_classes}")

    # Initialize model with reduced complexity
    model = SGDClassifier(
        loss='log_loss',  # Use 'log_loss' for classification
        penalty='l2',
        alpha=0.001,
        max_iter=1,  # Enable partial_fit
        warm_start=True,
        random_state=42
    )

    # Process data in chunks and train incrementally
    print("\nProcessing data and training model incrementally...")
    chunks = pd.read_csv('data/processed_amazon_purchase_data.csv', chunksize=chunk_size)

    accumulated_predictions = []
    accumulated_true_values = []

    for chunk_num, chunk in enumerate(tqdm(chunks)):
        chunk, label_encoders = process_chunk(chunk, label_encoders, first_chunk)
        if first_chunk:
            feature_names = chunk.columns.drop('Purchase').tolist()
            first_chunk = False

        X_chunk = chunk.drop('Purchase', axis=1)
        y_chunk = chunk['Purchase'].values

        # Perform incremental training
        model.partial_fit(X_chunk, y_chunk, classes=unique_classes)

        # Evaluate on every 5th chunk
        if chunk_num % 5 == 0:
            y_pred = model.predict(X_chunk)
            accumulated_predictions.extend(y_pred)
            accumulated_true_values.extend(y_chunk)

        del chunk, X_chunk, y_chunk
        gc.collect()

    # Evaluate model
    print("\nEvaluating model...")
    print("\nClassification Report:\n", 
          classification_report(accumulated_true_values, accumulated_predictions))
    print("\nConfusion Matrix:\n", 
          confusion_matrix(accumulated_true_values, accumulated_predictions))
    print("\nAccuracy Score:", 
          accuracy_score(accumulated_true_values, accumulated_predictions))

    # Save results
    print("\nSaving results...")
    results_df = pd.DataFrame({
        'Actual': accumulated_true_values,
        'Predicted': accumulated_predictions
    })
    results_df.to_csv('data/model_results.csv', index=False)

    # Save model
    import joblib
    joblib.dump(model, 'model/sgd_classifier_model.joblib')

    print("\nAnalysis completed!")
    display_memory_usage()

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    print(traceback.format_exc())


Starting analysis...
Current memory usage: 748.02 MB
Identifying unique classes across the dataset...
Unique classes identified: [1.2000e+01 1.3000e+01 1.4000e+01 ... 2.3959e+04 2.3960e+04 2.3961e+04]

Processing data and training model incrementally...


551it [1:59:26, 13.01s/it]



Evaluating model...

Classification Report:
               precision    recall  f1-score   support

        12.0       0.00      0.00      0.00         4
        13.0       0.00      0.00      0.00         3
        14.0       0.00      0.00      0.00         5
        24.0       0.00      0.00      0.00         5
        25.0       0.00      0.00      0.00         1
        26.0       0.00      0.00      0.00         7
        36.0       0.00      0.00      0.00         2
        37.0       0.00      0.00      0.00         1
        38.0       0.00      0.00      0.00         4
        48.0       0.00      0.00      0.00         5
        49.0       0.00      0.00      0.00         5
        50.0       0.00      0.00      0.00         3
        60.0       0.00      0.00      0.00         2
        61.0       0.00      0.00      0.00         5
        62.0       0.00      0.00      0.00         6
       118.0       0.00      0.00      0.00         1
       119.0       0.00      0.00  