In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
from tqdm import tqdm
import gc
import psutil
import os

# Suppress warnings
warnings.filterwarnings('ignore')

def display_memory_usage():
    process = psutil.Process(os.getpid())
    print(f"Current memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB")

def process_chunk(chunk, label_encoders=None, first_chunk=False):
    """Process a single chunk of data"""
    if first_chunk:
        label_encoders = {}
        categorical_columns = chunk.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            label_encoders[col] = LabelEncoder()
            chunk[col] = label_encoders[col].fit_transform(chunk[col].astype(str))
    else:
        categorical_columns = label_encoders.keys()
        for col in categorical_columns:
            chunk[col] = chunk[col].astype(str)
            chunk[col] = label_encoders[col].transform(chunk[col])

    # Convert to float32 for numeric columns
    numeric_columns = chunk.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        chunk[col] = chunk[col].astype(np.float64)

    return chunk, label_encoders

try:
    print("Starting analysis...")
    display_memory_usage()

    # Create directories
    os.makedirs('Output', exist_ok=True)
    os.makedirs('data', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Initialize variables
    chunk_size = 1000
    label_encoders = None
    first_chunk = True
    feature_names = None

    # Identify unique classes for the target variable
    print("Identifying unique classes across the dataset...")
    all_data = pd.read_csv('data/preprocessed_microsoft365_user_activity.csv', usecols=['deviceCategory'])
    unique_classes = np.unique(all_data['deviceCategory'].astype(np.float64))
    del all_data
    gc.collect()
    print(f"Unique classes identified: {unique_classes}")

    # Initialize model with reduced complexity
    model = SGDClassifier(
        loss='log_loss',  # Use 'log_loss' for classification
        penalty='l2',
        alpha=0.001,
        max_iter=1,  # Enable partial_fit
        warm_start=True,
        random_state=42
    )

    # Process data in chunks and train incrementally
    print("\nProcessing data and training model incrementally...")
    chunks = pd.read_csv('data/preprocessed_microsoft365_user_activity.csv', chunksize=chunk_size)

    accumulated_predictions = []
    accumulated_true_values = []

    for chunk_num, chunk in enumerate(tqdm(chunks)):
        chunk, label_encoders = process_chunk(chunk, label_encoders, first_chunk)
        if first_chunk:
            feature_names = chunk.columns.drop('deviceCategory').tolist()
            first_chunk = False

        X_chunk = chunk.drop('deviceCategory', axis=1)
        y_chunk = chunk['deviceCategory'].values

        # Perform incremental training
        model.partial_fit(X_chunk, y_chunk, classes=unique_classes)

        # Evaluate on every 5th chunk
        if chunk_num % 5 == 0:
            y_pred = model.predict(X_chunk)
            accumulated_predictions.extend(y_pred)
            accumulated_true_values.extend(y_chunk)

        del chunk, X_chunk, y_chunk
        gc.collect()

    # Evaluate model
    print("\nEvaluating model...")
    print("\nClassification Report:\n", 
          classification_report(accumulated_true_values, accumulated_predictions))
    print("\nConfusion Matrix:\n", 
          confusion_matrix(accumulated_true_values, accumulated_predictions))
    print("\nAccuracy Score:", 
          accuracy_score(accumulated_true_values, accumulated_predictions))

    # Save results
    print("\nSaving results...")
    results_df = pd.DataFrame({
        'Actual': accumulated_true_values,
        'Predicted': accumulated_predictions
    })
    results_df.to_csv('data/model_results.csv', index=False)

    # Save model
    import joblib
    joblib.dump(model, 'models/sgd_classifier_model.joblib')

    print("\nAnalysis completed!")
    display_memory_usage()

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    print(traceback.format_exc())


Starting analysis...
Current memory usage: 224.66 MB
Identifying unique classes across the dataset...
Unique classes identified: [0. 1. 2.]

Processing data and training model incrementally...


749it [01:27,  8.53it/s]



Evaluating model...

Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.86      0.85    113150
         1.0       0.49      0.40      0.44     32039
         2.0       0.06      0.10      0.07      4811

    accuracy                           0.74    150000
   macro avg       0.46      0.45      0.45    150000
weighted avg       0.74      0.74      0.74    150000


Confusion Matrix:
 [[97221 11508  4421]
 [16162 12760  3117]
 [ 2617  1732   462]]

Accuracy Score: 0.7362866666666666

Saving results...

Analysis completed!
Current memory usage: 234.38 MB
