In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

# Import necessary libraries from scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

# --- CUDA Imports (cuML) Removed ---
# NOTE: The attempt to install cuML failed (as expected for pip). 
# We are reverting to standard scikit-learn components for universal compatibility.
# The code below will now use the standard Pipeline, TfidfVectorizer, and LinearSVC.
# CUDA_ENABLED = False is implicit, and we rely entirely on CPU-based libraries.

# Download necessary NLTK components (run this once)
try:
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# --- Section 6.2: From Raw Data to Clean Text ---

print("1. Loading Data...")
# 1. Load Data: The file is a zipped CSV, which pandas handles automatically.
try:
    df = pd.read_csv('https://files.consumerfinance.gov/ccdb/complaints.csv.zip')
except Exception as e:
    # If download fails, report a fatal error and re-raise.
    print(f"Fatal Error: Could not load data from URL. Please ensure you have internet access and the URL is correct. Error: {e}")
    raise


print("2. Filtering and Cleaning...")
# 2. Filter and Clean
# Filter the DataFrame to keep only the required columns and non-null narratives.
df = df[['Product', 'Consumer complaint narrative']].copy()
df.columns = ['Product', 'Narrative']

# Drop rows with missing narratives
df.dropna(subset=['Narrative'], inplace=True)

# Define the required classification categories
TARGET_CATEGORIES = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Consumer Loan',
    'Mortgage'
]

# Filter the DataFrame to include only the target categories
df = df[df['Product'].isin(TARGET_CATEGORIES)].copy()

# Reduce category names for readability (optional but good practice)
df['Product'] = df['Product'].replace({
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Report/Repair',
    'Debt collection': 'Debt Collection',
    'Consumer Loan': 'Consumer Loan',
    'Mortgage': 'Mortgage'
})

# Display class balance
print(f"Total relevant complaints: {len(df)}")
print("Class distribution:\n", df['Product'].value_counts())


# 3. Preprocess Text Function
def preprocess_text(text):
    """Converts text to lowercase, removes punctuation/numbers, and stops."""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers (keep only letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

# Apply preprocessing to the Narrative column
df['Clean_Narrative'] = df['Narrative'].apply(preprocess_text)


# --- Section 6.3 & 6.4: Feature Engineering and Training ---

print("\n3. Splitting Data and Preparing Pipelines...")
# Define features (X) and target (y)
X = df['Clean_Narrative']
y = df['Product']

# 1. Split Data: Divide the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# 2. Create Pipelines and 3. Train Models
# Define a single feature extractor (TF-IDF Vectorizer)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)

# Define classifiers 
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'),
    'LinearSVC': LinearSVC(random_state=42, dual=False) # Re-added dual=False for scikit-learn
}

results = []

# Train and evaluate each model in a pipeline
best_accuracy = -1
for name, classifier in classifiers.items():
    print(f"   -> Training {name}...")
    
    # Create the standard scikit-learn Pipeline
    current_pipeline = Pipeline([
        ('tfidf', tfidf),
        ('classifier', classifier)
    ])
    
    # Train the pipeline
    current_pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = current_pipeline.predict(X_test)
    
    # No conversion needed as y_pred is already a numpy array/pandas series
    
    # Evaluate performance
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store results for comparison table
    results.append({
        'Model': name,
        'Accuracy': round(accuracy, 4),
        'Precision (macro)': round(report['macro avg']['precision'], 4),
        'Recall (macro)': round(report['macro avg']['recall'], 4),
        'F1-Score (macro)': round(report['macro avg']['f1-score'], 4)
    })
    
    # Check if this is the best model based on accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_model_pipeline = current_pipeline
        best_y_pred = y_pred


# 4. Compare Performance: Table 4
comparison_df = pd.DataFrame(results)
print("\n4. Model Performance Comparison (Table 4):")
print(comparison_df.to_markdown(index=False))


# --- Section 6.5: Evaluating Model Performance (Best Model) ---

print(f"\n5. Detailed Evaluation of the Best Model ({best_model_name})...")

# 1. Select Best Model (Dynamically selected based on highest accuracy)
# best_model_pipeline holds the best trained pipeline.

# 2. Classification Report
print("\nClassification Report (Per Category Breakdown):")
print(classification_report(y_test, best_y_pred, target_names=y.unique(), zero_division=0))


# 3. Confusion Matrix
cm = confusion_matrix(y_test, best_y_pred, labels=y.unique())
cm_df = pd.DataFrame(cm, index=y.unique(), columns=y.unique())

plt.figure(figsize=(10, 7))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', linewidths=.5, linecolor='black')
plt.title(f'Confusion Matrix for {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# 4. Model Persistence: Save the best model pipeline
MODEL_FILENAME = 'best_text_classifier_pipeline.pkl'
try:
    with open(MODEL_FILENAME, 'wb') as file:
        pickle.dump(best_model_pipeline, file)
    print(f"\nModel Saved Successfully: The best model ({best_model_name} Pipeline) was saved to {MODEL_FILENAME}")
except Exception as e:
    print(f"\nError saving model: {e}")

print("\nTask 5 Complete: Classification models ha# Cell 1: Check CUDA and Import Libraries")
print("=" * 60)
print("CUDA-Accelerated Text Classification")
print("=" * 60)

import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from IPython.display import display, HTML

# Check CUDA availability
try:
    import cupy as cp
    print(f"✓ CUDA Available: {cp.cuda.is_available()}")
    if cp.cuda.is_available():
        print(f"✓ GPU Device: {cp.cuda.Device().name.decode()}")
        print(f"✓ GPU Memory: {cp.cuda.Device().mem_info[1] / 1e9:.2f} GB total")
    CUDA_AVAILABLE = True
except ImportError:
    print("✗ CuPy not installed. Please install RAPIDS cuML.")
    print("  Install with: conda install -c rapidsai -c conda-forge -c nvidia cuml cupy")
    CUDA_AVAILABLE = False

print("\nImporting libraries...")

# Import cuML (RAPIDS) for GPU
if CUDA_AVAILABLE:
    from cuml.naive_bayes import MultinomialNB as cuMultinomialNB
    from cuml.linear_model import LogisticRegression as cuLogisticRegression
    from cuml.svm import LinearSVC as cuLinearSVC
    print("✓ cuML imported successfully")

# Import scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Download NLTK stopwords
try:
    nltk.data.find('corpora/stopwords')
    print("✓ NLTK stopwords available")
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

print("\n✓ All libraries imported successfully!")

# ============================================================================
# Cell 2: Load and Filter Data
print("\n" + "=" * 60)
print("LOADING AND FILTERING DATA")
print("=" * 60)

print("\n📥 Downloading dataset from Consumer Finance...")
try:
    df = pd.read_csv('https://files.consumerfinance.gov/ccdb/complaints.csv.zip')
    print(f"✓ Dataset loaded: {len(df):,} total complaints")
except Exception as e:
    print(f"✗ Error loading data: {e}")
    raise

# Filter columns
df = df[['Product', 'Consumer complaint narrative']].copy()
df.columns = ['Product', 'Narrative']

print(f"\n🔍 Filtering data...")
print(f"   Before filtering: {len(df):,} rows")

# Drop missing narratives
df.dropna(subset=['Narrative'], inplace=True)
print(f"   After dropping nulls: {len(df):,} rows")

# Define target categories
TARGET_CATEGORIES = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Consumer Loan',
    'Mortgage'
]

df = df[df['Product'].isin(TARGET_CATEGORIES)].copy()
print(f"   After category filter: {len(df):,} rows")

# Simplify category names
df['Product'] = df['Product'].replace({
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Report/Repair',
    'Debt collection': 'Debt Collection',
    'Consumer Loan': 'Consumer Loan',
    'Mortgage': 'Mortgage'
})

print("\n📊 Class Distribution:")
display(df['Product'].value_counts().to_frame('Count'))

# ============================================================================
# Cell 3: Text Preprocessing
print("\n" + "=" * 60)
print("TEXT PREPROCESSING")
print("=" * 60)

def preprocess_text(text):
    """Converts text to lowercase, removes punctuation/numbers, and stopwords."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

print("\n🔧 Cleaning text data...")
print("   Operations: lowercase → remove punctuation → remove stopwords")

# Show example before/after
sample_idx = 0
print(f"\n📝 Example transformation:")
print(f"   BEFORE: {df['Narrative'].iloc[sample_idx][:150]}...")
df['Clean_Narrative'] = df['Narrative'].apply(preprocess_text)
print(f"   AFTER:  {df['Clean_Narrative'].iloc[sample_idx][:150]}...")

print(f"\n✓ Preprocessing complete!")

# ============================================================================
# Cell 4: Prepare Data for GPU Training
print("\n" + "=" * 60)
print("PREPARING DATA FOR GPU TRAINING")
print("=" * 60)

X = df['Clean_Narrative']
y = df['Product']

# Split data
print("\n✂️  Splitting data (75% train, 25% test)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print(f"   Training samples: {len(X_train):,}")
print(f"   Testing samples:  {len(X_test):,}")

# TF-IDF Vectorization
print("\n🔤 Vectorizing text with TF-IDF...")
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(f"   Vocabulary size: {len(tfidf.vocabulary_):,} features")
print(f"   Train matrix shape: {X_train_tfidf.shape}")
print(f"   Test matrix shape: {X_test_tfidf.shape}")

# Convert to dense and transfer to GPU
print("\n🚀 Transferring data to GPU...")
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

if CUDA_AVAILABLE:
    X_train_gpu = cp.asarray(X_train_dense, dtype=cp.float32)
    X_test_gpu = cp.asarray(X_test_dense, dtype=cp.float32)
    
    gpu_memory_train = X_train_gpu.nbytes / 1e6
    gpu_memory_test = X_test_gpu.nbytes / 1e6
    print(f"   GPU memory used (train): {gpu_memory_train:.2f} MB")
    print(f"   GPU memory used (test):  {gpu_memory_test:.2f} MB")
    
    # Encode labels
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    
    y_train_gpu = cp.asarray(y_train_encoded, dtype=cp.int32)
    y_test_gpu = cp.asarray(y_test_encoded, dtype=cp.int32)
    
    print(f"   Classes: {list(le.classes_)}")
    print("\n✓ Data ready for GPU training!")
else:
    print("✗ CUDA not available, cannot proceed with GPU training")

# ============================================================================
# Cell 5: Train GPU-Accelerated Models
print("\n" + "=" * 60)
print("TRAINING GPU-ACCELERATED MODELS")
print("=" * 60)

if not CUDA_AVAILABLE:
    print("✗ Skipping training - CUDA not available")
else:
    # Define classifiers
    classifiers = {
        'MultinomialNB': cuMultinomialNB(),
        'LogisticRegression': cuLogisticRegression(max_iter=1000),
        'LinearSVC': cuLinearSVC(max_iter=1000)
    }
    
    results = []
    best_accuracy = -1
    trained_models = {}
    
    # Train each model
    for name, classifier in classifiers.items():
        print(f"\n🔄 Training {name} on GPU...")
        
        import time
        start_time = time.time()
        
        # Train
        classifier.fit(X_train_gpu, y_train_gpu)
        train_time = time.time() - start_time
        
        # Predict
        y_pred_gpu = classifier.predict(X_test_gpu)
        
        # Transfer back to CPU
        y_pred_cpu = cp.asnumpy(y_pred_gpu).astype(int)
        y_pred = le.inverse_transform(y_pred_cpu)
        
        # Evaluate
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)
        
        print(f"   ✓ Training time: {train_time:.2f}s")
        print(f"   ✓ Accuracy: {accuracy:.4f}")
        
        results.append({
            'Model': name,
            'Training Time (s)': round(train_time, 2),
            'Accuracy': round(accuracy, 4),
            'Precision (macro)': round(report['macro avg']['precision'], 4),
            'Recall (macro)': round(report['macro avg']['recall'], 4),
            'F1-Score (macro)': round(report['macro avg']['f1-score'], 4)
        })
        
        # Store model and predictions
        trained_models[name] = {
            'model': classifier,
            'predictions': y_pred
        }
        
        # Track best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = name
            best_model = classifier
            best_y_pred = y_pred
    
    # Display results
    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE COMPARISON")
    print("=" * 60)
    comparison_df = pd.DataFrame(results)
    display(comparison_df)
    
    print(f"\n🏆 Best Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")

# ============================================================================
# Cell 6: Detailed Evaluation of Best Model
if CUDA_AVAILABLE:
    print("\n" + "=" * 60)
    print(f"DETAILED EVALUATION: {best_model_name}")
    print("=" * 60)
    
    print("\n📋 Classification Report:")
    print(classification_report(y_test, best_y_pred, target_names=le.classes_, zero_division=0))
    
    # Confusion Matrix
    print("\n📊 Confusion Matrix:")
    cm = confusion_matrix(y_test, best_y_pred, labels=le.classes_)
    cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', linewidths=.5, linecolor='black')
    plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()
    plt.show()
    
    # Per-class accuracy
    print("\n📈 Per-Class Accuracy:")
    class_accuracies = []
    for i, class_name in enumerate(le.classes_):
        class_acc = cm[i, i] / cm[i, :].sum()
        class_accuracies.append({
            'Class': class_name,
            'Accuracy': f"{class_acc:.4f}",
            'Support': cm[i, :].sum()
        })
    display(pd.DataFrame(class_accuracies))

# ============================================================================
# Cell 7: Save Model
if CUDA_AVAILABLE:
    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)
    
    MODEL_FILENAME = 'best_cuda_classifier.pkl'
    
    try:
        model_package = {
            'model': best_model,
            'vectorizer': tfidf,
            'label_encoder': le,
            'model_name': best_model_name,
            'accuracy': best_accuracy,
            'classes': list(le.classes_)
        }
        
        with open(MODEL_FILENAME, 'wb') as file:
            pickle.dump(model_package, file)
        
        print(f"✓ Model saved successfully to '{MODEL_FILENAME}'")
        print(f"  Model: {best_model_name}")
        print(f"  Accuracy: {best_accuracy:.4f}")
        print(f"  Classes: {list(le.classes_)}")
        
        # Show how to load
        print("\n📖 To load the model later:")
        print("   with open('best_cuda_classifier.pkl', 'rb') as f:")
        print("       model_package = pickle.load(f)")
        
    except Exception as e:
        print(f"✗ Error saving model: {e}")

    print("\n" + "=" * 60)
    print("✓ ALL TASKS COMPLETE!")


SyntaxError: invalid syntax. Perhaps you forgot a comma? (3743925167.py, line 524)

In [7]:
# For CUDA 11.x
conda install -c rapidsai -c conda-forge -c nvidia \
    cuml cupy cudatoolkit=11.8

# OR for CUDA 12.x
conda install -c rapidsai -c conda-forge -c nvidia \
    cuml cupy cuda-version=12.0

SyntaxError: invalid syntax (676106818.py, line 2)