In [5]:
import pandas as pd
import numpy as np 

In [3]:
data = pd.read_csv('/data2/project/2025summer/jjh0709/Data/zkwgkjkjn9-2/Gas Sensors Measurements/Gas_Sensors_Measurements.csv')

In [11]:
random = np.random.randint(0,6400,1000)
print(len(random))

1000


In [10]:
data.iloc[random].to_csv('1000_data.csv')

In [None]:

import torch
from torch.utils.data import DataLoader
from src.config import DATA_DIR_SENSOR, DATA_DIR_THERMAL, BATCH_SIZE, DEVICE, TEST_CSV_PATH
from src.dataset import GasDataset
from src.GasDataSet import *
from src.transforms import transform
from src.models.multitask_fusion_model import MultitaskFusionModel
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, jaccard_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

MODEL_PATH = 'Multitask_fusion_model.pt'  

def plot_confusion_matrix(cm, class_names=None, save_path='confusion_matrix.png', title='Confusion Matrix'):
    """
    Plot and save confusion matrix with enhanced visualization
    """
    plt.figure(figsize=(12, 10))
    
    # Create heatmap with better formatting
    sns.heatmap(cm, 
                annot=True, 
                fmt='d', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Count'},
                square=True,
                linewidths=0.5)
    
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Confusion matrix saved to {save_path}")

def plot_normalized_confusion_matrix(cm, class_names=None, save_path='confusion_matrix_normalized.png'):
    """
    Plot normalized confusion matrix (percentages)
    """
    # Normalize confusion matrix
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)  # Handle division by zero
    
    plt.figure(figsize=(12, 10))
    
    # Create heatmap for normalized matrix
    sns.heatmap(cm_normalized, 
                annot=True, 
                fmt='.2%', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Percentage'},
                square=True,
                linewidths=0.5,
                vmin=0,
                vmax=1)
    
    plt.title('Normalized Confusion Matrix (Percentages)', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Normalized confusion matrix saved to {save_path}")

def plot_combined_confusion_matrix(cm, class_names=None, save_path='confusion_matrix_combined.png'):
    """
    Plot both count and percentage in the same heatmap
    """
    # Normalize confusion matrix for percentages
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)
    
    # Create annotations with both count and percentage
    annotations = []
    for i in range(len(cm)):
        row = []
        for j in range(len(cm[i])):
            count = cm[i, j]
            percentage = cm_normalized[i, j] * 100
            row.append(f'{count}\n({percentage:.1f}%)')
        annotations.append(row)
    
    plt.figure(figsize=(14, 12))
    
    # Create heatmap
    sns.heatmap(cm, 
                annot=np.array(annotations),
                fmt='', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Count'},
                square=True,
                linewidths=0.5)
    
    plt.title('Confusion Matrix (Count and Percentage)', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Combined confusion matrix saved to {save_path}")

def calculate_metrics(y_true, y_pred, num_classes):
    """
    Calculate various performance metrics
    """
    # Convert to numpy arrays if they're tensors
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()
    
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    jaccard = jaccard_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'jaccard_index': jaccard,
        'confusion_matrix': cm
    }

def print_metrics(metrics):
    """
    Print formatted metrics
    """
    print("\n" + "="*50)
    print("PERFORMANCE METRICS")
    print("="*50)
    print(f"Accuracy       : {metrics['accuracy']:.4f}")
    print(f"Precision      : {metrics['precision']:.4f}")
    print(f"Recall         : {metrics['recall']:.4f}")
    print(f"F1-Score       : {metrics['f1_score']:.4f}")
    print(f"Jaccard Index  : {metrics['jaccard_index']:.4f}")
    print("="*50)

def print_class_wise_metrics(y_true, y_pred, class_names=None):
    """
    Print class-wise metrics
    """
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()
    
    precision = precision_score(y_true, y_pred, average=None, zero_division=0)
    recall = recall_score(y_true, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=None, zero_division=0)
    
    print("\nCLASS-WISE METRICS")
    print("="*60)
    print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
    print("-"*60)
    
    for i in range(len(precision)):
        class_name = class_names[i] if class_names and i < len(class_names) else f"Class {i}"
        print(f"{class_name:<15} {precision[i]:<12.4f} {recall[i]:<12.4f} {f1[i]:<12.4f}")
    print("="*60)

def test():
    print("Loading dataset...")
    # Uncomment the appropriate dataset loader
    #test_dataset = GasDataset(DATA_DIR_THERMAL, DATA_DIR_SENSOR, transform=transform)
    test_dataset = GasDataSet(TEST_CSV_PATH, DATA_DIR_THERMAL, DATA_DIR_SENSOR, transform=transform)

    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    print("Loading model...")
    model = MultitaskFusionModel().to(DEVICE)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
    model.eval()

    # Lists to store all predictions and labels
    all_preds = []
    all_labels = []
    
    print("Testing model...")
    with torch.no_grad():
        for thermal, sensor, label in tqdm(test_loader, desc="Testing", unit="batch"):
            thermal, sensor, label = thermal.to(DEVICE), sensor.to(DEVICE), label.to(DEVICE)
            outputs = model(thermal, sensor)
            preds = outputs.argmax(dim=1)
            
            # Store predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    
    # Determine number of classes
    num_classes = len(np.unique(all_labels))
    
    # Calculate metrics
    metrics = calculate_metrics(all_labels, all_preds, num_classes)
    
    # Print overall metrics
    print_metrics(metrics)
    
    # Define class names (modify based on your dataset)
    class_names = [f"Gas_{i}" for i in range(num_classes)]
    # For actual gas types, use something like:
    # class_names = ['No Gas', 'Methane', 'Ethane', 'Propane', 'Butane', 'CO2']
    
    # Print class-wise metrics
    print_class_wise_metrics(all_labels, all_preds, class_names)
    
    # Create output directory
    output_dir = 'results'
    os.makedirs(output_dir, exist_ok=True)
    
    # Plot different versions of confusion matrix
    print("\nGenerating confusion matrix visualizations...")
    
    # 1. Standard confusion matrix (counts)
    plot_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_counts.png',
        title='Confusion Matrix (Counts)'
    )
    
    # 2. Normalized confusion matrix (percentages)
    plot_normalized_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_normalized.png'
    )
    
    # 3. Combined confusion matrix (counts and percentages)
    plot_combined_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_combined.png'
    )
    
    # Print confusion matrix in text format
    print("\nCONFUSION MATRIX")
    print("="*50)
    print("Rows: True Labels, Columns: Predicted Labels")
    print(metrics['confusion_matrix'])
    print("="*50)
    
    # Additional analysis
    print(f"\nTotal samples: {len(all_labels)}")
    print(f"Number of classes: {num_classes}")
    print(f"Class distribution:")
    unique, counts = np.unique(all_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        class_name = class_names[class_idx] if class_idx < len(class_names) else f"Class {class_idx}"
        print(f"  {class_name}: {count} ({count/len(all_labels)*100:.1f}%)")
    
    print(f"\nAll visualization files saved in '{output_dir}/' directory")

if __name__ == '__main__':
    # Set matplotlib backend for better compatibility
    import matplotlib
    matplotlib.use('Agg')  # Use non-interactive backend
    plt.ioff()  # Turn off interactive mode
    
    test()
    
    # Show plots at the end if needed
    plt.show()

In [1]:
import torch
from torch.utils.data import DataLoader
from src.config import DATA_DIR_SENSOR, DATA_DIR_THERMAL, BATCH_SIZE, DEVICE, TEST_CSV_PATH
from src.dataset import GasDataset
from src.GasDataSet import *
from src.transforms import transform
from src.models.multitask_fusion_model import MultitaskFusionModel
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

MODEL_PATH = 'Multitask_fusion_model.pt'  

def plot_confusion_matrix(cm, class_names=None, save_path='confusion_matrix.png', title='Confusion Matrix'):
    """
    Plot and save confusion matrix with enhanced visualization
    """
    plt.figure(figsize=(12, 10))
    
    # Create heatmap with better formatting
    sns.heatmap(cm, 
                annot=True, 
                fmt='d', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Count'},
                square=True,
                linewidths=0.5)
    
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Confusion matrix saved to {save_path}")

def plot_normalized_confusion_matrix(cm, class_names=None, save_path='confusion_matrix_normalized.png'):
    """
    Plot normalized confusion matrix (percentages)
    """
    # Normalize confusion matrix
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)  # Handle division by zero
    
    plt.figure(figsize=(12, 10))
    
    # Create heatmap for normalized matrix
    sns.heatmap(cm_normalized, 
                annot=True, 
                fmt='.2%', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Percentage'},
                square=True,
                linewidths=0.5,
                vmin=0,
                vmax=1)
    
    plt.title('Normalized Confusion Matrix (Percentages)', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Normalized confusion matrix saved to {save_path}")

def plot_combined_confusion_matrix(cm, class_names=None, save_path='confusion_matrix_combined.png'):
    """
    Plot both count and percentage in the same heatmap
    """
    # Normalize confusion matrix for percentages
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)
    
    # Create annotations with both count and percentage
    annotations = []
    for i in range(len(cm)):
        row = []
        for j in range(len(cm[i])):
            count = cm[i, j]
            percentage = cm_normalized[i, j] * 100
            row.append(f'{count}\n({percentage:.1f}%)')
        annotations.append(row)
    
    plt.figure(figsize=(14, 12))
    
    # Create heatmap
    sns.heatmap(cm, 
                annot=np.array(annotations),
                fmt='', 
                cmap='Blues', 
                xticklabels=class_names if class_names else range(len(cm)), 
                yticklabels=class_names if class_names else range(len(cm)),
                cbar_kws={'label': 'Count'},
                square=True,
                linewidths=0.5)
    
    plt.title('Confusion Matrix (Count and Percentage)', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Predicted Label', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=14, fontweight='bold')
    
    # Rotate x-axis labels if they're long
    if class_names and max(len(name) for name in class_names) > 8:
        plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    
    # Create directory if it doesn't exist
    save_dir = os.path.dirname(save_path) if os.path.dirname(save_path) else '.'
    os.makedirs(save_dir, exist_ok=True)
    
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Combined confusion matrix saved to {save_path}")

def calculate_metrics(y_true, y_pred, y_proba, num_classes):
    """
    Calculate various performance metrics including AUROC
    """
    # Convert to numpy arrays if they're tensors
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()
    if torch.is_tensor(y_proba):
        y_proba = y_proba.cpu().numpy()
    
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Calculate AUROC
    try:
        if num_classes == 2:
            # Binary classification
            auroc = roc_auc_score(y_true, y_proba[:, 1])
        else:
            # Multi-class classification (one-vs-rest)
            auroc = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
    except Exception as e:
        print(f"Warning: Could not calculate AUROC: {e}")
        auroc = 0.0
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auroc': auroc,
        'confusion_matrix': cm
    }

def print_metrics(metrics):
    """
    Print formatted metrics
    """
    print("\n" + "="*50)
    print("PERFORMANCE METRICS")
    print("="*50)
    print(f"Accuracy       : {metrics['accuracy']:.4f}")
    print(f"Precision      : {metrics['precision']:.4f}")
    print(f"Recall         : {metrics['recall']:.4f}")
    print(f"F1-Score       : {metrics['f1_score']:.4f}")
    print(f"AUROC          : {metrics['auroc']:.4f}")
    print("="*50)

def print_class_wise_metrics(y_true, y_pred, class_names=None):
    """
    Print class-wise metrics
    """
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()
    
    precision = precision_score(y_true, y_pred, average=None, zero_division=0)
    recall = recall_score(y_true, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=None, zero_division=0)
    
    print("\nCLASS-WISE METRICS")
    print("="*60)
    print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
    print("-"*60)
    
    for i in range(len(precision)):
        class_name = class_names[i] if class_names and i < len(class_names) else f"Class {i}"
        print(f"{class_name:<15} {precision[i]:<12.4f} {recall[i]:<12.4f} {f1[i]:<12.4f}")
    print("="*60)

def test():
    print("Loading dataset...")
    # Uncomment the appropriate dataset loader
    #test_dataset = GasDataset(DATA_DIR_THERMAL, DATA_DIR_SENSOR, transform=transform)
    test_dataset = GasDataSet(TEST_CSV_PATH, DATA_DIR_THERMAL, DATA_DIR_SENSOR, transform=transform)

    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    print("Loading model...")
    model = MultitaskFusionModel().to(DEVICE)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
    model.eval()

    # Lists to store all predictions, probabilities, and labels
    all_preds = []
    all_proba = []
    all_labels = []
    
    print("Testing model...")
    with torch.no_grad():
        for thermal, sensor, label in tqdm(test_loader, desc="Testing", unit="batch"):
            thermal, sensor, label = thermal.to(DEVICE), sensor.to(DEVICE), label.to(DEVICE)
            outputs = model(thermal, sensor)
            
            # Get predicted classes
            preds = outputs.argmax(dim=1)
            
            # Get probabilities using softmax
            proba = torch.softmax(outputs, dim=1)
            
            # Store predictions, probabilities, and labels
            all_preds.extend(preds.cpu().numpy())
            all_proba.append(proba.cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_proba = np.vstack(all_proba)
    all_labels = np.array(all_labels)
    
    # Determine number of classes
    num_classes = len(np.unique(all_labels))
    
    # Calculate metrics
    metrics = calculate_metrics(all_labels, all_preds, all_proba, num_classes)
    
    # Print overall metrics
    print_metrics(metrics)
    
    # Define class names (modify based on your dataset)
    class_names = [f"Gas_{i}" for i in range(num_classes)]
    # For actual gas types, use something like:
    # class_names = ['No Gas', 'Methane', 'Ethane', 'Propane', 'Butane', 'CO2']
    
    # Print class-wise metrics
    print_class_wise_metrics(all_labels, all_preds, class_names)
    
    # Create output directory
    output_dir = 'results'
    os.makedirs(output_dir, exist_ok=True)
    
    # Plot different versions of confusion matrix
    print("\nGenerating confusion matrix visualizations...")
    
    # 1. Standard confusion matrix (counts)
    plot_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_counts.png',
        title='Confusion Matrix (Counts)'
    )
    
    # 2. Normalized confusion matrix (percentages)
    plot_normalized_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_normalized.png'
    )
    
    # 3. Combined confusion matrix (counts and percentages)
    plot_combined_confusion_matrix(
        metrics['confusion_matrix'], 
        class_names, 
        save_path=f'{output_dir}/confusion_matrix_combined.png'
    )
    
    # Print confusion matrix in text format
    print("\nCONFUSION MATRIX")
    print("="*50)
    print("Rows: True Labels, Columns: Predicted Labels")
    print(metrics['confusion_matrix'])
    print("="*50)
    
    # Additional analysis
    print(f"\nTotal samples: {len(all_labels)}")
    print(f"Number of classes: {num_classes}")
    print(f"Class distribution:")
    unique, counts = np.unique(all_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        class_name = class_names[class_idx] if class_idx < len(class_names) else f"Class {class_idx}"
        print(f"  {class_name}: {count} ({count/len(all_labels)*100:.1f}%)")
    
    print(f"\nAll visualization files saved in '{output_dir}/' directory")

if __name__ == '__main__':
    # Set matplotlib backend for better compatibility
    import matplotlib
    matplotlib.use('Agg')  # Use non-interactive backend
    plt.ioff()  # Turn off interactive mode
    
    test()
    
    # Show plots at the end if needed
    plt.show()

Loading dataset...
Dataset loaded: 180 samples from /data2/project/2025summer/jjh0709/git/GasLeakage-MultiModal-MTF/data/TEST_DATA.csv
Loading model...
Testing model...


Testing: 100%|██████████| 23/23 [00:03<00:00,  7.35batch/s]



PERFORMANCE METRICS
Accuracy       : 0.9222
Precision      : 0.9392
Recall         : 0.9222
F1-Score       : 0.9219
AUROC          : 0.9989

CLASS-WISE METRICS
Class           Precision    Recall       F1-Score    
------------------------------------------------------------
Gas_0           1.0000       1.0000       1.0000      
Gas_1           0.7812       1.0000       0.8772      
Gas_2           1.0000       0.7667       0.8679      
Gas_3           1.0000       1.0000       1.0000      

Generating confusion matrix visualizations...
Confusion matrix saved to results/confusion_matrix_counts.png
Normalized confusion matrix saved to results/confusion_matrix_normalized.png
Combined confusion matrix saved to results/confusion_matrix_combined.png

CONFUSION MATRIX
Rows: True Labels, Columns: Predicted Labels
[[30  0  0  0]
 [ 0 50  0  0]
 [ 0 14 46  0]
 [ 0  0  0 40]]

Total samples: 180
Number of classes: 4
Class distribution:
  Gas_0: 30 (16.7%)
  Gas_1: 50 (27.8%)
  Gas_2: 60 (33.3%)

In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# ============================================
# 데이터 로드
# ============================================
train_data = pd.read_csv("open (2)//train.csv", index_col='ID')
test_data = pd.read_csv("open (2)//test.csv", index_col='ID')
sample_submission = pd.read_csv("open (2)//sample_submission.csv", index_col='ID')

print("데이터 로드 완료")
print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}\n")

# ============================================
# 특성 제거
# ============================================
drop_columns = [5,7,8,9,12,13,14,16,18,19,20,21,22,24,25,28,29,30,31,33,35,36,38,41,42,44,46,47,49,51]

for i in drop_columns:
    col_name = f"X_{i:02d}"
    train_data = train_data.drop(col_name, axis=1, errors='ignore')
    test_data = test_data.drop(col_name, axis=1, errors='ignore')

print(f"제거 후 Train shape: {train_data.shape}")
print(f"제거 후 Test shape: {test_data.shape}\n")

# ============================================
# X, Y 분리
# ============================================
X_train = train_data.drop('target', axis=1)
Y_train = train_data['target']

print("클래스 분포:")
print(Y_train.value_counts().sort_index(), "\n")

# ============================================
# PolynomialFeatures (degree=2 or 5 가능)
# ============================================
print("="*50)
print("PolynomialFeatures (degree=5)")
print("="*50)

poly = PolynomialFeatures(degree=5, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(test_data)

print(f"원본 특성 수: {X_train.shape[1]}")
print(f"Degree 5 특성 수: {X_train_poly.shape[1]}\n")

# ============================================
# 데이터 정규화
# ============================================
scaler = StandardScaler()
X_train_poly = scaler.fit_transform(X_train_poly)
X_test_poly = scaler.transform(X_test_poly)

# ============================================
# PyTorch Tensor로 변환 (GPU)
# ============================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 중인 디바이스: {device}\n")

X_train_t = torch.tensor(X_train_poly, dtype=torch.float32, device=device)
Y_train_t = torch.tensor(Y_train.values, dtype=torch.long, device=device)
X_test_t  = torch.tensor(X_test_poly, dtype=torch.float32, device=device)

# ============================================
# QDA 수식 구현
# ============================================
print("QDA 학습 중...")

classes = torch.unique(Y_train_t)
num_classes = len(classes)
num_features = X_train_t.shape[1]

means = []
covs = []
priors = []

for c in classes:
    Xc = X_train_t[Y_train_t == c]
    mean_c = Xc.mean(dim=0)
    means.append(mean_c)
    
    cov_c = torch.cov(Xc.T)
    covs.append(cov_c + 1e-6 * torch.eye(num_features, device=device))  # 안정성 위한 작은 값 추가
    
    priors.append(len(Xc) / len(X_train_t))

means = torch.stack(means)
covs = torch.stack(covs)
priors = torch.tensor(priors, device=device)

print("QDA 학습 완료!\n")

# ============================================
# 예측 함수 정의
# ============================================
def qda_predict(X):
    preds = []
    for x in X:
        scores = []
        for k in range(num_classes):
            diff = (x - means[k]).unsqueeze(0)
            inv_cov = torch.inverse(covs[k])
            term = -0.5 * torch.mm(torch.mm(diff, inv_cov), diff.T)
            score = term - 0.5 * torch.logdet(covs[k]) + torch.log(priors[k])
            scores.append(score)
        scores = torch.cat(scores).flatten()
        preds.append(torch.argmax(scores))
    return torch.stack(preds)

# ============================================
# Train 성과 평가
# ============================================
print("Train 데이터 예측 중...")
Y_train_pred_t = qda_predict(X_train_t)
Y_train_pred = Y_train_pred_t.cpu().numpy()

accuracy = accuracy_score(Y_train, Y_train_pred)
f1 = f1_score(Y_train, Y_train_pred, average='weighted')

print("="*50)
print("Train 성과")
print("="*50)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}\n")

# ============================================
# Test 데이터 예측
# ============================================
print("Test 데이터 예측 중...")
Y_test_pred_t = qda_predict(X_test_t)
predictions = Y_test_pred_t.cpu().numpy()

print("예측 완료!")
print(f"예측값 개수: {len(predictions)}\n")
print("예측 클래스 분포:")
print(pd.Series(predictions).value_counts().sort_index(), "\n")

# ============================================
# 제출 파일 저장
# ============================================
sample_submission['target'] = predictions.astype(int)
output_path = "open (2)//submission_qda_gpu.csv"
sample_submission.to_csv(output_path)

print("="*50)
print(f"✓ 완료! GPU QDA 결과 저장됨")
print(f"저장 경로: {output_path}")
print("="*50)


데이터 로드 완료
Train shape: (21693, 53)
Test shape: (15004, 52)

제거 후 Train shape: (21693, 23)
제거 후 Test shape: (15004, 22)

클래스 분포:
target
0     1033
1     1033
2     1033
3     1033
4     1033
5     1033
6     1033
7     1033
8     1033
9     1033
10    1033
11    1033
12    1033
13    1033
14    1033
15    1033
16    1033
17    1033
18    1033
19    1033
20    1033
Name: count, dtype: int64 

PolynomialFeatures (degree=5)
원본 특성 수: 22
Degree 5 특성 수: 2299

사용 중인 디바이스: cuda

QDA 학습 중...
QDA 학습 완료!

Train 데이터 예측 중...


KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
# ⭐️ 추가된 라이브러리: DataLoader와 TensorDataset
from torch.utils.data import TensorDataset, DataLoader

# ============================================
# 데이터 로드 및 전처리 (기존과 동일)
# ============================================
train_data = pd.read_csv("open (2)//train.csv", index_col='ID')
test_data = pd.read_csv("open (2)//test.csv", index_col='ID')
sample_submission = pd.read_csv("open (2)//sample_submission.csv", index_col='ID')

drop_columns = [25,33,36,38]
for i in drop_columns:
    col_name = f"X_{i:02d}"
    train_data = train_data.drop(col_name, axis=1, errors='ignore')
    test_data = test_data.drop(col_name, axis=1, errors='ignore')

X_train = train_data.drop('target', axis=1)
Y_train = train_data['target']

poly = PolynomialFeatures(degree=4, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(test_data)
num_features = X_train_poly.shape[1] 

scaler = StandardScaler()
X_train_poly = scaler.fit_transform(X_train_poly)
X_test_poly = scaler.transform(X_test_poly)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train_t = torch.tensor(X_train_poly, dtype=torch.float32, device=device)
Y_train_t = torch.tensor(Y_train.values, dtype=torch.long, device=device)
X_test_t  = torch.tensor(X_test_poly, dtype=torch.float32, device=device)

# ============================================
# Naive QDA (대각 공분산) 학습 (기존과 동일)
# ============================================
classes = torch.unique(Y_train_t)
num_classes = len(classes)

means, vars_, priors = [], [], []

for c in classes:
    Xc = X_train_t[Y_train_t == c]
    means.append(Xc.mean(dim=0))
    vars_.append(Xc.var(dim=0) + 1e-6)
    priors.append(len(Xc) / len(X_train_t))

means = torch.stack(means) # [K, D]
vars_ = torch.stack(vars_) # [K, D]
priors = torch.tensor(priors, device=device) # [K]

# 상수항을 미리 계산
inv_vars = 1.0 / vars_ # [K, D]
log_det_term = -0.5 * torch.sum(torch.log(vars_), dim=1) # [K]
log_prior_term = torch.log(priors) # [K]

print("Naive QDA 학습 완료! (모델 파라미터 저장)")

# ============================================
# ⭐️ 배치 예측 함수 정의 ⭐️
# ============================================
def batch_predict(X_tensor, batch_size=1024):
    """
    GPU OOM을 방지하기 위해 데이터를 배치 단위로 예측합니다.
    """
    # 1. DataLoader 설정
    dataset = TensorDataset(X_tensor)
    # 메모리 상황에 따라 batch_size를 더 줄일 수 있습니다 (예: 512, 256)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_preds = []
    
    # 2. 배치별 예측 수행
    for [X_batch] in loader:
        # X_batch.shape: [B, D]
        
        # [B, D] -> [B, 1, D] - [K, D] -> [B, K, D]
        diff = (X_batch.unsqueeze(1) - means) 
        
        # Mahalanobis distance (요소별 연산)
        # [B, K, D] * [K, D] -> sum over D -> [B, K]
        mahalanobis_term = -0.5 * torch.sum(diff * diff * inv_vars, dim=2) 
        
        # 최종 점수 (로그 사후 확률에 비례)
        # [B, K] + [K] + [K] -> [B, K]
        scores = mahalanobis_term + log_det_term + log_prior_term
        
        preds = torch.argmax(scores, dim=1)
        all_preds.append(preds)
        
    # 3. 모든 배치 결과 취합
    return torch.cat(all_preds)

# ============================================
# Train 성과 평가 (배치 적용)
# ============================================
print("\nTrain 데이터 배치 예측 중...")
# ⭐️ 배치 예측 함수 사용 ⭐️
Y_train_pred_t = batch_predict(X_train_t, batch_size=1024) 
Y_train_pred = Y_train_pred_t.cpu().numpy()

accuracy = accuracy_score(Y_train, Y_train_pred)
f1 = f1_score(Y_train, Y_train_pred, average='weighted')

print("="*50)
print("Train 성과 (Naive QDA, 배치 적용)")
print("="*50)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}\n")

# ============================================
# Test 데이터 예측 (배치 적용)
# ============================================
print("Test 데이터 배치 예측 중...")
# ⭐️ 배치 예측 함수 사용 ⭐️
Y_test_pred_t = batch_predict(X_test_t, batch_size=1024)
predictions = Y_test_pred_t.cpu().numpy()

print("예측 완료!")

# ============================================
# 제출 파일 저장
# ============================================
sample_submission['target'] = predictions.astype(int)
output_path = "open (2)//submission_naive_qda_batch_gpu.csv"
sample_submission.to_csv(output_path)

print("="*50)
print(f"✓ 완료! GPU Naive QDA (배치) 결과 저장됨")
print(f"저장 경로: {output_path}")
print("="*50)

OutOfMemoryError: CUDA out of memory. Tried to allocate 15.13 GiB. GPU 0 has a total capacity of 23.59 GiB of which 464.88 MiB is free. Process 2040555 has 990.00 MiB memory in use. Including non-PyTorch memory, this process has 22.13 GiB memory in use. Of the allocated memory 21.88 GiB is allocated by PyTorch, and 1.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
import gc # 가비지 컬렉터 import

# 이전 작업에서 생성된 불필요한 변수 삭제 및 메모리 확보
del X_train_poly # NumPy 배열은 더 이상 GPU에 필요하지 않으므로 삭제해도 무방
gc.collect() 
torch.cuda.empty_cache() # ⭐️ 가장 중요한 코드: PyTorch의 GPU 캐시 비우기

In [None]:
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile

model = torch.jit.load(
    "data2/project/2025summer/mym470/Gas-Detection-and-Identification-Using-Multimodal/outputs/model.pt",
    map_location="cpu"
)
model.eval()

optimized = optimize_for_mobile(model)

optimized._save_for_lite_interpreter("data2/project/2025summer/mym470/Gas-Detection-and-Identification-Using-Multimodal/outputs/model.ptl")


  model = torch.load("data2/project/2025summer/mym470/Gas-Detection-and-Identification-Using-Multimodal/outputs/model.pt", map_location="cpu")


FileNotFoundError: [Errno 2] No such file or directory: 'data2/project/2025summer/mym470/Gas-Detection-and-Identification-Using-Multimodal/outputs/model.pt'