In [2]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [3]:
class DataPreprocessor:
    def __init__(self):
        # Download required NLTK data
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
    
    def load_cord19_data(self, data_path):
        """
        Load CORD-19 dataset from the specified path
        """
        papers = []
        for filename in os.listdir(data_path):
            if filename.endswith('.json'):
                with open(os.path.join(data_path, filename), 'r') as f:
                    paper = json.load(f)
                    # Extract relevant fields
                    processed_paper = {
                        'paper_id': paper.get('paper_id', ''),
                        'title': paper.get('metadata', {}).get('title', ''),
                        'abstract': paper.get('abstract', [{}])[0].get('text', ''),
                        'body_text': ' '.join([section.get('text', '') 
                                             for section in paper.get('body_text', [])])
                    }
                    papers.append(processed_paper)
        
        return pd.DataFrame(papers)
    
    def clean_text(self, text):
        """
        Clean and preprocess text data
        """
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Tokenization
        tokens = word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) 
                 for token in tokens 
                 if token not in self.stop_words]
        
        return ' '.join(tokens)
    
    def prepare_dataset(self, df):
        """
        Prepare the dataset for NLP tasks
        """
        # Clean title and abstract
        df['cleaned_title'] = df['title'].fillna('').apply(self.clean_text)
        df['cleaned_abstract'] = df['abstract'].fillna('').apply(self.clean_text)
        
        # Combine title and abstract for analysis
        df['text_for_analysis'] = df['cleaned_title'] + ' ' + df['cleaned_abstract']
        
        # Remove empty entries
        df = df[df['text_for_analysis'].str.strip() != '']
        
        return df


In [4]:
def main():
    # Initialize preprocessor
    preprocessor = DataPreprocessor()
    
    # Load and preprocess data
    # Replace 'path_to_cord19_data' with your actual data path

    df = pd.read_csv('metadata.csv', low_memory=False)
    
    # Prepare dataset
    processed_df = preprocessor.prepare_dataset(df)
    
    # Split data into train and test sets
    train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)
    
    # Save processed data
    train_df.to_csv('processed_train_data.csv', index=False)
    test_df.to_csv('processed_test_data.csv', index=False)
    
    print(f"Processed {len(processed_df)} papers")
    print(f"Training set size: {len(train_df)}")
    print(f"Test set size: {len(test_df)}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /Users/ihack-pc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ihack-
[nltk_data]     pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ihack-
[nltk_data]     pc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed 1056175 papers
Training set size: 844940
Test set size: 211235


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Dict, Any
import copy

In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, vectorizer=None):
        self.texts = texts
        if vectorizer is None:
            self.vectorizer = TfidfVectorizer(max_features=10000)
            self.features = self.vectorizer.fit_transform(texts)
        else:
            self.vectorizer = vectorizer
            self.features = self.vectorizer.transform(texts)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return torch.FloatTensor(self.features[idx].toarray()).squeeze()

In [7]:
class SimpleTextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, output_dim=2):
        super(SimpleTextClassifier, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.layer3 = nn.Linear(hidden_dim // 2, output_dim)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.dropout(x)
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        return self.layer3(x)

In [8]:
class PrivacyEngine:
    def __init__(self, epsilon=1.0, delta=1e-5):
        self.epsilon = epsilon  # Privacy budget
        self.delta = delta     # Privacy delta
        
    def add_noise(self, gradients: torch.Tensor) -> torch.Tensor:
        """Add Gaussian noise for differential privacy"""
        sensitivity = 1.0  # Assuming normalized gradients
        noise_scale = np.sqrt(2 * np.log(1.25/self.delta)) / self.epsilon
        noise = torch.normal(0, sensitivity * noise_scale, gradients.shape)
        return gradients + noise.to(gradients.device)

In [9]:
class FederatedClient:
    def __init__(self, model, dataset, client_id, learning_rate=0.001):
        self.client_id = client_id
        self.model = copy.deepcopy(model)
        self.dataset = dataset
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.privacy_engine = PrivacyEngine()
        
    def train(self, epochs=1, batch_size=32):
        """Train the model on local data"""
        dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)
        self.model.train()
        
        for epoch in range(epochs):
            total_loss = 0
            for batch in dataloader:
                self.optimizer.zero_grad()
                outputs = self.model(batch)
                # Note: This is a placeholder loss for demonstration
                # You'll need to modify this based on your specific task
                loss = F.mse_loss(outputs, torch.zeros_like(outputs))  
                loss.backward()
                
                # Apply differential privacy
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad = self.privacy_engine.add_noise(param.grad)
                
                self.optimizer.step()
                total_loss += loss.item()
                
        return total_loss / len(dataloader)
    
    def get_model_updates(self):
        """Get model parameters for aggregation"""
        return {name: param.data.clone() 
                for name, param in self.model.named_parameters()}

In [10]:
class FederatedServer:
    def __init__(self, model, num_clients):
        self.global_model = model
        self.num_clients = num_clients
        
    def aggregate_models(self, client_updates: List[Dict[str, torch.Tensor]]):
        """Aggregate model updates from clients using FedAvg"""
        aggregated_params = {}
        for name, param in self.global_model.named_parameters():
            # Initialize with zeros
            aggregated_params[name] = torch.zeros_like(param.data)
            
            # Sum up client parameters
            for client_update in client_updates:
                aggregated_params[name] += client_update[name]
            
            # Average the parameters
            aggregated_params[name] = aggregated_params[name] / len(client_updates)
            
            # Update global model
            param.data = aggregated_params[name]
        
    def distribute_model(self):
        """Get the current global model for distribution to clients"""
        return copy.deepcopy(self.global_model)

In [11]:
def create_federated_environment(processed_df, num_clients=5):
    """Set up the federated learning environment"""
    # Create the global model
    vectorizer = TfidfVectorizer(max_features=10000)
    vectorizer.fit(processed_df['text_for_analysis'])
    input_dim = len(vectorizer.get_feature_names_out())
    global_model = SimpleTextClassifier(input_dim=input_dim)
    
    # Split data among clients
    client_data_indices = np.array_split(range(len(processed_df)), num_clients)
    
    # Create clients
    clients = []
    for i, indices in enumerate(client_data_indices):
        client_texts = processed_df['text_for_analysis'].iloc[indices].values
        client_dataset = TextDataset(client_texts, vectorizer)
        client = FederatedClient(global_model, client_dataset, f"Client_{i}")
        clients.append(client)
    
    # Create server
    server = FederatedServer(global_model, num_clients)
    
    return server, clients, vectorizer

In [12]:
def train_federated_model(server, clients, num_rounds=10):
    """Train the model using federated learning"""
    for round_num in range(num_rounds):
        print(f"Federated Learning Round {round_num + 1}/{num_rounds}")
        
        # Train each client locally
        client_updates = []
        for client in clients:
            loss = client.train(epochs=1)
            print(f"Client {client.client_id} - Loss: {loss:.4f}")
            client_updates.append(client.get_model_updates())
        
        # Aggregate updates on the server
        server.aggregate_models(client_updates)
        
        # Distribute updated model to clients
        updated_model = server.distribute_model()
        for client in clients:
            client.model = copy.deepcopy(updated_model)

In [13]:
def main():
    # Load your preprocessed data
    processed_df = pd.read_csv('processed_train_data.csv')
    
    # Create federated learning environment
    server, clients, vectorizer = create_federated_environment(processed_df)
    
    # Train the federated model
    train_federated_model(server, clients)
    
    # Save the final model and vectorizer
    torch.save(server.global_model.state_dict(), 'federated_model.pth')
    import pickle
    with open('vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

if __name__ == "__main__":
    main()

  processed_df = pd.read_csv('processed_train_data.csv')


Federated Learning Round 1/10
Client Client_0 - Loss: 0.0072
Client Client_1 - Loss: 0.0044
Client Client_2 - Loss: 0.0060
Client Client_3 - Loss: 0.0073
Client Client_4 - Loss: 0.0041
Federated Learning Round 2/10
Client Client_0 - Loss: 0.0005
Client Client_1 - Loss: 0.0005
Client Client_2 - Loss: 0.0005
Client Client_3 - Loss: 0.0005
Client Client_4 - Loss: 0.0005
Federated Learning Round 3/10
Client Client_0 - Loss: 0.0005
Client Client_1 - Loss: 0.0005
Client Client_2 - Loss: 0.0005
Client Client_3 - Loss: 0.0005
Client Client_4 - Loss: 0.0005
Federated Learning Round 4/10
Client Client_0 - Loss: 0.0005
Client Client_1 - Loss: 0.0005
Client Client_2 - Loss: 0.0005
Client Client_3 - Loss: 0.0005
Client Client_4 - Loss: 0.0005
Federated Learning Round 5/10
Client Client_0 - Loss: 0.0005
Client Client_1 - Loss: 0.0005
Client Client_2 - Loss: 0.0005
Client Client_3 - Loss: 0.0005
Client Client_4 - Loss: 0.0005
Federated Learning Round 6/10
Client Client_0 - Loss: 0.0005
Client Client_

In [22]:
def main():
    try:
        # Load preprocessed data
        processed_df = pd.read_csv('processed_train_data.csv', low_memory=False)
        test_df = pd.read_csv('processed_test_data.csv', low_memory=False)
        
        print("Data loaded successfully")
        print(f"Training samples: {len(processed_df)}")
        print(f"Test samples: {len(test_df)}")
        
        # Initialize and fit vectorizer on training data
        vectorizer = TfidfVectorizer(max_features=10000)
        vectorizer.fit(processed_df['text_for_analysis'].values)  # Fit vectorizer first
        
        print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
        
        # Create datasets
        train_dataset = TextDataset(processed_df['text_for_analysis'].values, vectorizer)
        test_dataset = TextDataset(test_df['text_for_analysis'].values, vectorizer)
        
        # Initialize model
        input_dim = len(vectorizer.get_feature_names_out())
        model = SimpleTextClassifier(input_dim=input_dim)
        
        print(f"\nModel initialized with input dimension: {input_dim}")
        
        # Evaluate model
        metrics, privacy_metrics = evaluate_model_performance(model, test_dataset, vectorizer)
        
        # Save results
        results = {
            'metrics': metrics,
            'privacy_metrics': privacy_metrics,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model_config': {
                'input_dim': input_dim,
                'vocabulary_size': len(vectorizer.get_feature_names_out())
            }
        }
        
        # Create results directory
        results_dir = f'evaluation_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
        os.makedirs(results_dir, exist_ok=True)
        
        # Save results
        with open(f'{results_dir}/evaluation_results.json', 'w') as f:
            json.dump(results, f, indent=4)
        
        # Save visualization
        metrics['confusion_matrix_path'] = f'{results_dir}/confusion_matrix.png'
        plot_confusion_matrix(metrics, save_path=metrics['confusion_matrix_path'])
        
        print(f"\nEvaluation completed. Results saved in '{results_dir}'")
        print("\nKey metrics:")
        print(f"Average Loss: {metrics['average_loss']:.4f}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Privacy Loss: {privacy_metrics['privacy_loss']:.4f}")
        
        return results, metrics, privacy_metrics
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        logging.error(f"Error in main execution: {str(e)}", exc_info=True)
        raise

def plot_confusion_matrix(metrics: Dict, save_path: str):
    """Helper function to plot confusion matrix"""
    plt.figure(figsize=(10, 8))
    cm = np.array(metrics['confusion_matrix'])
    sns.heatmap(cm, 
               annot=True, 
               fmt='d', 
               cmap='Blues',
               xticklabels=['Class 0', 'Class 1'],
               yticklabels=['Class 0', 'Class 1'])
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(save_path)
    plt.close()

class TextDataset(Dataset):
    def __init__(self, texts, vectorizer):
        """
        Initialize dataset with texts and fitted vectorizer
        """
        self.texts = texts
        self.vectorizer = vectorizer
        self.features = self.vectorizer.transform(texts)
        print(f"Dataset created with {len(texts)} samples")
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return torch.FloatTensor(self.features[idx].toarray()).squeeze()

def evaluate_model_performance(model, test_data, vectorizer):
    """
    Helper function to evaluate model performance
    """
    evaluator = ModelEvaluator(model, vectorizer)
    metrics = evaluator.evaluate_model(test_data)
    privacy_metrics = evaluator.evaluate_privacy(epsilon=1.0)
    
    return metrics, privacy_metrics

if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    
    try:
        results, metrics, privacy_metrics = main()
    except Exception as e:
        logging.error("Evaluation failed with error:", exc_info=True)
        raise

Data loaded successfully
Training samples: 844940
Test samples: 211235
Vocabulary size: 10000
Dataset created with 844940 samples
Dataset created with 211235 samples

Model initialized with input dimension: 10000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluation completed. Results saved in 'evaluation_results_20241110_125550'

Key metrics:
Average Loss: 0.0052
Accuracy: 0.0000
Privacy Loss: 0.6321


In [26]:
def explore_and_prepare_data():
    """
    Explore the dataset and prepare labels for training
    """
    try:
        # Load data
        processed_df = pd.read_csv('processed_train_data.csv', low_memory=False)
        
        print("Available columns in the dataset:")
        print(processed_df.columns.tolist())
        
        # Create labels based on the data
        # For demonstration, let's create binary labels based on text length
        # You should modify this based on your specific needs
        processed_df['text_length'] = processed_df['text_for_analysis'].str.len()
        median_length = processed_df['text_length'].median()
        processed_df['label'] = (processed_df['text_length'] > median_length).astype(int)
        
        print("\nLabel distribution:")
        print(processed_df['label'].value_counts(normalize=True))
        
        return processed_df
        
    except Exception as e:
        print(f"Error in data preparation: {str(e)}")
        raise

def main():
    try:
        # Explore and prepare data
        processed_df = explore_and_prepare_data()
        
        # Initialize trainer with the correct label column
        trainer = FederatedTrainer(
            processed_df=processed_df,
            label_column='label',  # Now using the label column we created
            n_clients=5
        )
        
        # Train model
        model, metrics = trainer.train_federated_model(
            n_rounds=10,
            local_epochs=2,
            batch_size=32
        )
        
        print("\nTraining completed!")
        print(f"Final Validation Accuracy: {metrics['val_accuracies'][-1]:.4f}")
        
        return trainer, model, metrics
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        logging.error(f"Error in main execution: {str(e)}", exc_info=True)
        raise

if __name__ == "__main__":
    # First, let's explore the data
    processed_df = explore_and_prepare_data()
    print("\nData exploration completed. Review the columns above and press Enter to continue with training.")
    
    # Wait for user input
    input()
    
    # Proceed with training
    trainer, model, metrics = main()

Available columns in the dataset:
['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id', 'cleaned_title', 'cleaned_abstract', 'text_for_analysis']

Label distribution:
label
0    0.500168
1    0.499832
Name: proportion, dtype: float64

Data exploration completed. Review the columns above and press Enter to continue with training.


 


Available columns in the dataset:
['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id', 'cleaned_title', 'cleaned_abstract', 'text_for_analysis']

Label distribution:
label
0    0.500168
1    0.499832
Name: proportion, dtype: float64

Federated Learning Round 1/10
Client 0 - Epoch 1/2 - Loss: 0.3315
Client 0 - Epoch 2/2 - Loss: 0.2403
Client 1 - Epoch 1/2 - Loss: 0.3313
Client 1 - Epoch 2/2 - Loss: 0.2437
Client 2 - Epoch 1/2 - Loss: 0.3326
Client 2 - Epoch 2/2 - Loss: 0.2459
Client 3 - Epoch 1/2 - Loss: 0.3315
Client 3 - Epoch 2/2 - Loss: 0.2445
Client 4 - Epoch 1/2 - Loss: 0.3332
Client 4 - Epoch 2/2 - Loss: 0.2409
Round 1 - Validation Loss: 0.3809, Accuracy: 0.8210

Federated Learning Round 2/10
Client 0 - Epoch 1/2 - Loss: 0.2725
Client 0 - Epoch 2/2 - Loss: 0.1584
Client 1 - Epoch 1/2 - Loss: 0.2732
Client 1 - Ep

In [27]:
def explore_dataset():
    """
    Explore the CORD-19 dataset and show relevant information
    """
    try:
        # Load data
        print("Loading dataset...")
        processed_df = pd.read_csv('processed_train_data.csv', low_memory=False)
        
        print("\nDataset Overview:")
        print(f"Number of samples: {len(processed_df)}")
        print("\nAvailable columns:")
        for col in processed_df.columns:
            print(f"- {col}")
            
        # Look at some sample text
        print("\nSample text from the dataset:")
        print(processed_df['text_for_analysis'].iloc[0][:200], "...")
        
        return processed_df
    
    except Exception as e:
        print(f"Error during data exploration: {str(e)}")
        raise

def create_topic_labels(df):
    """
    Create labels based on text content/topics
    """
    # Define keywords for different topics
    topics = {
        'treatment': ['treatment', 'therapeutic', 'therapy', 'drug', 'medicine'],
        'diagnosis': ['diagnosis', 'testing', 'detection', 'screening'],
        'transmission': ['transmission', 'spread', 'infectious', 'contagious'],
        'symptoms': ['symptoms', 'clinical', 'manifestations', 'signs'],
        'prevention': ['prevention', 'protective', 'measures', 'precautions']
    }
    
    # Create label columns
    for topic, keywords in topics.items():
        df[f'is_{topic}'] = df['text_for_analysis'].str.contains('|'.join(keywords), case=False).astype(int)
    
    # Create a single label (you can modify this based on your needs)
    df['primary_topic'] = df[['is_' + topic for topic in topics.keys()]].idxmax(axis=1)
    df['primary_topic'] = df['primary_topic'].str.replace('is_', '')
    
    print("\nLabel Distribution:")
    print(df['primary_topic'].value_counts(normalize=True))
    
    return df

def analyze_and_prepare():
    """
    Analyze the dataset and prepare it for training
    """
    try:
        # Load and explore data
        df = explore_dataset()
        
        # Create labels
        df = create_topic_labels(df)
        
        print("\nData preparation completed!")
        print("\nYou can now proceed with training using 'primary_topic' as your label column.")
        
        return df
    
    except Exception as e:
        print(f"Error in analysis and preparation: {str(e)}")
        raise

if __name__ == "__main__":
    # Run exploration and preparation
    processed_df = analyze_and_prepare()
    
    print("\nDo you want to:")
    print("1. Proceed with training using these labels")
    print("2. Modify the labeling approach")
    print("3. Explore the data further")
    
    choice = input("Enter your choice (1-3): ")
    
    if choice == '1':
        trainer = FederatedTrainer(
            processed_df=processed_df,
            label_column='primary_topic',
            n_clients=5
        )
        
        model, metrics = trainer.train_federated_model(
            n_rounds=10,
            local_epochs=2,
            batch_size=32
        )
        
        print("\nTraining completed!")
        print(f"Final Validation Accuracy: {metrics['val_accuracies'][-1]:.4f}")
    
    elif choice == '2':
        print("\nPlease modify the create_topic_labels function to adjust the labeling approach.")
    
    else:
        print("\nExploring data further...")
        print("\nSample texts by topic:")
        for topic in processed_df['primary_topic'].unique():
            print(f"\n--- {topic.upper()} example ---")
            sample_text = processed_df[processed_df['primary_topic'] == topic]['text_for_analysis'].iloc[0][:200]
            print(sample_text, "...")

Loading dataset...

Dataset Overview:
Number of samples: 844940

Available columns:
- cord_uid
- sha
- source_x
- title
- doi
- pmcid
- pubmed_id
- license
- abstract
- publish_time
- authors
- journal
- mag_id
- who_covidence_id
- arxiv_id
- pdf_json_files
- pmc_json_files
- url
- s2_id
- cleaned_title
- cleaned_abstract
- text_for_analysis

Sample text from the dataset:
new european approval pemigatinib cholangiocarcinoma fgfr fusion rearrangement  ...

Label Distribution:
primary_topic
treatment       0.727563
diagnosis       0.097951
transmission    0.096879
symptoms        0.055981
prevention      0.021625
Name: proportion, dtype: float64

Data preparation completed!

You can now proceed with training using 'primary_topic' as your label column.

Do you want to:
1. Proceed with training using these labels
2. Modify the labeling approach
3. Explore the data further


Enter your choice (1-3):  s



Exploring data further...

Sample texts by topic:

--- TREATMENT example ---
new european approval pemigatinib cholangiocarcinoma fgfr fusion rearrangement  ...

--- TRANSMISSION example ---
optimizing spatiotemporal allocation covid vaccine italy case study sarscov vaccine distribution campaign underway across world community face challenge fair effective distribution limited supply wond ...

--- DIAGNOSIS example ---
sarscov testing north carolina racial ethnic geographic disparity sarscov testing data north carolina first three month state covid pandemic analyzed determine disparity among intersecting ax identity ...

--- PREVENTION example ---
implementation lung protective ventilation order improve adherence low tidal volume ventilation reaim evaluation purpose lung protective ventilation lpv defined tidal volume vt cckg predicted body wei ...

--- SYMPTOMS example ---
covid vaccine revisited oralmucosal vector system potential vaccine platform several emerging strategy vaccinati

In [31]:
def train_client(self, global_model, dataset, client_id, epochs, batch_size):
    """Train model on client data"""
    print(f"\nTraining Client {client_id + 1}")
    
    # Create client model as a copy of global model
    client_model = type(global_model)(
        input_dim=global_model.layer1.in_features,
        hidden_dim=global_model.layer2.in_features,
        output_dim=global_model.layer3.out_features
    ).to(self.device)
    client_model.load_state_dict(global_model.state_dict())
    
    optimizer = optim.Adam(client_model.parameters())
    criterion = nn.CrossEntropyLoss()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    client_model.train()
    total_loss = 0
    
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (features, labels) in enumerate(dataloader):
            features = features.to(self.device)
            labels = labels.to(self.device)
            
            optimizer.zero_grad()
            outputs = client_model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch + 1}/{epochs} - Batch {batch_idx}/{len(dataloader)} - Loss: {loss.item():.4f}")
        
        avg_epoch_loss = epoch_loss / len(dataloader)
        total_loss += avg_epoch_loss
        print(f"Client {client_id + 1} - Epoch {epoch + 1}/{epochs} - Average Loss: {avg_epoch_loss:.4f}")
    
    return client_model, total_loss / epochs

def evaluate_model(self, model, dataset, batch_size):
    """Evaluate model performance"""
    model.eval()
    criterion = nn.CrossEntropyLoss()
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(self.device)
            labels = labels.to(self.device)
            
            outputs = model(features)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    
    return avg_loss, accuracy


def plot_metrics(self, metrics_history):
    """Plot training metrics"""
    print("\nGenerating training metrics plots...")
    
    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot losses
    ax1.plot(metrics_history['train_losses'], 'b-', label='Training Loss')
    ax1.plot(metrics_history['val_losses'], 'r-', label='Validation Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.set_xlabel('Round')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)
    
    # Plot accuracy
    ax2.plot(metrics_history['val_accuracies'], 'g-', label='Validation Accuracy')
    ax2.set_title('Validation Accuracy')
    ax2.set_xlabel('Round')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid(True)
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig(f'{self.save_dir}/plots/training_metrics.png')
    plt.close()
    
    print(f"Plots saved to {self.save_dir}/plots/training_metrics.png")

def save_checkpoint(self, model, metrics, round_num):
    """Save training checkpoint"""
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'metrics': metrics,
        'round_num': round_num,
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    
    checkpoint_path = f'{self.save_dir}/checkpoints/round_{round_num}.pth'
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")

def save_final_results(self, model, metrics_history):
    """Save final model and results"""
    # Save model
    model_path = f'{self.save_dir}/final_model.pth'
    torch.save(model.state_dict(), model_path)
    print(f"Final model saved: {model_path}")
    
    # Save vectorizer
    vectorizer_path = f'{self.save_dir}/vectorizer.pkl'
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(self.vectorizer, f)
    print(f"Vectorizer saved: {vectorizer_path}")
    
    # Save label encoder
    encoder_path = f'{self.save_dir}/label_encoder.pkl'
    with open(encoder_path, 'wb') as f:
        pickle.dump(self.label_encoder, f)
    print(f"Label encoder saved: {encoder_path}")
    
    # Save metrics
    metrics_path = f'{self.save_dir}/training_metrics.json'
    with open(metrics_path, 'w') as f:
        # Convert numpy values to native Python types for JSON serialization
        processed_metrics = {
            key: [float(val) for val in values]
            for key, values in metrics_history.items()
        }
        json.dump(processed_metrics, f, indent=4)
    print(f"Training metrics saved: {metrics_path}")

# Add these methods to your FederatedTrainingPipeline class
FederatedTrainingPipeline.plot_metrics = plot_metrics
FederatedTrainingPipeline.save_checkpoint = save_checkpoint
FederatedTrainingPipeline.save_final_results = save_final_results

# Add these methods to your FederatedTrainingPipeline class
FederatedTrainingPipeline.train_client = train_client
FederatedTrainingPipeline.evaluate_model = evaluate_model

def main():
    try:
        print("Starting Federated Learning Pipeline...")
        
        # Initialize pipeline
        pipeline = FederatedTrainingPipeline()
        
        # Train model with smaller batches and more frequent updates
        model, metrics = pipeline.train_model(
            n_clients=3,  # Reduced number of clients for faster training
            n_rounds=5,   # Reduced number of rounds for testing
            local_epochs=2,
            batch_size=16  # Smaller batch size
        )
        
        print("\nTraining completed successfully!")
        print(f"Final Validation Accuracy: {metrics['val_accuracies'][-1]:.4f}")
        print(f"Results saved in: {pipeline.save_dir}")
        
        # Plot final results
        pipeline.plot_metrics(metrics)
        
        return pipeline, model, metrics
        
    except Exception as e:
        print(f"Pipeline failed with error: {str(e)}")
        raise


if __name__ == "__main__":
    try:
        pipeline, model, metrics = main()
    except Exception as e:
        print(f"Training failed with error: {str(e)}")
        raise

Starting Federated Learning Pipeline...
Using device: cpu
Loading and preparing data...
Training samples: 675952
Validation samples: 168988

Label distribution:
primary_topic
treatment       0.727563
diagnosis       0.097951
transmission    0.096879
symptoms        0.055981
prevention      0.021625
Name: proportion, dtype: float64
Input dimension: 10000
Number of classes: 5
Creating 3 client datasets...
Client 1 dataset size: 225317
Client 2 dataset size: 225317
Client 3 dataset size: 225318

Federated Learning Round 1/5

Training Client 1
Epoch 1/2 - Batch 0/14083 - Loss: 1.5812
Epoch 1/2 - Batch 10/14083 - Loss: 1.4707
Epoch 1/2 - Batch 20/14083 - Loss: 1.4151
Epoch 1/2 - Batch 30/14083 - Loss: 1.0210
Epoch 1/2 - Batch 40/14083 - Loss: 1.1690
Epoch 1/2 - Batch 50/14083 - Loss: 0.9366
Epoch 1/2 - Batch 60/14083 - Loss: 0.9495
Epoch 1/2 - Batch 70/14083 - Loss: 1.0558
Epoch 1/2 - Batch 80/14083 - Loss: 0.6626
Epoch 1/2 - Batch 90/14083 - Loss: 1.4277
Epoch 1/2 - Batch 100/14083 - Loss:

In [33]:
def find_latest_training_dir():
    """Find the most recent training directory"""
    training_dirs = [d for d in os.listdir('.') if d.startswith('federated_training_')]
    if not training_dirs:
        raise FileNotFoundError("No training directories found! Please run training first.")
    
    latest_dir = max(training_dirs)
    print(f"Found latest training directory: {latest_dir}")
    return latest_dir

def verify_model_files(training_dir):
    """Verify all required model files exist"""
    required_files = {
        'model': 'final_model.pth',
        'vectorizer': 'vectorizer.pkl',
        'label_encoder': 'label_encoder.pkl'
    }
    
    missing_files = []
    file_paths = {}
    
    for key, filename in required_files.items():
        file_path = os.path.join(training_dir, filename)
        if not os.path.exists(file_path):
            missing_files.append(filename)
        else:
            file_paths[key] = file_path
    
    if missing_files:
        raise FileNotFoundError(
            f"Missing required files in {training_dir}: {', '.join(missing_files)}"
        )
    
    return file_paths

def main():
    """Run the complete evaluation pipeline"""
    try:
        print("Looking for trained model files...")
        
        # Find latest training directory
        training_dir = find_latest_training_dir()
        
        # Verify and get file paths
        file_paths = verify_model_files(training_dir)
        
        print("\nFound all required model files:")
        for key, path in file_paths.items():
            print(f"{key}: {path}")
        
        # Load test data
        test_df = pd.read_csv('processed_test_data.csv', low_memory=False)
        print(f"\nLoaded test data: {len(test_df)} samples")
        
        # Initialize evaluator
        evaluator = ModelEvaluator(
            model_path=file_paths['model'],
            vectorizer_path=file_paths['vectorizer'],
            label_encoder_path=file_paths['label_encoder']
        )
        
        # Run evaluation
        metrics = evaluator.evaluate(test_df)
        
        print("\nEvaluation completed successfully!")
        print(f"Results saved in: {evaluator.results_dir}")
        
        return evaluator, metrics
        
    except FileNotFoundError as e:
        print(f"\nError: {str(e)}")
        print("\nPlease ensure you have:")
        print("1. Run the training pipeline first")
        print("2. Have all model files in the training directory")
        print("3. Have the test data file available")
        raise
    except Exception as e:
        print(f"\nEvaluation failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    try:
        # First, let's check what files and directories we have
        print("Current directory contents:")
        print(os.listdir('.'))
        
        # Run evaluation
        evaluator, metrics = main()
        
    except Exception as e:
        print(f"\nScript failed with error: {str(e)}")
        
        # Print some debugging information
        print("\nDebugging information:")
        print("1. Current working directory:", os.getcwd())
        print("2. Available directories:", [d for d in os.listdir('.') if os.path.isdir(d)])
        print("3. Available files:", [f for f in os.listdir('.') if os.path.isfile(f)])

Current directory contents:
['client_5_data.csv', 'client_4_data.csv', 'confusion_matrix.png', 'Privacy_Preserving_NLP_through_Federated_Learning_for_Medical_Text_Analysis_II.ipynb', 'model_evaluation_20241110_021242.log', 'federated_training_20241110_125850', 'document_parses', '123.ipynb', 'server.py', 'preprocessed_metadata.csv', 'best_params_20241108_140624.yaml', 'Kaggle', 'federated_model.pth', 'metadata.readme', 'Untitled.ipynb', 'Project.ipynb', 'cord_19_embeddings', 'parameter_importance_20241108_140624.png', 'Untitled123.ipynb', 'federated_training_20241110_135338', 'client.py', 'performance_report.json', 'evaluation_results_20241110_125550', 'client_2_data.csv', 'client_3_data.csv', 'json_schema.txt', '__pycache__', 'client_8_data.csv', 'client_9_data.csv', 'mlruns', 'federated_training_20241110_130148', 'model_evaluation_20241107_231538.log', 'processed_test_data.csv', 'federated_training_20241110_135855', 'federated_training_20241110_142938', 'utils.py', 'GoogleNews-vector

  self.model.load_state_dict(torch.load(model_path))


In [42]:
def calculate_metrics(self, true_labels, predictions, probabilities):
    """Calculate various performance metrics"""
    print("\nCalculating performance metrics...")
    
    try:
        # Basic classification metrics
        accuracy = np.mean(predictions == true_labels)
        class_report = classification_report(
            true_labels,
            predictions,
            target_names=self.label_encoder.classes_,
            output_dict=True
        )
        
        # Confusion matrix
        conf_matrix = confusion_matrix(true_labels, predictions)
        
        # Calculate precision, recall, and F1 for each class
        class_metrics = {}
        for i, class_name in enumerate(self.label_encoder.classes_):
            class_true = (true_labels == i)
            class_pred = (predictions == i)
            
            true_pos = np.sum(class_true & class_pred)
            false_pos = np.sum(~class_true & class_pred)
            false_neg = np.sum(class_true & ~class_pred)
            
            precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
            recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            
            class_metrics[class_name] = {
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'support': np.sum(class_true)
            }
        
        # Compile all metrics
        metrics = {
            'accuracy': float(accuracy),
            'class_report': class_report,
            'confusion_matrix': conf_matrix.tolist(),
            'class_metrics': class_metrics,
            'average_metrics': {
                'precision': np.mean([m['precision'] for m in class_metrics.values()]),
                'recall': np.mean([m['recall'] for m in class_metrics.values()]),
                'f1_score': np.mean([m['f1_score'] for m in class_metrics.values()])
            }
        }
        
        # Print summary metrics
        print("\nPerformance Summary:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Average Precision: {metrics['average_metrics']['precision']:.4f}")
        print(f"Average Recall: {metrics['average_metrics']['recall']:.4f}")
        print(f"Average F1-Score: {metrics['average_metrics']['f1_score']:.4f}")
        
        return metrics
        
    except Exception as e:
        print(f"Error calculating metrics: {str(e)}")
        raise

def generate_visualizations(self, true_labels, predictions, probabilities):
    """Generate visualization plots"""
    print("\nGenerating visualizations...")
    
    try:
        # 1. Confusion Matrix
        plt.figure(figsize=(12, 8))
        cm = confusion_matrix(true_labels, predictions)
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.label_encoder.classes_,
            yticklabels=self.label_encoder.classes_
        )
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'{self.results_dir}/plots/confusion_matrix.png')
        plt.close()
        
        # 2. Class-wise Performance
        class_metrics = {}
        for i, class_name in enumerate(self.label_encoder.classes_):
            class_true = (true_labels == i)
            class_pred = (predictions == i)
            
            true_pos = np.sum(class_true & class_pred)
            false_pos = np.sum(~class_true & class_pred)
            false_neg = np.sum(class_true & ~class_pred)
            
            precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
            recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            
            class_metrics[class_name] = {
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            }
        
        # Create class performance plot
        plt.figure(figsize=(12, 6))
        metrics_df = pd.DataFrame(class_metrics).T
        metrics_df.plot(kind='bar')
        plt.title('Class-wise Performance Metrics')
        plt.ylabel('Score')
        plt.xlabel('Class')
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'{self.results_dir}/plots/class_performance.png')
        plt.close()
        
        # 3. Prediction Probability Distribution
        plt.figure(figsize=(10, 6))
        for i, class_name in enumerate(self.label_encoder.classes_):
            class_probs = probabilities[:, i]
            plt.hist(class_probs, bins=30, alpha=0.5, label=class_name)
        plt.title('Prediction Probability Distribution')
        plt.xlabel('Probability')
        plt.ylabel('Count')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'{self.results_dir}/plots/probability_distribution.png')
        plt.close()
        
        print("Visualizations saved in plots directory")
        
    except Exception as e:
        print(f"Error generating visualizations: {str(e)}")
        raise

def save_results(self, metrics):
    """Save evaluation results"""
    try:
        # Save metrics as JSON
        metrics_path = f'{self.results_dir}/evaluation_metrics.json'
        with open(metrics_path, 'w') as f:
            # Convert numpy types to native Python types for JSON serialization
            metrics_json = json.dumps(metrics, default=lambda x: x.item() if hasattr(x, 'item') else x, indent=4)
            f.write(metrics_json)
        
        print(f"Results saved to: {metrics_path}")
        
    except Exception as e:
        print(f"Error saving results: {str(e)}")
        raise

# Add these methods to the ModelEvaluator class
ModelEvaluator.calculate_metrics = calculate_metrics
ModelEvaluator.generate_visualizations = generate_visualizations
ModelEvaluator.save_results = save_results

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import json
import pickle

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vectorizer):
        self.texts = texts
        self.labels = labels
        self.vectorizer = vectorizer
        self.features = self.vectorizer.transform(texts)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        feature = torch.FloatTensor(self.features[idx].toarray()).squeeze()
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label

# Model class
class SimpleTextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, output_dim=5):
        super(SimpleTextClassifier, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.layer3 = nn.Linear(hidden_dim // 2, output_dim)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.dropout(x)
        x = torch.relu(self.layer2(x))
        x = self.dropout(x)
        return self.layer3(x)

# Evaluator class
class ModelEvaluator:
    def __init__(self, model_path, vectorizer_path, label_encoder_path, device=None):
        """Initialize the model evaluator"""
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.setup_environment()
        self.load_model(model_path)
        self.load_preprocessing_tools(vectorizer_path, label_encoder_path)
    
    def setup_environment(self):
        """Setup evaluation environment"""
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.results_dir = f'evaluation_results_{self.timestamp}'
        os.makedirs(self.results_dir, exist_ok=True)
        os.makedirs(f'{self.results_dir}/plots', exist_ok=True)
        print(f"Results will be saved in: {self.results_dir}")
    
    def load_model(self, model_path):
        """Load the trained model"""
        try:
            print(f"Loading model from {model_path}")
            self.model = SimpleTextClassifier(
                input_dim=10000,
                hidden_dim=256,
                output_dim=5
            ).to(self.device)
            
            state_dict = torch.load(model_path)
            self.model.load_state_dict(state_dict)
            self.model.eval()
            print("Model loaded successfully")
            
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise
    
    def load_preprocessing_tools(self, vectorizer_path, label_encoder_path):
        """Load vectorizer and label encoder"""
        try:
            print(f"Loading vectorizer from {vectorizer_path}")
            with open(vectorizer_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
                
            print(f"Loading label encoder from {label_encoder_path}")
            with open(label_encoder_path, 'rb') as f:
                self.label_encoder = pickle.load(f)
                
            print("Preprocessing tools loaded successfully")
            
        except Exception as e:
            print(f"Error loading preprocessing tools: {str(e)}")
            raise

    def prepare_test_data(self, test_df):
        """Prepare test data with labels"""
        print("\nPreparing test data...")
        
        topics = {
            'treatment': ['treatment', 'therapeutic', 'therapy', 'drug', 'medicine'],
            'diagnosis': ['diagnosis', 'testing', 'detection', 'screening'],
            'transmission': ['transmission', 'spread', 'infectious', 'contagious'],
            'symptoms': ['symptoms', 'clinical', 'manifestations', 'signs'],
            'prevention': ['prevention', 'protective', 'measures', 'precautions']
        }
        
        for topic, keywords in topics.items():
            test_df[f'is_{topic}'] = test_df['text_for_analysis'].str.contains(
                '|'.join(keywords), case=False).astype(int)
        
        test_df['primary_topic'] = test_df[
            ['is_' + topic for topic in topics.keys()]].idxmax(axis=1)
        test_df['primary_topic'] = test_df['primary_topic'].str.replace('is_', '')
        
        test_df['encoded_labels'] = self.label_encoder.transform(test_df['primary_topic'])
        
        print(f"Prepared {len(test_df)} test samples")
        print("\nLabel distribution:")
        print(test_df['primary_topic'].value_counts(normalize=True))
        
        return test_df
    
    def calculate_metrics(self, true_labels, predictions, probabilities):
        """Calculate various performance metrics"""
        print("\nCalculating performance metrics...")
        
        try:
            accuracy = np.mean(predictions == true_labels)
            class_report = classification_report(
                true_labels,
                predictions,
                target_names=self.label_encoder.classes_,
                output_dict=True
            )
            
            conf_matrix = confusion_matrix(true_labels, predictions)
            
            class_metrics = {}
            for i, class_name in enumerate(self.label_encoder.classes_):
                class_true = (true_labels == i)
                class_pred = (predictions == i)
                
                true_pos = np.sum(class_true & class_pred)
                false_pos = np.sum(~class_true & class_pred)
                false_neg = np.sum(class_true & ~class_pred)
                
                precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
                recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
                
                class_metrics[class_name] = {
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'support': np.sum(class_true)
                }
            
            metrics = {
                'accuracy': float(accuracy),
                'class_report': class_report,
                'confusion_matrix': conf_matrix.tolist(),
                'class_metrics': class_metrics,
                'average_metrics': {
                    'precision': np.mean([m['precision'] for m in class_metrics.values()]),
                    'recall': np.mean([m['recall'] for m in class_metrics.values()]),
                    'f1_score': np.mean([m['f1_score'] for m in class_metrics.values()])
                }
            }
            
            print("\nPerformance Summary:")
            print(f"Accuracy: {metrics['accuracy']:.4f}")
            print(f"Average Precision: {metrics['average_metrics']['precision']:.4f}")
            print(f"Average Recall: {metrics['average_metrics']['recall']:.4f}")
            print(f"Average F1-Score: {metrics['average_metrics']['f1_score']:.4f}")
            
            return metrics
            
        except Exception as e:
            print(f"Error calculating metrics: {str(e)}")
            raise
    
    def generate_visualizations(self, true_labels, predictions, probabilities):
        """Generate visualization plots"""
        print("\nGenerating visualizations...")
        
        try:
            # Confusion Matrix
            plt.figure(figsize=(12, 8))
            cm = confusion_matrix(true_labels, predictions)
            sns.heatmap(
                cm,
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=self.label_encoder.classes_,
                yticklabels=self.label_encoder.classes_
            )
            plt.title('Confusion Matrix')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.tight_layout()
            plt.savefig(f'{self.results_dir}/plots/confusion_matrix.png')
            plt.close()
            
            # Class Performance
            class_metrics = pd.DataFrame({
                label: {
                    'Precision': metrics['precision'],
                    'Recall': metrics['recall'],
                    'F1-Score': metrics['f1_score']
                }
                for label, metrics in self.calculate_metrics(
                    true_labels, predictions, probabilities
                )['class_metrics'].items()
            }).T
            
            plt.figure(figsize=(12, 6))
            class_metrics.plot(kind='bar')
            plt.title('Class-wise Performance Metrics')
            plt.ylabel('Score')
            plt.xlabel('Class')
            plt.xticks(rotation=45)
            plt.legend()
            plt.tight_layout()
            plt.savefig(f'{self.results_dir}/plots/class_performance.png')
            plt.close()
            
            print("Visualizations saved in plots directory")
            
        except Exception as e:
            print(f"Error generating visualizations: {str(e)}")
            raise
    
    def evaluate(self, test_df, batch_size=32):
        """Evaluate model on test data"""
        try:
            print("\nStarting model evaluation...")
            
            test_df = self.prepare_test_data(test_df)
            
            test_dataset = TextDataset(
                texts=test_df['text_for_analysis'].values,
                labels=test_df['encoded_labels'].values,
                vectorizer=self.vectorizer
            )
            
            test_loader = DataLoader(
                test_dataset,
                batch_size=batch_size,
                shuffle=False
            )
            
            all_predictions = []
            all_labels = []
            all_probabilities = []
            
            print("\nRunning predictions...")
            with torch.no_grad():
                for features, labels in test_loader:
                    features = features.to(self.device)
                    outputs = self.model(features)
                    probabilities = torch.softmax(outputs, dim=1)
                    predictions = torch.argmax(outputs, dim=1)
                    
                    all_predictions.extend(predictions.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
                    all_probabilities.extend(probabilities.cpu().numpy())
            
            metrics = self.calculate_metrics(
                np.array(all_labels),
                np.array(all_predictions),
                np.array(all_probabilities)
            )
            
            self.generate_visualizations(
                np.array(all_labels),
                np.array(all_predictions),
                np.array(all_probabilities)
            )
            
            return metrics, test_df
            
        except Exception as e:
            print(f"Error during evaluation: {str(e)}")
            raise

def main():
    """Run the complete evaluation pipeline"""
    try:
        print("Starting evaluation pipeline...")
        
        # Load test data
        test_df = pd.read_csv('processed_test_data.csv', low_memory=False)
        print(f"Loaded {len(test_df)} test samples")
        
        # Get the latest training directory
        training_dirs = [d for d in os.listdir() if d.startswith('federated_training_')]
        latest_training_dir = max(training_dirs)
        print(f"Using training results from: {latest_training_dir}")
        
        # Initialize evaluator
        
        evaluator = ModelEvaluator(
            model_path='federated_training_20241110_142938/final_model.pth',
            vectorizer_path='federated_training_20241110_142938/vectorizer.pkl',
            label_encoder_path='federated_training_20241110_142938/label_encoder.pkl'
        )
        # Run evaluation
        print("\nRunning evaluation...")
        metrics, processed_test_df = evaluator.evaluate(test_df)
        
        print("\nEvaluation completed successfully!")
        print(f"Results saved in: {evaluator.results_dir}")
        
        return evaluator, metrics, processed_test_df
        
    except Exception as e:
        print(f"Evaluation failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    evaluator, metrics, processed_test_df = main()

Starting evaluation pipeline...
Loaded 211235 test samples
Using training results from: federated_training_20241110_142938
Results will be saved in: evaluation_results_20241110_161633
Loading model from federated_training_20241110_142938/final_model.pth
Model loaded successfully
Loading vectorizer from federated_training_20241110_142938/vectorizer.pkl
Loading label encoder from federated_training_20241110_142938/label_encoder.pkl
Preprocessing tools loaded successfully

Running evaluation...

Starting model evaluation...

Preparing test data...


  state_dict = torch.load(model_path)


Prepared 211235 test samples

Label distribution:
primary_topic
treatment       0.728691
diagnosis       0.098317
transmission    0.095600
symptoms        0.056113
prevention      0.021280
Name: proportion, dtype: float64

Running predictions...

Calculating performance metrics...

Performance Summary:
Accuracy: 0.9902
Average Precision: 0.9819
Average Recall: 0.9716
Average F1-Score: 0.9767

Generating visualizations...

Calculating performance metrics...

Performance Summary:
Accuracy: 0.9902
Average Precision: 0.9819
Average Recall: 0.9716
Average F1-Score: 0.9767
Visualizations saved in plots directory

Evaluation completed successfully!
Results saved in: evaluation_results_20241110_161633


<Figure size 1200x600 with 0 Axes>

In [52]:
def main():
    """Run complete pipeline"""
    try:
        # Step 1: Run evaluation and save results
        print("Step 1: Running evaluation and saving results...")
        evaluator, metrics, test_df, metrics_file = run_evaluation_and_save()
        
        # Get evaluation results directory
        results_dir = os.path.dirname(metrics_file)
        model_dir = evaluator.model_dir
        
        # Step 2: Run analysis
        print("\nStep 2: Running analysis...")
        analyzer = ModelAnalyzer(
            model_dir=model_dir,
            results_dir=results_dir
        )
        analysis_report = analyzer.generate_analysis_report()
        
        print("\nAnalysis Summary:")
        print(f"Overall Accuracy: {analysis_report['model_performance']['accuracy']:.4f}")
        
        if 'recommendations' in analysis_report:
            print("\nRecommendations:")
            for rec in analysis_report['recommendations']:
                print(f"- {rec['description']}")
        
        # Step 3: Deploy model
        print("\nStep 3: Initializing model deployment...")
        deployment = ModelDeployment(model_dir)
        
        print("\nStarting API server...")
        print("API will be available at http://localhost:5000/predict")
        deployment.run_server()
        
        return evaluator, analyzer, deployment
        
    except Exception as e:
        print(f"Pipeline failed with error: {str(e)}")
        raise

class ModelAnalyzer:
    def __init__(self, model_dir, results_dir):
        """Initialize the model analyzer"""
        self.model_dir = model_dir
        self.results_dir = results_dir
        self.metrics = self.load_results()
        self.label_encoder = self.load_label_encoder()
    
    def load_results(self):
        """Load evaluation results"""
        try:
            metrics_file = os.path.join(self.results_dir, 'evaluation_metrics.json')
            print(f"Loading metrics from: {metrics_file}")
            
            with open(metrics_file, 'r') as f:
                metrics = json.load(f)
            
            print("Evaluation results loaded successfully")
            return metrics
            
        except Exception as e:
            print(f"Error loading results: {str(e)}")
            raise
    
    def load_label_encoder(self):
        """Load label encoder"""
        try:
            encoder_file = os.path.join(self.model_dir, 'label_encoder.pkl')
            print(f"Loading label encoder from: {encoder_file}")
            
            with open(encoder_file, 'rb') as f:
                label_encoder = pickle.load(f)
            
            return label_encoder
            
        except Exception as e:
            print(f"Error loading label encoder: {str(e)}")
            raise
    
    def analyze_errors(self):
        """Analyze model errors and patterns"""
        error_analysis = {}
        
        # Get confusion matrix
        conf_matrix = np.array(self.metrics['confusion_matrix'])
        
        # Analyze each class
        for i, true_class in enumerate(self.label_encoder.classes_):
            misclassifications = {}
            total_samples = np.sum(conf_matrix[i, :])
            correct_predictions = conf_matrix[i, i]
            
            # Analyze misclassifications
            for j, pred_class in enumerate(self.label_encoder.classes_):
                if i != j and conf_matrix[i, j] > 0:
                    misclassifications[pred_class] = {
                        'count': int(conf_matrix[i, j]),
                        'percentage': float(conf_matrix[i, j] / total_samples * 100)
                    }
            
            error_analysis[true_class] = {
                'total_samples': int(total_samples),
                'correct_predictions': int(correct_predictions),
                'accuracy': float(correct_predictions / total_samples),
                'misclassifications': misclassifications
            }
        
        return error_analysis
    
    def generate_recommendations(self):
        """Generate improvement recommendations"""
        recommendations = []
        error_analysis = self.analyze_errors()
        
        # Check for class imbalance
        class_samples = [info['total_samples'] for info in error_analysis.values()]
        max_samples = max(class_samples)
        min_samples = min(class_samples)
        
        if max_samples / min_samples > 2:
            recommendations.append({
                'type': 'class_imbalance',
                'severity': 'high' if max_samples / min_samples > 5 else 'medium',
                'description': 'Significant class imbalance detected. Consider data augmentation or balanced sampling.'
            })
        
        # Check for poor performing classes
        for class_name, info in error_analysis.items():
            if info['accuracy'] < 0.7:
                recommendations.append({
                    'type': 'low_performance',
                    'class': class_name,
                    'accuracy': info['accuracy'],
                    'description': f'Low accuracy ({info["accuracy"]:.2%}) for class "{class_name}". Consider collecting more training data.'
                })
        
        # Check for consistent misclassifications
        for true_class, info in error_analysis.items():
            for pred_class, misclass_info in info['misclassifications'].items():
                if misclass_info['percentage'] > 20:
                    recommendations.append({
                        'type': 'confusion',
                        'classes': (true_class, pred_class),
                        'percentage': misclass_info['percentage'],
                        'description': f'High confusion ({misclass_info["percentage"]:.1f}%) between {true_class} and {pred_class}.'
                    })
        
        return recommendations
    
    def generate_analysis_report(self):
        """Generate comprehensive analysis report"""
        print("Generating analysis report...")
        
        error_analysis = self.analyze_errors()
        recommendations = self.generate_recommendations()
        
        report = {
            'model_performance': {
                'accuracy': self.metrics['accuracy'],
                'average_metrics': self.metrics['average_metrics'],
                'class_metrics': self.metrics['class_metrics']
            },
            'error_analysis': error_analysis,
            'recommendations': recommendations,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        # Save report
        report_file = os.path.join(self.results_dir, 'analysis_report.json')
        with open(report_file, 'w') as f:
            json.dump(report, f, indent=4)
        
        print(f"Analysis report saved to: {report_file}")
        return report

# Make sure these imports are at the top of your file
import os
import json
import pickle
import numpy as np
from datetime import datetime

In [57]:
class ModelAnalyzer:
    def __init__(self, model_dir, results_dir):
        """Initialize the model analyzer"""
        self.model_dir = model_dir
        self.results_dir = results_dir
        print(f"Initializing analyzer with:")
        print(f"- Model directory: {model_dir}")
        print(f"- Results directory: {results_dir}")
        
        # Load required data
        self.load_data()
    
    def load_data(self):
        """Load necessary data for analysis"""
        try:
            # Load metrics
            metrics_path = os.path.join(self.results_dir, 'evaluation_metrics.json')
            print(f"Loading metrics from: {metrics_path}")
            with open(metrics_path, 'r') as f:
                self.metrics = json.load(f)
            
            # Load label encoder
            encoder_path = os.path.join(self.model_dir, 'label_encoder.pkl')
            print(f"Loading label encoder from: {encoder_path}")
            with open(encoder_path, 'rb') as f:
                self.label_encoder = pickle.load(f)
            
            print("Data loaded successfully")
            
        except FileNotFoundError as e:
            raise RuntimeError(f"Required file not found: {str(e)}")
        except Exception as e:
            raise RuntimeError(f"Error loading data: {str(e)}")
    
    def generate_analysis_report(self):
        """Generate comprehensive analysis report"""
        try:
            print("\nGenerating analysis report...")
            
            # Basic performance metrics
            report = {
                'model_performance': {
                    'accuracy': self.metrics['accuracy'],
                    'average_metrics': self.metrics['average_metrics']
                },
                'class_performance': {}
            }
            
            # Analyze per-class performance
            for class_name in self.label_encoder.classes_:
                if class_name in self.metrics['class_metrics']:
                    class_metrics = self.metrics['class_metrics'][class_name]
                    report['class_performance'][class_name] = {
                        'precision': class_metrics['precision'],
                        'recall': class_metrics['recall'],
                        'f1_score': class_metrics['f1_score']
                    }
            
            # Generate recommendations
            report['recommendations'] = self.generate_recommendations()
            
            # Save report
            report_path = os.path.join(self.results_dir, 'analysis_report.json')
            with open(report_path, 'w') as f:
                json.dump(report, f, indent=4)
            
            print(f"Analysis report saved to: {report_path}")
            return report
            
        except Exception as e:
            print(f"Error generating analysis report: {str(e)}")
            raise
    
    def generate_recommendations(self):
        """Generate improvement recommendations"""
        recommendations = []
        
        # Check class balance
        class_counts = {
            class_name: metrics.get('support', 0)
            for class_name, metrics in self.metrics['class_metrics'].items()
        }
        
        if class_counts:
            max_count = max(class_counts.values())
            min_count = min(class_counts.values())
            
            if max_count / min_count > 2:
                recommendations.append({
                    'type': 'class_imbalance',
                    'description': 'Significant class imbalance detected. Consider data augmentation or balanced sampling.',
                    'ratio': max_count / min_count
                })
        
        # Check performance thresholds
        for class_name, metrics in self.metrics['class_metrics'].items():
            if metrics['f1_score'] < 0.7:
                recommendations.append({
                    'type': 'low_performance',
                    'class': class_name,
                    'description': f'Low F1-score ({metrics["f1_score"]:.2f}) for class {class_name}. Consider collecting more training data or feature engineering.'
                })
        
        return recommendations

In [59]:
def convert_to_serializable(obj):
    """Convert numpy types to Python native types"""
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj

def save_evaluation_metrics(evaluator, metrics, test_df):
    """Save evaluation metrics and results"""
    try:
        # Create results directory if it doesn't exist
        os.makedirs('evaluation_results', exist_ok=True)
        
        # Convert metrics to serializable format
        metrics_json = convert_to_serializable({
            'accuracy': metrics['accuracy'],
            'class_report': metrics['class_report'],
            'confusion_matrix': metrics['confusion_matrix'],
            'class_metrics': metrics['class_metrics'],
            'average_metrics': metrics['average_metrics']
        })
        
        # Save metrics to JSON
        metrics_file = 'evaluation_results/evaluation_metrics.json'
        with open(metrics_file, 'w') as f:
            json.dump(metrics_json, f, indent=4)
        
        print(f"Metrics saved to {metrics_file}")
        
        # Save processed test data
        test_data_file = 'evaluation_results/processed_test_data.csv'
        test_df.to_csv(test_data_file, index=False)
        print(f"Processed test data saved to {test_data_file}")
        
        return metrics_file
        
    except Exception as e:
        print(f"Error saving metrics: {str(e)}")
        raise

def run_evaluation_and_save():
    """Run evaluation and save results"""
    try:
        print("Starting evaluation pipeline...")
        
        # Load test data
        test_df = pd.read_csv('processed_test_data.csv', low_memory=False)
        print(f"Loaded {len(test_df)} test samples")
        
        # Get the latest training directory
        training_dirs = [d for d in os.listdir() if d.startswith('federated_training_')]
        latest_training_dir = max(training_dirs)
        print(f"Using training results from: {latest_training_dir}")
        
        # Initialize evaluator
        evaluator = ModelEvaluator(
            model_path=f'{latest_training_dir}/final_model.pth',
            vectorizer_path=f'{latest_training_dir}/vectorizer.pkl',
            label_encoder_path=f'{latest_training_dir}/label_encoder.pkl'
        )
        
        # Run evaluation
        print("\nRunning evaluation...")
        metrics, processed_test_df = evaluator.evaluate(test_df)
        
        # Save results
        metrics_file = save_evaluation_metrics(evaluator, metrics, processed_test_df)
        
        print("\nEvaluation and saving completed successfully!")
        
        # Print summary metrics
        print("\nEvaluation Summary:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print("\nClass-wise Performance:")
        for class_name, class_metrics in metrics['class_metrics'].items():
            print(f"\n{class_name}:")
            print(f"Precision: {class_metrics['precision']:.4f}")
            print(f"Recall: {class_metrics['recall']:.4f}")
            print(f"F1-score: {class_metrics['f1_score']:.4f}")
        
        return evaluator, metrics, processed_test_df, metrics_file
        
    except Exception as e:
        print(f"Evaluation failed with error: {str(e)}")
        raise

def main():
    """Run complete pipeline"""
    try:
        # Run evaluation and save results
        print("Step 1: Running evaluation and saving results...")
        evaluator, metrics, test_df, metrics_file = run_evaluation_and_save()
        
        # Initialize analyzer
        print("\nStep 2: Running analysis...")
        analyzer = ModelAnalyzer('evaluation_results')
        analysis_report = analyzer.generate_analysis_report()
        
        print("\nAnalysis Summary:")
        print(f"Overall Accuracy: {analysis_report['model_performance']['accuracy']:.4f}")
        print("\nRecommendations:")
        for rec in analysis_report['recommendations']:
            print(f"- {rec['description']}")
        
        # Initialize deployment
        print("\nStep 3: Initializing model deployment...")
        deployment = ModelDeployment(evaluator.model_dir)
        
        print("\nStarting API server...")
        print("API will be available at http://localhost:5000/predict")
        deployment.run_server()
        
        return evaluator, analyzer, deployment
        
    except Exception as e:
        print(f"Pipeline failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    evaluator, analyzer, deployment = main()

Step 1: Running evaluation and saving results...
Starting evaluation pipeline...
Loaded 211235 test samples
Using training results from: federated_training_20241110_142938
Results will be saved in: evaluation_results_20241110_163743
Loading model from federated_training_20241110_142938/final_model.pth
Model loaded successfully
Loading vectorizer from federated_training_20241110_142938/vectorizer.pkl
Loading label encoder from federated_training_20241110_142938/label_encoder.pkl
Preprocessing tools loaded successfully

Running evaluation...

Starting model evaluation...

Preparing test data...


  state_dict = torch.load(model_path)


Prepared 211235 test samples

Label distribution:
primary_topic
treatment       0.728691
diagnosis       0.098317
transmission    0.095600
symptoms        0.056113
prevention      0.021280
Name: proportion, dtype: float64

Running predictions...

Calculating performance metrics...

Performance Summary:
Accuracy: 0.9902
Average Precision: 0.9819
Average Recall: 0.9716
Average F1-Score: 0.9767

Generating visualizations...

Calculating performance metrics...

Performance Summary:
Accuracy: 0.9902
Average Precision: 0.9819
Average Recall: 0.9716
Average F1-Score: 0.9767
Visualizations saved in plots directory
Metrics saved to evaluation_results/evaluation_metrics.json
Processed test data saved to evaluation_results/processed_test_data.csv

Evaluation and saving completed successfully!

Evaluation Summary:
Accuracy: 0.9902

Class-wise Performance:

diagnosis:
Precision: 0.9886
Recall: 0.9800
F1-score: 0.9843

prevention:
Precision: 0.9615
Recall: 0.9284
F1-score: 0.9447

symptoms:
Precisio

TypeError: ModelAnalyzer.__init__() missing 1 required positional argument: 'results_dir'

<Figure size 1200x600 with 0 Axes>

In [64]:
import torch
import pandas as pd
import numpy as np
from flask import Flask, request, jsonify
import joblib
from datetime import datetime
import logging
import os

class ModelDeployment:
    def __init__(self, model_dir):
        self.model_dir = model_dir
        self.setup_logging()
        self.load_model_artifacts()
        self.initialize_api()
        
    def setup_logging(self):
        """Setup logging configuration"""
        logging.basicConfig(
            filename=f'deployment_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
    
    def load_model_artifacts(self):
        """Load model and preprocessing tools"""
        try:
            # Load model
            self.model = torch.load(f'{self.model_dir}/final_model.pth')
            self.model.eval()
            
            # Load vectorizer and label encoder
            self.vectorizer = joblib.load(f'{self.model_dir}/vectorizer.pkl')
            self.label_encoder = joblib.load(f'{self.model_dir}/label_encoder.pkl')
            
            self.logger.info("Model artifacts loaded successfully")
            
        except Exception as e:
            self.logger.error(f"Error loading model artifacts: {str(e)}")
            raise
    
    def initialize_api(self):
        """Initialize Flask API"""
        self.app = Flask(__name__)
        
        @self.app.route('/predict', methods=['POST'])
        def predict():
            try:
                # Get input text
                data = request.json
                text = data.get('text', '')
                
                # Preprocess input
                features = self.vectorizer.transform([text])
                feature_tensor = torch.FloatTensor(features.toarray())
                
                # Make prediction
                with torch.no_grad():
                    outputs = self.model(feature_tensor)
                    probabilities = torch.softmax(outputs, dim=1)
                    prediction = torch.argmax(probabilities, dim=1)
                
                # Get predicted label
                predicted_label = self.label_encoder.inverse_transform(prediction.numpy())[0]
                
                # Get probability scores
                class_probabilities = {
                    label: float(prob)
                    for label, prob in zip(self.label_encoder.classes_, probabilities[0].numpy())
                }
                
                return jsonify({
                    'prediction': predicted_label,
                    'probabilities': class_probabilities,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                
            except Exception as e:
                self.logger.error(f"Prediction error: {str(e)}")
                return jsonify({'error': str(e)}), 500
    
    def run_server(self, host='0.0.0.0', port=5000):
        """Run the API server"""
        self.app.run(host=host, port=port)

class ModelAnalyzer:
    def __init__(self, model_dir, results_dir):
        self.model_dir = model_dir
        self.results_dir = results_dir
        self.load_results()
    
    def load_results(self):
        """Load evaluation results"""
        try:
            with open(f'{self.results_dir}/evaluation_metrics.json', 'r') as f:
                self.metrics = json.load(f)
                
            print("Evaluation results loaded successfully")
            
        except Exception as e:
            print(f"Error loading results: {str(e)}")
            raise
    
    def generate_analysis_report(self):
        """Generate comprehensive analysis report"""
        report = {
            'model_performance': {
                'accuracy': self.metrics['accuracy'],
                'average_metrics': self.metrics['average_metrics'],
                'class_performance': self.metrics['class_metrics']
            },
            'error_analysis': self.analyze_errors(),
            'recommendations': self.generate_recommendations(),
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        # Save report
        report_path = f'{self.results_dir}/analysis_report.json'
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
            
        return report
    
    def analyze_errors(self):
        """Analyze model errors"""
        conf_matrix = np.array(self.metrics['confusion_matrix'])
        
        # Calculate error patterns
        error_analysis = {}
        for i, true_class in enumerate(self.label_encoder.classes_):
            misclassifications = {}
            for j, pred_class in enumerate(self.label_encoder.classes_):
                if i != j:
                    misclassifications[pred_class] = int(conf_matrix[i, j])
            
            error_analysis[true_class] = {
                'total_samples': int(conf_matrix[i, :].sum()),
                'correct_predictions': int(conf_matrix[i, i]),
                'misclassifications': misclassifications
            }
        
        return error_analysis
    
    def generate_recommendations(self):
        """Generate improvement recommendations"""
        recommendations = []
        
        # Analyze class imbalance
        class_supports = [m['support'] for m in self.metrics['class_metrics'].values()]
        if max(class_supports) / min(class_supports) > 2:
            recommendations.append({
                'type': 'data_balance',
                'description': 'Consider addressing class imbalance using techniques like oversampling or weighted loss'
            })
        
        # Analyze performance patterns
        for class_name, metrics in self.metrics['class_metrics'].items():
            if metrics['f1_score'] < 0.7:
                recommendations.append({
                    'type': 'class_performance',
                    'class': class_name,
                    'description': f'Low performance on class {class_name}. Consider collecting more training data or analyzing feature importance'
                })
        
        return recommendations

def main():
    """Run analysis and deployment pipeline"""
    try:
        # Get latest evaluation results
        eval_dirs = [d for d in os.listdir() if d.startswith('evaluation_results_')]
        latest_eval_dir = max(eval_dirs)
        
        # Get latest model directory
        model_dirs = [d for d in os.listdir() if d.startswith('federated_training_')]
        latest_model_dir = max(model_dirs)
        
        print(f"Using evaluation results from: {latest_eval_dir}")
        print(f"Using model from: {latest_model_dir}")
        
        # Run analysis
        analyzer = ModelAnalyzer(latest_model_dir, latest_eval_dir)
        analysis_report = analyzer.generate_analysis_report()
        
        print("\nAnalysis completed. Key findings:")
        print(f"Overall Accuracy: {analysis_report['model_performance']['accuracy']:.4f}")
        print("\nRecommendations:")
        for rec in analysis_report['recommendations']:
            print(f"- {rec['description']}")
        
        # Initialize deployment
        print("\nInitializing model deployment...")
        deployment = ModelDeployment(latest_model_dir)
        
        print("\nStarting API server...")
        deployment.run_server()
        
        return analyzer, deployment
        
    except Exception as e:
        print(f"Pipeline failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    analyzer, deployment = main()

Using evaluation results from: evaluation_results_20241110_163743
Using model from: federated_training_20241110_142938
Error loading results: [Errno 2] No such file or directory: 'evaluation_results_20241110_163743/evaluation_metrics.json'
Pipeline failed with error: [Errno 2] No such file or directory: 'evaluation_results_20241110_163743/evaluation_metrics.json'


FileNotFoundError: [Errno 2] No such file or directory: 'evaluation_results_20241110_163743/evaluation_metrics.json'

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import copy
import logging

class FederatedClient:
    def __init__(self, model, dataset, client_id, learning_rate=0.001):
        self.client_id = client_id
        self.model = copy.deepcopy(model)
        self.dataset = dataset
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.privacy_engine = PrivacyEngine()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
    
    def get_model_updates(self):
        """Get model parameters for aggregation"""
        return {
            name: param.data.cpu().clone()
            for name, param in self.model.named_parameters()
        }
    
    def set_model_params(self, params_dict):
        """Update model with new parameters"""
        with torch.no_grad():
            for name, param in self.model.named_parameters():
                if name in params_dict:
                    param.data.copy_(params_dict[name].to(self.device))
    
    def train(self, epochs=1, batch_size=32):
        """Train the model on local data"""
        self.model.train()
        dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)
        total_loss = 0
        num_batches = 0

        for epoch in range(epochs):
            epoch_loss = 0
            for batch in dataloader:
                batch = batch.to(self.device)
                self.optimizer.zero_grad()
                
                # Forward pass
                outputs = self.model(batch)
                loss = F.mse_loss(outputs, torch.zeros_like(outputs))
                
                # Backward pass
                loss.backward()
                
                # Apply differential privacy
                for param in self.model.parameters():
                    if param.grad is not None:
                        param.grad = self.privacy_engine.add_noise(param.grad)
                
                self.optimizer.step()
                epoch_loss += loss.item()
                num_batches += 1
                
            total_loss += epoch_loss

        avg_loss = total_loss / (num_batches * epochs)
        return avg_loss

class FederatedServer:
    def __init__(self, model, num_clients):
        self.global_model = model
        self.num_clients = num_clients
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.global_model.to(self.device)
    
    def aggregate_models(self, client_updates):
        """Aggregate model updates from clients using FedAvg"""
        aggregated_params = {}
        
        # Initialize with zeros
        for name, param in self.global_model.named_parameters():
            aggregated_params[name] = torch.zeros_like(param.data)
        
        # Sum up client parameters
        for client_update in client_updates:
            for name, param in client_update.items():
                aggregated_params[name] += param.to(self.device)
        
        # Average the parameters
        for name in aggregated_params:
            aggregated_params[name] = aggregated_params[name] / len(client_updates)
        
        # Update global model
        with torch.no_grad():
            for name, param in self.global_model.named_parameters():
                param.data.copy_(aggregated_params[name])
    
    def distribute_model(self):
        """Get the current global model for distribution to clients"""
        return copy.deepcopy(self.global_model)

def train_federated_model(server, clients, num_rounds=10, local_epochs=1):
    """Train the model using federated learning"""
    metrics_history = []
    
    try:
        for round_num in range(num_rounds):
            print(f"Federated Learning Round {round_num + 1}/{num_rounds}")
            
            # Train each client locally
            client_updates = []
            round_losses = []
            
            for client in clients:
                try:
                    # Train client
                    loss = client.train(epochs=local_epochs)
                    print(f"Client {client.client_id} - Loss: {loss:.4f}")
                    
                    # Get model updates
                    updates = client.get_model_updates()
                    client_updates.append(updates)
                    round_losses.append(loss)
                    
                except Exception as e:
                    logging.error(f"Error training client {client.client_id}: {str(e)}")
                    continue
            
            if not client_updates:
                raise Exception("No valid client updates in this round")
            
            # Aggregate updates on the server
            server.aggregate_models(client_updates)
            
            # Distribute updated model to clients
            updated_model = server.distribute_model()
            for client in clients:
                client.model = copy.deepcopy(updated_model)
            
            # Record metrics
            avg_loss = sum(round_losses) / len(round_losses)
            metrics_history.append({
                'round': round_num,
                'average_loss': avg_loss
            })
            
            print(f"Round {round_num + 1} completed - Average Loss: {avg_loss:.4f}")
        
        return metrics_history
        
    except Exception as e:
        logging.error(f"Error in federated training: {str(e)}")
        raise

def create_federated_environment(processed_df, num_clients=5):
    """Set up the federated learning environment"""
    try:
        # Create the global model
        vectorizer = TfidfVectorizer(max_features=10000)
        vectorizer.fit(processed_df['text_for_analysis'])
        input_dim = len(vectorizer.get_feature_names_out())
        global_model = SimpleTextClassifier(input_dim=input_dim)
        
        # Split data among clients
        total_samples = len(processed_df)
        samples_per_client = total_samples // num_clients
        
        clients = []
        for i in range(num_clients):
            start_idx = i * samples_per_client
            end_idx = start_idx + samples_per_client if i < num_clients - 1 else total_samples
            
            client_texts = processed_df['text_for_analysis'].iloc[start_idx:end_idx].values
            client_dataset = TextDataset(client_texts, vectorizer)
            
            client = FederatedClient(
                model=copy.deepcopy(global_model),
                dataset=client_dataset,
                client_id=f"Client_{i}"
            )
            clients.append(client)
        
        # Create server
        server = FederatedServer(global_model, num_clients)
        
        return server, clients, vectorizer
        
    except Exception as e:
        logging.error(f"Error creating federated environment: {str(e)}")
        raise

In [24]:
def run_complete_pipeline():
    """
    Run the complete pipeline: optimization and analysis
    """
    try:
        # Load data
        print("Loading datasets...")
        processed_df = pd.read_csv('processed_train_data.csv', low_memory=False)
        test_df = pd.read_csv('processed_test_data.csv', low_memory=False)
        
        print(f"Loaded {len(processed_df)} training samples and {len(test_df)} test samples")
        
        # First, run optimization
        print("\nStarting hyperparameter optimization...")
        optimizer = HyperparameterOptimizer(processed_df, test_df)
        best_params = optimizer.optimize()
        
        # Save best parameters
        with open('best_params.json', 'w') as f:
            json.dump(best_params, f, indent=4)
            print(f"Best parameters saved to 'best_params.json'")
            print(f"Best parameters: {best_params}")
        
        print("\nStarting model analysis...")
        report, metrics_history = analyze_model_performance(best_params)
        
        # Print summary
        print("\nTraining Summary:")
        print(f"Initial Loss: {metrics_history[0]['average_loss']:.4f}")
        print(f"Final Loss: {metrics_history[-1]['average_loss']:.4f}")
        print(f"Improvement: {metrics_history[0]['average_loss'] - metrics_history[-1]['average_loss']:.4f}")
        print(f"Privacy Budget (ε): {best_params['epsilon']:.2f}")
        
        return report, metrics_history
        
    except FileNotFoundError as e:
        print(f"Error: Could not find data files. Please make sure 'processed_train_data.csv' and 'processed_test_data.csv' exist.")
        print(f"Detailed error: {str(e)}")
        raise
    except Exception as e:
        print(f"Error in pipeline: {str(e)}")
        logging.error(f"Pipeline error: {str(e)}", exc_info=True)
        raise

class HyperparameterOptimizer:
    def __init__(self, processed_df, test_df, n_trials=20):
        self.processed_df = processed_df
        self.test_df = test_df
        self.n_trials = n_trials
        self.best_params = None
        self.best_score = float('-inf')
        
        # Setup experiment tracking
        try:
            mlflow.set_experiment("Federated_NLP_Optimization")
        except Exception as e:
            print(f"MLflow setup warning: {str(e)}")
            print("Continuing without MLflow tracking...")
    
    def objective(self, trial: Trial) -> float:
        """Objective function for hyperparameter optimization"""
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
            'hidden_dim': trial.suggest_categorical('hidden_dim', [128, 256, 512]),
            'num_rounds': trial.suggest_int('num_rounds', 5, 15),
            'num_clients': trial.suggest_int('num_clients', 3, 8),
            'epsilon': trial.suggest_float('epsilon', 0.1, 2.0),
            'local_epochs': trial.suggest_int('local_epochs', 1, 3)
        }
        
        try:
            print(f"\nTrial {trial.number + 1}/{self.n_trials} - Testing parameters: {params}")
            
            # Create federated environment with current parameters
            server, clients, vectorizer = create_federated_environment(
                self.processed_df,
                num_clients=params['num_clients']
            )
            
            # Update client learning rates
            for client in clients:
                client.optimizer = optim.Adam(
                    client.model.parameters(), 
                    lr=params['learning_rate']
                )
                client.privacy_engine.epsilon = params['epsilon']
            
            # Train with current parameters
            metrics_history = train_federated_model(
                server, 
                clients,
                num_rounds=params['num_rounds'],
                local_epochs=params['local_epochs']
            )
            
            # Use final loss as the objective
            final_loss = metrics_history[-1]['average_loss']
            
            print(f"Trial completed - Final Loss: {final_loss:.4f}")
            
            return -final_loss  # Negative because Optuna minimizes
            
        except Exception as e:
            print(f"Trial failed with error: {str(e)}")
            return float('-inf')
    
    def optimize(self) -> dict:
        """Run hyperparameter optimization"""
        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=self.n_trials)
        
        self.best_params = study.best_params
        print(f"\nOptimization completed!")
        print(f"Best parameters: {self.best_params}")
        print(f"Best value: {-study.best_value:.4f}")
        
        return self.best_params

if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f'pipeline_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
            logging.StreamHandler()
        ]
    )
    
    try:
        # Run the complete pipeline
        print("Starting complete pipeline...")
        report, metrics_history = run_complete_pipeline()
        
        print("\nPipeline completed successfully!")
        print("Check the results directory for detailed analysis and visualizations.")
        
    except Exception as e:
        logging.error("Pipeline failed with error:", exc_info=True)
        raise

NameError: name 'Trial' is not defined