## Setup

In [1]:
# Import necessary libraries
import os
import sys
import json
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import time
import logging

# Send INFO-and-above records through the root logger, each prefixed with
# timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Seed NumPy's and PyTorch's global RNGs with a fixed value so reruns are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Make sure the 'models' directory exists (no error if it already does)
os.makedirs('models', exist_ok=True)

## 1. Exploring the Synthetic Mention Corpora

In [None]:
# Let's explore the Synthetic Mention Corpora for Disease Entity Recognition

import os
import pandas as pd
import json

# Ensure the folder that should hold the downloaded corpus exists (no-op when present)
os.makedirs(os.path.join('data', 'synthetic_mentions'), exist_ok=True)

# Function to download the dataset
def download_synthetic_mentions():
    """
    Load the Synthetic Mention Corpora for Disease Entity Recognition from disk.

    Despite its name, this function performs no network access. It only reads
    data/synthetic_mentions/disease_mentions.json if that file is present.

    Note: You need to manually download this dataset from PhysioNet:
    https://physionet.org/content/synthetic-mention-corpora/

    After downloading, place the files in the data/synthetic_mentions directory

    Returns:
        The parsed JSON content of the file, or None (after printing download
        instructions) when the file does not exist.
    """
    mentions_data_path = 'data/synthetic_mentions/disease_mentions.json'

    # Guard clause: without the file there is nothing to load, so explain how to get it.
    if not os.path.exists(mentions_data_path):
        print(f"Synthetic Mention Corpora not found at {mentions_data_path}")
        print("Please download the dataset from PhysioNet:")
        print("https://physionet.org/content/synthetic-mention-corpora/")
        print("After downloading, place the files in the data/synthetic_mentions directory")
        return None

    print(f"Loading Synthetic Mention Corpora from {mentions_data_path}")
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII characters in the corpus load the same way on every OS.
    with open(mentions_data_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Try to load the dataset
mentions_data = download_synthetic_mentions()

# If the dataset is loaded successfully, convert to text for training
if mentions_data is not None:
    # Extract mentions and contexts, collecting the pieces in a list and
    # joining once (avoids repeated string concatenation in the loop).
    text_parts = []
    for item in mentions_data[:1000]:  # Start with a subset for exploration
        if "mention" in item:
            text_parts.append(item["mention"] + "\n")
        if "context" in item:
            text_parts.append(item["context"] + "\n\n")
    mentions_text = "".join(text_parts)

    # Print some statistics
    print(f"Total characters: {len(mentions_text)}")
    print(f"Total words: {len(mentions_text.split())}")
    print(f"Total lines: {len(mentions_text.splitlines())}")

    # Print the first few lines
    print("\nFirst few lines:")
    for i, line in enumerate(mentions_text.splitlines()[:5]):
        print(f"{i+1}: {line}")

    # Check if the data is suitable for training
    if len(mentions_text) < 100000:  # Fewer than 100,000 characters
        print("\nWarning: The extracted text might be too small for effective training.")
        print("Consider using more entries from the dataset.")
    else:
        print("\nThe extracted text seems suitable for training.")

    # Save the combined text for training. The output directory is created
    # here because nothing earlier creates it; without this the open() below
    # raises FileNotFoundError on a fresh checkout.
    os.makedirs('data/processed', exist_ok=True)
    # Write as UTF-8 so non-ASCII characters in the mentions cannot fail to
    # encode under a narrower platform default encoding.
    with open('data/processed/mentions_text.txt', 'w', encoding='utf-8') as f:
        f.write(mentions_text)
    print("Saved combined text to data/processed/mentions_text.txt")
else:
    # Fallback to open_db.txt if the dataset is not available
    print("Falling back to open_db.txt for training")

    def read_open_db():
        """Read the open database text file (open_db.txt in the working directory)."""
        with open('open_db.txt', 'r') as f:
            text = f.read()
        return text

    # Read the open database text
    mentions_text = read_open_db()

    # Print some statistics
    print(f"Total characters: {len(mentions_text)}")
    print(f"Total words: {len(mentions_text.split())}")
    print(f"Total lines: {len(mentions_text.splitlines())}")