In [5]:
import sys, os
# The kernel is expected to start in notebooks/, so the parent of the working
# directory is the project root; putting it first on sys.path makes the
# project's own packages importable from this notebook.
proj_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(proj_root) == 0:
    sys.path.insert(0, proj_root)
print("project root added to sys.path:", proj_root)

project root added to sys.path: c:\Tony\GMU\AIT 726\Project\doj-press-release-nlp


In [6]:
import sys, subprocess, importlib

print("kernel python:", sys.executable)

def ensure(pkg, pip_name=None):
    """Make sure module ``pkg`` can be imported by this kernel, installing it if needed.

    Parameters
    ----------
    pkg : str
        Importable module name, as it would appear in an ``import`` statement.
    pip_name : str, optional
        Distribution name handed to ``pip install`` when it differs from the
        module name. Defaults to ``pkg``.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` subprocess exits with a non-zero status.
    """
    try:
        importlib.import_module(pkg)
    except ModuleNotFoundError:
        # Install with the kernel's own interpreter so the package lands in
        # the environment this notebook is actually running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or pkg])
        # The import system caches directory listings; without this a package
        # installed a moment ago may still look missing to a later `import`
        # in the same process.
        importlib.invalidate_caches()

# Guarantee spaCy is importable from this kernel before importing it.
ensure("spacy")
import spacy

# Probe for the small English pipeline; if loading fails with OSError the
# model is treated as missing and downloaded with this kernel's interpreter.
try:
    spacy.load("en_core_web_sm")
except OSError:
    subprocess.check_call(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    )

print("spaCy OK:", spacy.__version__)


kernel python: c:\Users\tonyl\anaconda3\python.exe
spaCy OK: 3.8.9


In [7]:

import numbers
import typing as t
from dataclasses import dataclass

import pandas as pd
import spacy
from spacy.tokens import DocBin

@dataclass
class ExampleAnn:
    """One labelled span described by a start offset, an end offset and a label."""
    # NOTE(review): nothing in the visible notebook cells instantiates this
    # class; CustomDataset below works with plain (start, end, label) tuples.
    start: int  # span start; presumably a character offset into the text (TODO confirm)
    end: int    # span end; presumably exclusive, like the offsets given to doc.char_span (TODO confirm)
    label: str  # entity label name

class CustomDataset:
    """
    Lightweight dataset wrapper expected by the notebook.

    Wraps a DataFrame with a text column (default 'text') and a spans column
    (default 'spans'). Each spans cell is a list/tuple whose items are either
    dicts ``{"start": int, "end": int, "label": str}`` (the label may also be
    stored under "entity" or "label_name") or sequences ``(start, end, label)``.

    Indexing returns ``(text, {"entities": [(start, end, label), ...]})``.
    Malformed span items are skipped rather than raising, and a missing or
    misnamed column yields empty text / no entities.
    """

    def __init__(self, df: pd.DataFrame, text_col: str = "text", spans_col: str = "spans"):
        # reset_index returns a new frame, so positional access via .iloc lines
        # up with 0..len-1 and the caller's frame is left untouched.
        self.df = df.reset_index(drop=True)
        self.text_col = text_col
        self.spans_col = spans_col

    def __len__(self) -> int:
        """Number of rows (examples) in the wrapped DataFrame."""
        return len(self.df)

    @staticmethod
    def _coerce_text(value: t.Any) -> str:
        """Return `value` as a str; missing values (None/NaN/NA) become ''."""
        if isinstance(value, str):
            return value
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value)

    @staticmethod
    def _parse_span(raw: t.Any) -> t.Optional[t.Tuple[int, int, str]]:
        """Normalise one raw span item to (start, end, label), or None if malformed."""
        if isinstance(raw, dict):
            start, end = raw.get("start"), raw.get("end")
            label = raw.get("label") or raw.get("entity") or raw.get("label_name")
        elif isinstance(raw, (list, tuple)) and len(raw) >= 3:
            start, end, label = raw[0], raw[1], raw[2]
        else:
            return None
        # numbers.Integral also matches numpy integer scalars, which a plain
        # isinstance(x, int) check rejects (silently dropping valid spans).
        if (
            isinstance(start, numbers.Integral)
            and isinstance(end, numbers.Integral)
            and isinstance(label, str)
        ):
            return int(start), int(end), label
        return None

    def __getitem__(self, idx: int):
        """Return ``(text, {"entities": [(start, end, label), ...]})`` for row `idx`."""
        row = self.df.iloc[idx]
        if not isinstance(row, (pd.Series, dict)):
            # e.g. a slice makes .iloc return a DataFrame; keep the historical
            # "empty example" result instead of guessing.
            return "", {"entities": []}

        # A NaN/None text cell would otherwise reach nlp.make_doc and fail there.
        text = self._coerce_text(row.get(self.text_col, ""))

        raw_spans = row.get(self.spans_col, [])
        # Handle NaN or non-list spans
        if not isinstance(raw_spans, (list, tuple)):
            raw_spans = []

        entities: t.List[t.Tuple[int, int, str]] = []
        for raw in raw_spans:
            parsed = self._parse_span(raw)
            if parsed is not None:
                entities.append(parsed)
        return text, {"entities": entities}

    def to_spacy_docbin(self, nlp: "t.Optional[spacy.language.Language]" = None) -> "DocBin":
        """Convert dataset to a spaCy DocBin (useful for training).

        Spans that cannot be aligned to token boundaries are skipped. When
        spans overlap or are duplicated, only a non-overlapping subset is kept
        (as chosen by ``spacy.util.filter_spans``), because ``doc.ents``
        rejects overlapping entities.

        Parameters
        ----------
        nlp : spacy.language.Language, optional
            Pipeline whose tokenizer is used; defaults to ``spacy.blank("en")``.
        """
        if nlp is None:
            nlp = spacy.blank("en")
        db = DocBin()
        for i in range(len(self)):
            text, ann = self[i]
            doc = nlp.make_doc(text)
            spans = []
            for (start, end, label) in ann.get("entities", []):
                span = doc.char_span(start, end, label=label, alignment_mode="contract")
                if span is None:
                    # skip spans that don't align with tokenization
                    continue
                spans.append(span)
            # Assigning overlapping spans to doc.ents raises, which would abort
            # the whole conversion because of a single noisy annotation.
            doc.ents = spacy.util.filter_spans(spans)
            db.add(doc)
        return db

    @classmethod
    def from_jsonl(cls, path: str, text_col: str = "text", spans_col: str = "spans"):
        """Load a DataFrame-backed dataset from a JSONL file (each line a JSON object)."""
        df = pd.read_json(path, lines=True)
        return cls(df, text_col=text_col, spans_col=spans_col)

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split
from src.ner.dataset import CustomDataset
from src.ner.model import NERModel
from src.ner.train import train_model
from src.evaluation import evaluate_model

# Load the raw data
# The path is relative to the kernel's working directory. The file is read as
# JSON Lines (lines=True): one JSON object per line, one DataFrame row each.
data_path = '../data/raw/doj_press_releases.jsonl'
df = pd.read_json(data_path, lines=True)

# Preprocess the data
def preprocess_data(df):
    """Keep only rows usable for NER training and renumber them from 0.

    A row is kept when neither 'text' nor 'spans' is missing and the 'spans'
    value is a Python list. The input frame is not modified.
    """
    def _is_list(value):
        return isinstance(value, list)

    # First discard rows with a missing text or spans cell...
    complete_rows = df.dropna(subset=['text', 'spans'])
    # ...then rows whose spans cell holds something other than a list.
    list_span_mask = complete_rows['spans'].apply(_is_list)
    usable_rows = complete_rows[list_span_mask]
    return usable_rows.reset_index(drop=True)

# NOTE: this rebinds `df`, so the unfiltered frame loaded above is no longer
# reachable under that name after this line.
df = preprocess_data(df)
print(f"Records after preprocessing: {len(df)}")

# Split the data into training and validation sets
# 80/20 split; random_state is fixed so the same rows land in each split on
# every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create datasets for training
# NOTE(review): `CustomDataset` here is the class imported from
# src.ner.dataset at the top of this cell, which shadows the class of the same
# name defined in the previous notebook cell. It is assumed to accept a
# DataFrame and to support len() - confirm against src/ner/dataset.py.
train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}")

# Initialize the NER model
# NERModel comes from src.ner.model and is built with its default arguments.
model = NERModel()

# Train the model
# train_model comes from src.ner.train; its return value (if any) is discarded
# here. NOTE(review): presumably it updates `model` in place - confirm.
train_model(model, train_dataset, val_dataset)

# Evaluate the model
# Evaluation uses the same validation set that was passed to train_model; the
# returned metrics object is printed as-is.
metrics = evaluate_model(model, val_dataset)
print(metrics)

Records after preprocessing: 468
Train size: 374, Val size: 94
Epoch 1/10 - Loss: 3439.9531
Epoch 2/10 - Loss: 2349.2573
Epoch 3/10 - Loss: 1894.6348
Epoch 4/10 - Loss: 1621.7538
Epoch 5/10 - Loss: 1491.2950
Epoch 6/10 - Loss: 1342.3236
Epoch 7/10 - Loss: 1238.6257
Epoch 8/10 - Loss: 1249.5037
Epoch 9/10 - Loss: 1082.3400
Epoch 10/10 - Loss: 1056.1290
{'precision': 0.1618045846162823, 'recall': 0.13392857142857142, 'f1': 0.14368536269428422}


# Modeling Notebook

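This notebook builds and trains a named-entity recognition (NER) model that identifies entities in Department of Justice press releases. The annotated press releases are loaded from a JSONL file, filtered to records that have both text and a list of spans, and split 80/20 into training and validation sets; the trained model is then evaluated on the validation set, reporting precision, recall, and F1.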