In [1]:
import os
import pandas as pd
import fitz  # PyMuPDF
import sys

# Add the project root to the Python path to allow imports from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.data_processing.pdf_parser import extract_text_blocks
from src.data_processing.heuristics import get_document_stats
from src.data_processing.feature_engineering import create_feature_vector


In [2]:
# --- Part 1: Labeled Data Generation ---
# This part of the script creates a CSV file with features for each text block.
# You will need to manually add a 'label' column to this CSV.

PDF_DIR = '../data/raw_pdfs/'
OUTPUT_CSV = '../data/training/features_for_labeling.csv'

all_features = []

pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')]

for pdf_file in pdf_files:
    pdf_path = os.path.join(PDF_DIR, pdf_file)
    print(f"Generating features for: {pdf_file}")
    
    blocks = extract_text_blocks(pdf_path)
    if not blocks:
        continue
        
    doc_stats = get_document_stats(blocks)
    doc = fitz.open(pdf_path)
    
    prev_block = None
    for block in blocks:
        page_num = block.get('page_num', 0)
        page = doc[page_num]
        page_width, page_height = page.rect.width, page.rect.height
        
        features = create_feature_vector(block, doc_stats, page_width, page_height, prev_block)
        
        if features:
            # Add identifiers to help with manual labeling
            text_preview = "".join(s['text'] for l in block['lines'] for s in l['spans']).strip()[:80]
            features['pdf_file'] = pdf_file
            features['page_num'] = page_num
            features['text_preview'] = text_preview
            all_features.append(features)
        
        prev_block = block
    doc.close()

# Create a DataFrame and save it for manual labeling
df_features = pd.DataFrame(all_features)
df_features.to_csv(OUTPUT_CSV, index=False)

print(f"\nFeature generation complete. Please manually label the 'label' column in '{OUTPUT_CSV}'.")
print("Add a 'label' column with values like 'H1', 'H2', 'H3', or 'Body_Text'.")


Generating features for: file03.pdf
Generating features for: file04.pdf
Generating features for: file01.pdf
Generating features for: file02.pdf
Generating features for: file05.pdf

Feature generation complete. Please manually label the 'label' column in '../data/training/features_for_labeling.csv'.
Add a 'label' column with values like 'H1', 'H2', 'H3', or 'Body_Text'.


In [3]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# --- Part 2: Model Training ---
# This part assumes you have created 'labeled_data.csv' from the output of the previous cell.

LABELED_DATA_PATH = '../data/training/labeled_data.csv' # Manually rename the file after labeling
MODEL_DIR = '../models/'
MODEL_PATH = os.path.join(MODEL_DIR, 'lgbm_model.joblib')
ENCODER_PATH = os.path.join(MODEL_DIR, 'label_encoder.joblib')

# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

try:
    df = pd.read_csv(LABELED_DATA_PATH)
except FileNotFoundError:
    print(f"Error: Labeled data not found at {LABELED_DATA_PATH}")
    print("Please run the cell above, label the data, and rename the file to 'labeled_data.csv'.")
else:
    # Drop non-feature columns and rows with missing labels
    df = df.dropna(subset=['label'])
    X = df.drop(['label', 'pdf_file', 'page_num', 'text_preview'], axis=1)
    y = df['label']

    # Encode labels (e.g., 'H1' -> 0, 'H2' -> 1, 'Body_Text' -> 2)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    num_classes = len(label_encoder.classes_)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Train LightGBM Model [6]
    lgbm = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=num_classes,
        random_state=42
    )

    lgbm.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(10, verbose=False)]
    )

    # Save the trained model and the label encoder [7]
    joblib.dump(lgbm, MODEL_PATH)
    joblib.dump(label_encoder, ENCODER_PATH)

    print(f"Model trained and saved to {MODEL_PATH}")
    print(f"Label encoder saved to {ENCODER_PATH}")
    print("Classes:", list(label_encoder.classes_))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 425
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 9
[LightGBM] [Info] Start training from score -0.173847
[LightGBM] [Info] Start training from score -3.562466
[LightGBM] [Info] Start training from score -2.933857
[LightGBM] [Info] Start training from score -2.697468
[LightGBM] [Info] Start training from score -4.543295
Model trained and saved to ../models/lgbm_model.joblib
Label encoder saved to ../models/label_encoder.joblib
Classes: ['Body_Text', 'H1', 'H2', 'H3', 'H4']


In [4]:
# Load the trained model and test it on a few examples
try:
    model = joblib.load(MODEL_PATH)
    encoder = joblib.load(ENCODER_PATH)
    
    # Test predictions on a few examples from the test set
    if 'X_test' in locals() and 'y_test' in locals():
        print("Testing model predictions:")
        for i in range(min(5, len(X_test))):
            pred = model.predict([X_test.iloc[i]])[0]
            true_label = encoder.inverse_transform([y_test[i]])[0]
            pred_label = encoder.inverse_transform([pred])[0]
            print(f"Example {i+1}: True={true_label}, Predicted={pred_label}")
except Exception as e:
    print(f"Error loading model: {e}")


Testing model predictions:
Example 1: True=H3, Predicted=H3
Example 2: True=Body_Text, Predicted=H3
Example 3: True=Body_Text, Predicted=Body_Text
Example 4: True=Body_Text, Predicted=Body_Text
Example 5: True=Body_Text, Predicted=Body_Text




In [5]:
  from sklearn.metrics import classification_report, confusion_matrix
  print(classification_report(y_test, model.predict(X_test), target_names=encoder.classes_))
  print(confusion_matrix(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

   Body_Text       0.98      0.97      0.97        60
          H1       1.00      1.00      1.00         2
          H2       0.75      1.00      0.86         3
          H3       0.67      0.80      0.73         5
          H4       0.00      0.00      0.00         1

    accuracy                           0.94        71
   macro avg       0.68      0.75      0.71        71
weighted avg       0.94      0.94      0.94        71

[[58  0  1  1  0]
 [ 0  2  0  0  0]
 [ 0  0  3  0  0]
 [ 1  0  0  4  0]
 [ 0  0  0  1  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
