In [1]:
import os
from sklearn.metrics import classification_report
import numpy as np
import sklearn_crfsuite
from sklearn.model_selection import KFold, train_test_split
from datasets import Dataset
import pickle
import itertools

In [2]:
import pandas as pd
# Load the sentence-level dataset. The preview below shows the columns used by
# the rest of the notebook: sentence, label, sent_id and topic_id.
df = pd.read_csv('due_dilligence_data.csv')
df.head()

Unnamed: 0,sentence,label,index,sent_id,topic_id
0,EXECUTION VERSION,B,0,150705_0,1252
1,THE BANK OF NOVA SCOTIA,B,1,150705_1,1252
2,"as Sole Bookrunner , Lead Arranger and Adminis...",B,2,150705_2,1252
3,- and-,B,3,150705_3,1252
4,BMO CAPITAL MARKETS,B,4,150705_4,1252


### Load the custom punkt tokenizer

In [4]:
# Load the pickled tokenizer object from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('custom_punkt_tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

### Feature creation

## Training CRF model

### Training function

In [11]:
## train function
from itertools import chain

def train_crf_model(train_data, test_data, model_save_dir, topic_id, fold):
    """Fit a CRF on one fold's training data and pickle the fitted model.

    Args:
        train_data: Object indexable by 'sentence' and 'label'; the two
            entries are iterated pairwise.
        test_data: Not used by this function.
        model_save_dir: Root directory under which a per-topic folder is created.
        topic_id: Topic identifier; used as the folder name and file-name
            prefix, so it must be usable as a path component (a string).
        fold: Fold number embedded in the saved file name.

    Returns:
        None. The model is written to
        ``<model_save_dir>/<topic_id>/<topic_id>_crf_model_<fold>.pkl``.
    """
    train_sentences = train_data['sentence']
    train_labels = train_data['label']

    # Turn every (sentence, label) pair into a feature sequence and a label sequence.
    X_train, y_train = [], []
    for sentence, label in zip(train_sentences, train_labels):
        features, labels = process_text_and_extract_features(sentence, label)
        X_train.append(features)
        y_train.append(labels)

    # Preview of the first two inputs and what was extracted from them.
    print(train_sentences[:2])
    print("First sentence features in training data:", X_train[:2])
    print("First sentence labels in training data:", y_train[:2])

    print(f"Training model for topic {topic_id} for fold {fold}")

    crf_settings = dict(
        algorithm='pa',
        c=0.1,
        pa_type=2,
        max_iterations=100,
        all_possible_transitions=True,
        verbose=True,
    )
    crf = sklearn_crfsuite.CRF(**crf_settings)
    crf.fit(X_train, y_train)

    # One folder per topic; created on demand.
    topic_dir = os.path.join(model_save_dir, topic_id)
    os.makedirs(topic_dir, exist_ok=True)

    model_path = os.path.join(topic_dir, f"{topic_id}_crf_model_{fold}.pkl")
    with open(model_path, "wb") as sink:
        pickle.dump(crf, sink)

    print(f"CRF model saved for topic {topic_id}")


### Functions to load the data using the predefined folds for 5-fold cross-validation

In [None]:
import os

# Root directory that is listed for per-topic folders; each topic folder holds
# the fold split files named '<topic>-<fold>.cache'.
path = 'core/qrels/'

def map_doc(row):
    """Return the document id encoded in a row's ``sent_id``.

    The document id is the part of ``sent_id`` before the first underscore,
    e.g. ``'150705_0'`` -> ``'150705'``.
    """
    return row['sent_id'].split('_')[0]


def read_split(file_):
    """Read a split file and return its non-empty lines as a list of strings.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read instead of being left open.
    """
    with open(file_) as split_file:
        return [el for el in split_file.read().split('\n') if el != '']

def stratify(df, random_state=None):
    """Return the input frame together with a class-balanced down-sample of it.

    Every 'label' group is randomly down-sampled to the size of the smallest
    group.

    Args:
        df: DataFrame with a 'label' column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            balanced sample can be reproduced. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        Tuple ``(df, balanced)``: ``df`` is the unchanged input and
        ``balanced`` is the result of the per-label ``groupby.apply``.
    """
    grouped = df.groupby('label')
    # Size of the smallest class; identical for every group, so compute it once.
    smallest_group = grouped.size().min()
    balanced = grouped.apply(
        lambda group: group.sample(smallest_group, random_state=random_state).reset_index(drop=True)
    )
    return df, balanced

# Topic folders to process. The full list is kept below, commented out, for reference.
# folders_to_process = ['1272', '1474','1238', '1275', '1239', '1520', '1509', '1240', '1308', '1319', '1439', 
#                  '1267', '1242', '1462', '1265', '1444', '1312', '1244', '1243', '1468', '1309', '1524', 
#                  '1247', '1440', '1251', '1249', '1248', '1262', '1250', '1252', '1245', '1512', '1498', 
#                  '1601', '1443', '1086', '1551', '1253', '1320', '1304', '1469', '1611', '1300', '1489', 
#                  '1500', '1261', '1318', '1460', '1475', '1321']

# Currently restricted to a single topic.
folders_to_process = [ '1250']

### Data preparation and model training

In [12]:
# Train one CRF per (topic, fold) using the predefined fold split files.
model_save_dir = 'raw_data_exp_25_04_24'

# List the split directory once instead of once per topic.
available_folders = set(os.listdir(path))

for folder in folders_to_process:

    if folder not in available_folders:  # skip topics that have no split folder
        continue

    os.makedirs(model_save_dir, exist_ok=True)

    # Topic filtering and doc-id derivation do not depend on the fold,
    # so they are done once per topic rather than once per fold.
    df_ = df[df['topic_id'] == int(folder)]
    df_ = df_.dropna()
    df_['doc_id'] = df_.apply(map_doc, axis=1)

    for fold in range(5):

        ####### Data split into train and test sets #######
        print("for fold:", fold)
        test_split = fold
        train_split = [i for i in range(5) if i != test_split]

        # Each .cache file lists the document ids belonging to one fold.
        test = read_split(f'{path}/{folder}/{folder}-{test_split}.cache')
        train = sum([read_split(f'{path}/{folder}/{folder}-{el}.cache') for el in train_split], [])

        df_test = df_[df_['doc_id'].isin(test)]
        df_train = df_[df_['doc_id'].isin(train)]

        test_unbalanced, test_balanced = stratify(df_test)
        train_unbalanced, train_balanced = stratify(df_train)

        # Class distribution of the balanced and unbalanced variants of each split.
        print(test_balanced['label'].value_counts(), test_unbalanced['label'].value_counts())
        print(train_balanced['label'].value_counts(), train_unbalanced['label'].value_counts())

        ######### Model training ##############
        # The model is trained on the unbalanced (full) training split.
        train_crf_model(train_unbalanced, test_unbalanced, model_save_dir, str(int(folder)), fold)

for fold: 0
label
1    25
B    25
Name: count, dtype: int64 label
B    179377
1        25
Name: count, dtype: int64
label
1    106
B    106
Name: count, dtype: int64 label
B    752019
1       106
Name: count, dtype: int64
32413173          EXECUTION VERSION
32413174    THE BANK OF NOVA SCOTIA
Name: sentence, dtype: object
First sentence features in training data: [[{'token': 'EXECUTION', 'lower': 'execution', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': True, 'is_all_lower': False, 'prefix-1': 'E', 'prefix-2': 'EX', 'prefix-3': 'EXE', 'suffix-1': 'N', 'suffix-2': 'ON', 'suffix-3': 'ION', 'prev_token': '', 'next_token': 'VERSION', 'is_numeric': False, 'unigram': 'EXECUTION', 'bigram': 'EXECUTION VERSION', 'trigram': ''}, {'token': 'VERSION', 'lower': 'version', 'is_first': False, 'is_last': True, 'is_capitalized': True, 'is_all_caps': True, 'is_all_lower': False, 'prefix-1': 'V', 'prefix-2': 'VE', 'prefix-3': 'VER', 'suffix-1': 'N', 'suffix-2': 'ON', 'suffi

loading training data to CRFsuite: 100%|█| 752125/752125 [03:59<00:00, 3140.20it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2807308
Seconds required: 62.713

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=6.46  loss=2948.28  feature_norm=1.20
Iter 2   time=6.16  loss=2316.47  feature_norm=1.67
Iter 3   time=6.22  loss=2015.79  feature_norm=1.99
Iter 4   time=6.73  loss=1449.58  feature_norm=2.20
Iter 5   time=6.47  loss=1181.80  feature_norm=2.37
Iter 6   time=6.36  loss=1072.01  feature_norm=2.49
Iter 7   time=6.25  loss=913.89   feature_norm=2.59
Iter 8   time=6.10  loss=732.78   feature_norm=2.68
Iter 9   time=6.33  loss=747.02   feature_norm=2.75
Iter 10  time=6.11  loss=594.49   feature_norm=2.79
Iter 11  time=6.14  loss=582.27   feature_norm=2.84
Iter 12  time=6.11  loss=464.35   feature_norm=2.88
Iter 13  time=6.34  loss=609.99   feature_

loading training data to CRFsuite: 100%|█| 746068/746068 [04:18<00:00, 2881.39it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2859236
Seconds required: 60.677

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=7.26  loss=3164.31  feature_norm=1.20
Iter 2   time=6.66  loss=2173.70  feature_norm=1.66
Iter 3   time=6.57  loss=1767.42  feature_norm=1.95
Iter 4   time=6.50  loss=1489.02  feature_norm=2.17
Iter 5   time=6.62  loss=1299.38  feature_norm=2.35
Iter 6   time=6.52  loss=947.41   feature_norm=2.48
Iter 7   time=6.54  loss=1071.63  feature_norm=2.59
Iter 8   time=6.46  loss=719.61   feature_norm=2.68
Iter 9   time=6.84  loss=934.32   feature_norm=2.78
Iter 10  time=6.78  loss=789.49   feature_norm=2.88
Iter 11  time=6.68  loss=620.79   feature_norm=2.95
Iter 12  time=6.56  loss=590.77   feature_norm=2.99
Iter 13  time=6.54  loss=454.22   feature_

loading training data to CRFsuite: 100%|█| 719850/719850 [04:10<00:00, 2872.96it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2768289
Seconds required: 66.560

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=6.88  loss=2877.50  feature_norm=1.15
Iter 2   time=6.48  loss=1977.74  feature_norm=1.57
Iter 3   time=6.39  loss=1688.66  feature_norm=1.87
Iter 4   time=6.39  loss=1432.63  feature_norm=2.12
Iter 5   time=6.32  loss=1247.31  feature_norm=2.30
Iter 6   time=6.43  loss=849.14   feature_norm=2.42
Iter 7   time=6.59  loss=779.25   feature_norm=2.50
Iter 8   time=6.81  loss=616.31   feature_norm=2.57
Iter 9   time=6.35  loss=768.29   feature_norm=2.67
Iter 10  time=6.46  loss=653.81   feature_norm=2.75
Iter 11  time=6.48  loss=468.06   feature_norm=2.81
Iter 12  time=6.22  loss=397.39   feature_norm=2.84
Iter 13  time=6.34  loss=309.91   feature_

loading training data to CRFsuite: 100%|█| 747439/747439 [04:05<00:00, 3048.99it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2796712
Seconds required: 61.890

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=7.13  loss=3012.18  feature_norm=1.18
Iter 2   time=6.83  loss=2126.32  feature_norm=1.62
Iter 3   time=6.75  loss=1785.39  feature_norm=1.94
Iter 4   time=6.29  loss=1641.70  feature_norm=2.19
Iter 5   time=6.40  loss=1419.06  feature_norm=2.41
Iter 6   time=6.48  loss=1108.09  feature_norm=2.52
Iter 7   time=6.92  loss=816.65   feature_norm=2.62
Iter 8   time=6.77  loss=767.93   feature_norm=2.72
Iter 9   time=6.70  loss=695.41   feature_norm=2.80
Iter 10  time=7.29  loss=580.39   feature_norm=2.84
Iter 11  time=6.52  loss=721.76   feature_norm=2.90
Iter 12  time=6.59  loss=514.17   feature_norm=2.95
Iter 13  time=6.66  loss=552.63   feature_

loading training data to CRFsuite: 100%|█| 760626/760626 [04:31<00:00, 2796.48it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2854799
Seconds required: 65.810

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=7.86  loss=3106.78  feature_norm=1.18
Iter 2   time=7.47  loss=2394.60  feature_norm=1.64
Iter 3   time=7.15  loss=1844.72  feature_norm=1.94
Iter 4   time=7.62  loss=1666.57  feature_norm=2.13
Iter 5   time=7.22  loss=1261.67  feature_norm=2.32
Iter 6   time=7.15  loss=1150.85  feature_norm=2.48
Iter 7   time=7.05  loss=932.07   feature_norm=2.60
Iter 8   time=6.99  loss=981.41   feature_norm=2.69
Iter 9   time=6.96  loss=567.79   feature_norm=2.75
Iter 10  time=7.11  loss=694.05   feature_norm=2.82
Iter 11  time=10.29 loss=411.88   feature_norm=2.85
Iter 12  time=8.21  loss=439.27   feature_norm=2.89
Iter 13  time=7.05  loss=529.47   feature_

## Model evaluation

### Sentence level eval

In [7]:
def sentence_level_results(true_labels_flat, predicted_labels_flat):
    """Compute precision, recall and F1 for the positive class (label 1).

    Label 'B' is treated as 0; every other label is converted with ``int``.
    Pairs in which either label is the empty string are skipped. A small
    epsilon in the denominators avoids division by zero.

    Args:
        true_labels_flat: Flat iterable of gold labels.
        predicted_labels_flat: Flat iterable of predicted labels, aligned
            with ``true_labels_flat``.

    Returns:
        Tuple ``(precision, recall, f1)``; the values are also printed.
    """
    def as_binary(tag):
        # 'B' is the negative class; anything else must parse as an integer.
        return 0 if tag == 'B' else int(tag)

    counts = {'tp': 0, 'fp': 0, 'fn': 0}
    for gold_tag, pred_tag in zip(true_labels_flat, predicted_labels_flat):
        if gold_tag == '' or pred_tag == '':
            continue  # ignore pairs with a missing label

        gold_positive = as_binary(gold_tag) == 1
        pred_positive = as_binary(pred_tag) == 1

        if gold_positive and pred_positive:
            counts['tp'] += 1
        elif pred_positive:
            counts['fp'] += 1
        elif gold_positive:
            counts['fn'] += 1

    tp, fp, fn = counts['tp'], counts['fp'], counts['fn']

    eps = 1e-6
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)

    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

    return precision, recall, f1

### Annotation level eval

In [8]:
def labels_to_spans(labels):
    """
    Collapse consecutive "1" labels into inclusive (start, end) index spans.

    Args:
    - labels: List of labels; any label other than "1" ends the current span.

    Returns:
    - A list of (start, end) tuples with inclusive end positions. The number
      of spans found is also printed.
    """
    spans = []
    total = len(labels)
    cursor = 0
    while cursor < total:
        if labels[cursor] != "1":
            cursor += 1
            continue
        # cursor sits on the first "1" of a run; advance to the end of the run.
        begin = cursor
        while cursor < total and labels[cursor] == "1":
            cursor += 1
        spans.append((begin, cursor - 1))

    print(len(spans))

    return spans

In [9]:
def load_labels_and_create_spans(file_path, output_file):
    """Convert a label file (one label per line) into a span file.

    Blank lines separate sequences. Each sequence is converted with
    ``labels_to_spans`` and the spans of all sequences are written to
    ``output_file`` as ``"<start> <end>"`` lines, in order.

    Args:
        file_path: Path of the label file to read.
        output_file: Path of the span file to write.
    """
    with open(file_path, 'r') as label_file:
        stripped_lines = [raw.strip() for raw in label_file]

    collected_spans = []
    pending_labels = []
    # The trailing '' acts as a sentinel so the last sequence is flushed even
    # when the file does not end with a blank line.
    for label in stripped_lines + ['']:
        if label:
            pending_labels.append(label)
        elif pending_labels:
            collected_spans.extend(labels_to_spans(pending_labels))
            pending_labels = []

    with open(output_file, 'w') as span_file:
        span_file.writelines(f"{begin} {end}\n" for begin, end in collected_spans)

In [10]:
def parse_span_file(filename):
    """Read a span file with one ``"<start> <end>"`` pair per line.

    The end position stored in the file is inclusive; the returned 'end' is
    one past it.

    Returns:
        List of ``{'start': int, 'end': int}`` dicts in file order.
    """
    with open(filename) as handle:
        bounds = (map(int, row.strip().split()) for row in handle)
        return [{'start': first, 'end': last + 1} for first, last in bounds]

def span_overlaps(A, B):
    """Return True when spans A and B share at least one position.

    Spans are dicts with 'start' and 'end', where 'end' is exclusive: two
    spans that merely touch (A ends where B starts) do not overlap.
    """
    return B['start'] < A['end'] and A['start'] < B['end']

def overlapping_spans(A, B):
    """Return the spans of A, in order, that overlap at least one span of B."""
    matched = []
    for candidate in A:
        if any(span_overlaps(candidate, other) for other in B):
            matched.append(candidate)
    return matched

def non_overlapping_spans(A, B):
    """Return the spans of A, in order, that overlap no span of B."""
    unmatched = []
    for candidate in A:
        if any(span_overlaps(candidate, other) for other in B):
            continue
        unmatched.append(candidate)
    return unmatched

def evaluate_annotations(gold_file, pred_file):
    """Score predicted spans against gold spans using span overlap.

    Counting rules:
      - TP: gold spans overlapped by at least one predicted span
      - FP: predicted spans overlapping no gold span
      - FN: gold spans overlapping no predicted span

    Returns:
        Dict with keys "TP", "FP", "FN", "Precision", "Recall", "F1-Score".
    """
    gold_spans = parse_span_file(gold_file)
    pred_spans = parse_span_file(pred_file)

    tp = len(overlapping_spans(gold_spans, pred_spans))
    fp = len(non_overlapping_spans(pred_spans, gold_spans))
    fn = len(non_overlapping_spans(gold_spans, pred_spans))

    eps = 0.000001  # keeps the divisions defined when a count is zero
    recall = tp / float(tp + fn + eps)
    precision = tp / float(tp + fp + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)

    return {
        "TP": tp,
        "FP": fp,
        "FN": fn,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
    }

In [None]:
import pickle
from itertools import chain
import os
import sklearn_crfsuite

def load_crf_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    NOTE(review): unpickling can execute arbitrary code; only load model
    files that were produced by a trusted source.
    """
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)

def predict_with_crf(crf_model, X_test):
    """Return ``crf_model.predict(X_test)`` for an already-loaded model."""
    predictions = crf_model.predict(X_test)
    return predictions

In [65]:
def evaluate_model_predictions(y_test, y_pred, fold, results_df, model_save_dir, folder):
    """Score one fold's predictions and append the metrics to ``results_df``.

    The flattened gold and predicted labels are written to
    ``<topic>_<fold>.gold.raw`` and ``<topic>_<fold>.pred.raw`` inside
    ``<model_save_dir>/<folder>``, converted to ``.span`` files, and scored
    at sentence level and at annotation (span) level.

    Args:
        y_test: Gold label sequences (iterable of iterables of str).
        y_pred: Predicted label sequences, aligned with ``y_test``.
        fold: Fold number; used in file names and in the result row.
        results_df: DataFrame to which the new result row is appended.
        model_save_dir: Root output directory.
        folder: Topic id (string); used as sub-folder and file-name prefix.

    Returns:
        A new DataFrame: ``results_df`` plus one row for this fold.
    """
    topic_id = folder
    y_test_flat = list(chain.from_iterable(y_test))
    y_pred_flat = list(chain.from_iterable(y_pred))

    # Make sure the topic folder exists before writing into it.
    topic_dir = os.path.join(model_save_dir, topic_id)
    os.makedirs(topic_dir, exist_ok=True)

    file_prefix = os.path.join(topic_dir, f"{topic_id}_{fold}")
    pred_file_path = f"{file_prefix}.pred.raw"
    gold_file_path = f"{file_prefix}.gold.raw"
    pred_span_path = f"{file_prefix}.pred.span"
    gold_span_path = f"{file_prefix}.gold.span"

    # Predictions go to the .pred file and gold labels to the .gold file.
    # Writing them the other way round reverses the roles of gold and
    # prediction in the span-level evaluation below.
    with open(pred_file_path, "w") as pred_file, open(gold_file_path, "w") as gold_file:
        pred_file.write("\n".join(y_pred_flat))
        gold_file.write("\n".join(y_test_flat))

    precision, recall, f1_score = sentence_level_results(y_test_flat, y_pred_flat)
    print(f"Avg Precision: {precision:.4f}, Avg Recall: {recall:.4f}, Avg F1-Score: {f1_score:.4f}")

    # Create spans and save to .span files
    load_labels_and_create_spans(pred_file_path, pred_span_path)
    load_labels_and_create_spans(gold_file_path, gold_span_path)

    # Evaluate the annotations
    metrics = evaluate_annotations(gold_span_path, pred_span_path)
    print(metrics)

    new_row = pd.DataFrame([{
        "Topic ID": topic_id,
        "Fold": fold,
        "Sentence Precision": precision,
        "Sentence Recall": recall,
        "Sentence F1-Score": f1_score,
        "Annotation TP": metrics["TP"],
        "Annotation FP": metrics["FP"],
        "Annotation FN": metrics["FN"],
        "Annotation Precision": metrics["Precision"],
        "Annotation Recall": metrics["Recall"],
        "Annotation F1-Score": metrics["F1-Score"],
    }], columns=results_df.columns)

    results_df = pd.concat([results_df, new_row], ignore_index=True)

    return results_df

In [66]:
def test_crf_model(model_save_dir, folders_to_process, path, df):
    for folder in folders_to_process:
        
        if folder in os.listdir(path):
            
            ## Dataframe to store the results
            results_df = pd.DataFrame(columns=["Topic ID", "Fold", "Sentence Precision", "Sentence Recall", "Sentence F1-Score",
                                       "Annotation TP", "Annotation FP", "Annotation FN", 
                                       "Annotation Precision", "Annotation Recall", "Annotation F1-Score"])
            for fold in range(5):
                print(f"Processing topic {folder}, fold {fold}")
                ### Load the saved model for the topic and fold
                fold_model_path = os.path.join(model_save_dir, folder, f"{folder}_crf_model_{fold}.pkl")
                if os.path.exists(fold_model_path):
                    crf_model = load_crf_model(fold_model_path)
                    
                    # Load test data splits
                    test = read_split(f'{path}/{folder}/{folder}-{fold}.cache')
                    
                    ## filter data for topic
                    df_ = df[df['topic_id'] == int(folder)]
                    df_ = df_.dropna()
                    df_['doc_id'] = df_.apply(map_doc, axis=1)

                    df_test = df_[df_['doc_id'].isin(test)]
                    
                    ## genertae test data
                    test_unbalanced, test_balanced = stratify(df_test)

                    # Extract features and predict
                    test_sentences = test_unbalanced['sentence']
                    test_labels = test_unbalanced['label']
                    
                    test_extracted = [process_text_and_extract_features(sentence, label) for sentence, label in zip(test_sentences, test_labels)]
                    X_test = [features for features, _ in test_extracted]
                    y_test = [labels for _, labels in test_extracted]
                    
                    ## Predict
                    y_pred = predict_with_crf(crf_model, X_test)
                    results_df = evaluate_model_predictions(y_test, y_pred, fold, results_df, model_save_dir, folder)
                    
                else:
                    print(f"Model file not found: {fold_model_path}")
                    
            print("\n Completed all folds")
            # Save results to CSV
            results_csv_path = os.path.join(model_save_dir, folder, "final_results.csv")
            results_df.to_csv(results_csv_path, index=False)
            print(f"Results saved to {results_csv_path}")
            
    return results_df

In [1]:
# Evaluation run: directory with the split folders, directory with the saved
# models, and the list of topic ids to evaluate.
path = 'core/qrels/'
model_save_dir = 'raw_data_exp_25_04_24'
folders_to_process = ['1272', '1474','1238', '1275', '1239', '1520', '1509', '1240', '1308', '1319', '1439', 
                 '1267', '1242', '1462', '1265', '1444', '1312', '1244', '1243', '1468', '1309', '1524', 
                 '1247', '1440', '1251', '1249', '1248', '1262', '1250', '1252', '1245', '1512', '1498', 
                 '1601', '1443', '1086', '1551', '1253', '1320', '1304', '1469', '1611', '1300', '1489', 
                 '1500', '1261', '1318', '1460', '1475', '1321']


# test_crf_model returns the results table of the last topic it processed;
# the per-topic tables are written to CSV inside the function.
results_df = test_crf_model(model_save_dir, folders_to_process, path, df)

### Calculating mean across topics

In [None]:
# Step 1: Read the per-topic result tables from where test_crf_model wrote them
# (<model_save_dir>/<topic_id>/final_results.csv).
dataframes = []
for topic_id in folders_to_process:
    csv_path = os.path.join(model_save_dir, str(topic_id), "final_results.csv")
    print(csv_path)
    if not os.path.exists(csv_path):
        # Topics that were skipped during evaluation have no results file.
        print(f"Results file not found: {csv_path}")
        continue
    # A dedicated name is used so the sentence-level dataset held in `df`
    # is not overwritten by a results table.
    topic_results = pd.read_csv(csv_path)
    dataframes.append(topic_results)

# Step 2: Concatenate all DataFrames
combined_df = pd.concat(dataframes)
combined_df.head()

In [None]:
# Per-topic averages over the folds; the fold number is not a metric, so it is
# removed before averaging.
topic_means = combined_df.drop(columns='Fold').groupby('Topic ID').mean()

# Macro average: mean of the per-topic means.
overall_means = topic_means.mean()

# Print results
for heading, table in (
    ("Mean Metrics for Each Topic:", topic_means),
    ("\nOverall Mean Metrics Across All Topics:", overall_means),
):
    print(heading)
    print(table)