In [1]:
from data_loader import load_data, read_split, stratify, map_doc
from crf_model import train_crf_model, load_crf_model, predict_with_crf
from tokenizer import load_tokenizer
from features import extract_features_and_labels, process_text_and_extract_features
from evaluation import sentence_level_results, load_labels_and_create_spans, evaluate_annotations, evaluate_model_predictions
from train import train_loop
from test import test_loop
import pandas as pd

## Training CRF model

### Training function

In [3]:
def train_loop(train_data, model_save_dir, topic_id, fold, tokenizer=None):
    '''Train a CRF model on one fold's training data and save it to disk.

    Note: this definition shadows the `train_loop` imported from the `train`
    module in the notebook's import cell.

    Parameters
    ----------
    train_data : table-like with 'sentence' and 'label' columns
        Training examples for this fold.
    model_save_dir : str
        Root output directory; the model is written to
        `<model_save_dir>/<topic_id>/<topic_id>_crf_model_<fold>.pkl`.
    topic_id : str
        Topic identifier, used as sub-directory name and file-name prefix.
    fold : int
        Fold index, used in the model file name.
    tokenizer : optional
        Already-loaded tokenizer. When omitted, the tokenizer is loaded from
        the default path (the previous behaviour), which lets callers that
        loaded it once pass it in instead of re-reading it for every fold.
    '''
    import os

    train_sentences = train_data['sentence']
    train_labels = train_data['label']

    # Only hit the disk when the caller did not supply a tokenizer.
    if tokenizer is None:
        tokenizer = load_tokenizer('../../core-tech/custom_punkt_tokenizer.pkl')
    print("Preprocessing start")

    # Each call yields a (features, labels) pair for one training sentence.
    X_train, y_train = [], []
    for sentence, label in zip(train_sentences, train_labels):
        features, labels = process_text_and_extract_features(sentence, label, tokenizer)
        X_train.append(features)
        y_train.append(labels)

    print(f"Training model for topic {topic_id} for fold {fold}")

    # Create the per-topic directory if it does not exist
    fold_model_dir = os.path.join(model_save_dir, topic_id)
    os.makedirs(fold_model_dir, exist_ok=True)

    # Define the full path for the model file
    fold_model_path = os.path.join(fold_model_dir, f"{topic_id}_crf_model_{fold}.pkl")

    train_crf_model(X_train, y_train, fold_model_path)

    print(f"CRF model saved for topic {topic_id}")


In [7]:
def test_loop(test_unbalanced, model_save_dir, folder, fold, results_df):
    
    import pandas as pd
    import os
    
    fold_model_path = os.path.join(model_save_dir, folder, f"{folder}_crf_model_{fold}.pkl")
    
    if os.path.exists(fold_model_path):
        crf_model = load_crf_model(fold_model_path)

        # Extract features and predict
        test_sentences = test_unbalanced['sentence']
        test_labels = test_unbalanced['label']
        
        tokenizer = load_tokenizer('../../core-tech/custom_punkt_tokenizer.pkl')

        test_extracted = [process_text_and_extract_features(sentence, label, tokenizer) for sentence, label in zip(test_sentences, test_labels)]
        
        X_test = [features for features, _ in test_extracted]
        y_test = [labels for _, labels in test_extracted]

        ## Predict
        y_pred = predict_with_crf(crf_model, X_test)
        results_df = evaluate_model_predictions(y_test, y_pred, fold, results_df, model_save_dir, folder)

    else:
        print(f"Model file not found: {fold_model_path}")
    
    return results_df


### Paths and topic folders for loading the data with predefined folds (5-fold cross-validation)

In [2]:
import os

# Directory that is listed for topic folders; each topic folder holds the
# predefined split files named `<topic>-<split>.cache`.
path = '../../core-tech/core/qrels/'
# Tokenizer file handed to `load_tokenizer`.
tokenizer_path = '../../core-tech/custom_punkt_tokenizer.pkl'
# Dataset file handed to `load_data`.
data_path  = '../../core-tech/due_dilligence_data.csv'
# Output root: trained models and per-topic `final_results.csv` go under here.
model_save_dir = 'raw_data_exp_25_04_24'

# Longer list of topic folders, kept for reference; uncomment it (and remove
# the single-topic assignment below) to process all of them.
# folders_to_process = ['1272', '1474','1238', '1275', '1239', '1520', '1509', '1240', '1308', '1319', '1439', 
#                  '1267', '1242', '1462', '1265', '1444', '1312', '1244', '1243', '1468', '1309', '1524', 
#                  '1247', '1440', '1251', '1249', '1248', '1262', '1250', '1252', '1245', '1512', '1498', 
#                  '1601', '1443', '1086', '1551', '1253', '1320', '1304', '1469', '1611', '1300', '1489', 
#                  '1500', '1261', '1318', '1460', '1475', '1321']

# Topic folders processed in this run.
folders_to_process = [ '1524']

In [3]:
# Load the tokenizer once; the training cell passes it to train_loop / test_loop.
tokenizer = load_tokenizer(tokenizer_path)

### Data preparation and model training

In [4]:
# Load the dataset; the training cell filters it per topic via its `topic_id` column.
df = load_data(data_path)

In [5]:
# Train and evaluate a CRF model per topic folder, using the predefined
# cross-validation splits stored as `<folder>-<split>.cache` files under `path`.
available_folders = set(os.listdir(path))

for folder in folders_to_process:

    # Without split files there is nothing to train or evaluate. Skipping here
    # also avoids writing an empty results CSV into a directory that was never
    # created (which makes `to_csv` fail).
    if folder not in available_folders:
        print(f"Skipping topic {folder}: no folder named '{folder}' found in {path}")
        continue

    # Models and the results CSV for this topic all live in this directory.
    topic_output_dir = os.path.join(model_save_dir, folder)
    os.makedirs(topic_output_dir, exist_ok=True)

    ## Dataframe to store the results
    results_df = pd.DataFrame(columns=["Topic ID", "Fold", "Sentence Precision", "Sentence Recall", "Sentence F1-Score",
                               "Annotation TP", "Annotation FP", "Annotation FN", 
                               "Annotation Precision", "Annotation Recall", "Annotation F1-Score"])

    # Only fold 0 is run here; widen the range (up to the 5 predefined splits)
    # to run the remaining folds.
    for fold in range(1):

        ####### Data split into train and test sets #######
        print("For fold:", fold)
        test_split = fold
        train_split = [i for i in range(5) if i != test_split]

        test = read_split(f'{path}/{folder}/{folder}-{test_split}.cache')
        # Flatten the document ids of all training splits into one list.
        train = [doc for el in train_split
                 for doc in read_split(f'{path}/{folder}/{folder}-{el}.cache')]

        # Rows of this topic only, with incomplete rows removed.
        df_ = df[df['topic_id'] == int(folder)].dropna()
        df_['doc_id'] = df_.apply(map_doc, axis=1)

        df_train = df_[df_['doc_id'].isin(train)]
        df_test = df_[df_['doc_id'].isin(test)]

        ## generate train and test data (only the unbalanced variants are used)
        test_unbalanced, _ = stratify(df_test)
        train_unbalanced, _ = stratify(df_train)

        ######### Model training ##############

        ## train CRF model
        # Pass `folder` itself (not str(int(folder))) so the model is saved
        # under exactly the name test_loop and the results CSV use.
        train_loop(train_unbalanced, model_save_dir, folder, fold, tokenizer)

        ## test CRF model
        results_df = test_loop(test_unbalanced, model_save_dir, folder, fold, results_df, tokenizer)

    print("\n Completed all folds")
    # Save results to CSV
    results_csv_path = os.path.join(topic_output_dir, "final_results.csv")
    results_df.to_csv(results_csv_path, index=False)
    print(f"Results saved to {results_csv_path}")

For fold: 0
Preprocessing start
Training model for topic 1524 for fold 0


loading training data to CRFsuite: 100%|█| 341920/341920 [01:28<00:00, 3865.10it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1886534
Seconds required: 24.647

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=1.89  loss=16241.26 feature_norm=3.65
Iter 2   time=1.63  loss=12468.46 feature_norm=4.75
Iter 3   time=1.61  loss=11225.03 feature_norm=5.58
Iter 4   time=1.65  loss=10435.33 feature_norm=6.28
Iter 5   time=1.64  loss=9472.83  feature_norm=6.82
Iter 6   time=1.65  loss=9353.43  feature_norm=7.31
Iter 7   time=1.78  loss=8362.64  feature_norm=7.75
Iter 8   time=1.78  loss=8171.97  feature_norm=8.19
Iter 9   time=1.75  loss=7443.61  feature_norm=8.61
Iter 10  time=1.77  loss=8024.59  feature_norm=8.95
Iter 11  time=1.77  loss=7497.34  feature_norm=9.27
Iter 12  time=1.74  loss=7242.60  feature_norm=9.55
Iter 13  time=1.73  loss=6906.92  feature_

In [6]:
results_df

Unnamed: 0,Topic ID,Fold,Sentence Precision,Sentence Recall,Sentence F1-Score,Annotation TP,Annotation FP,Annotation FN,Annotation Precision,Annotation Recall,Annotation F1-Score
0,1524,0,0.925778,0.692116,0.792073,86,6,15,0.934783,0.851485,0.891191


### Calculating mean across topics

In [None]:
# Step 1: Read each topic's results CSV from where the training cell saved it,
# i.e. <model_save_dir>/<topic>/final_results.csv.
dataframes = []
for topic_id in folders_to_process:
    csv_path = os.path.join(model_save_dir, str(topic_id), "final_results.csv")
    print(csv_path)
    # Appended directly rather than bound to `df`, so the dataset loaded
    # earlier in the notebook is not overwritten.
    dataframes.append(pd.read_csv(csv_path))

# Step 2: Concatenate all DataFrames
combined_df = pd.concat(dataframes)
combined_df.head()

In [None]:
# Per-topic averages over all folds; the fold index is an identifier rather
# than a metric, so its column is removed after averaging.
topic_means = (
    combined_df
    .groupby('Topic ID')
    .mean()
    .drop(columns='Fold')
)

# Mean of the per-topic means, i.e. every topic carries equal weight.
overall_means = topic_means.mean()

# Report both tables, each under its own heading.
print(
    "Mean Metrics for Each Topic:",
    topic_means,
    "\nOverall Mean Metrics Across All Topics:",
    overall_means,
    sep="\n",
)