## In this code, I am running a CRFsuite model using folds created by Original paper.

### Running for all 5 folds for one topic

In [1]:
from sklearn_crfsuite import CRF
import os

# Function to load data

def load_data(filenames):
    X, y = [], []
    total_lines_read = 0
    total_lines_skipped = 0

    for filename in filenames:
        with open(filename, 'r') as f:
            lines_read = 0
            lines_skipped = 0
            X_fold, y_fold = [], []
            for line in f:
                lines_read += 1
                parts = line.strip().split()
                if not parts:
                    lines_skipped += 1
                    continue
                label = parts[0]
                features = {feat.split(':')[0]: float(feat.split(':')[1]) for feat in parts[1:]}
                X_fold.append([features])
                y_fold.append(label)
            X.extend(X_fold)
            y.extend(y_fold)
            
            print(f"File: {filename}")
            print(f"  Lines read: {lines_read}")
            print(f"  Lines skipped: {lines_skipped}")
            
            total_lines_read += lines_read
            total_lines_skipped += lines_skipped

    print("Total across all folds:")
    print(f"  Total lines read: {total_lines_read}")
    print(f"  Total lines skipped: {total_lines_skipped}")
    
    return X, y


In [2]:
# Directory containing the fold files

# dir_path = './core-tech/crf-tmp-1086/'
dir_path = './core-tech/crf-tmp-1439/'

In [4]:
# Lists to store combined predictions and gold labels

all_preds = []
all_golds = []

for i in range(5):
    print(f"Running pass {i}")
    
    # Identify training and testing files
    fold_files = [f for f in os.listdir(dir_path) if f.startswith('fold.')]
    training_files = [f for f in fold_files if not f.endswith(f"{i}")]
    testing_file = [f for f in fold_files if f.endswith(f"{i}")][0]
    
    print("training_files", training_files)
    print("testing_file", testing_file)
    
    # Load training and testing data
    X_train, y_train = load_data([os.path.join(dir_path, f) for f in training_files])
    X_test, y_test = load_data([os.path.join(dir_path, testing_file)])
    
    # Train the model
    crf = CRF(algorithm="pa",c=0.1, max_iterations=100, pa_type=2, verbose=True)
    crf.fit(X_train, y_train)
    
    # Predict on the testing set
    y_pred = crf.predict(X_test)
    
    # Save predictions and gold labels for this fold
    all_preds.extend([label for sublist in y_pred for label in sublist])
    all_golds.extend(y_test)

# After the loop, total counts
print(f"Total rows in all_preds: {len(all_preds)}")
print(f"Total rows in all_golds: {len(all_golds)}")

Running pass 0
training_files ['fold.3', 'fold.4', 'fold.2', 'fold.1']
testing_file fold.0
File: ./core-tech/crf-tmp-1439/fold.3
  Lines read: 51950
  Lines skipped: 25
File: ./core-tech/crf-tmp-1439/fold.4
  Lines read: 57876
  Lines skipped: 25
File: ./core-tech/crf-tmp-1439/fold.2
  Lines read: 53269
  Lines skipped: 25
File: ./core-tech/crf-tmp-1439/fold.1
  Lines read: 46586
  Lines skipped: 25
Total across all folds:
  Total lines read: 209681
  Total lines skipped: 100
File: ./core-tech/crf-tmp-1439/fold.0
  Lines read: 52686
  Lines skipped: 25
Total across all folds:
  Total lines read: 52686
  Total lines skipped: 25


loading training data to CRFsuite: 100%|█| 209581/209581 [00:18<00:00, 11338.77i



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1021444
Seconds required: 6.963

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=0.60  loss=844.92   feature_norm=0.61
Iter 2   time=0.55  loss=585.89   feature_norm=0.81
Iter 3   time=0.54  loss=512.78   feature_norm=0.95
Iter 4   time=0.52  loss=444.48   feature_norm=1.06
Iter 5   time=0.57  loss=427.03   feature_norm=1.15
Iter 6   time=0.56  loss=437.47   feature_norm=1.25
Iter 7   time=0.53  loss=416.91   feature_norm=1.33
Iter 8   time=0.52  loss=363.63   feature_norm=1.40
Iter 9   time=0.52  loss=372.71   feature_norm=1.47
Iter 10  time=0.52  loss=380.65   feature_norm=1.53
Iter 11  time=0.57  loss=358.90   feature_norm=1.59
Iter 12  time=0.54  loss=387.73   feature_norm=1.65
Iter 13  time=0.53  loss=354.34   feature_n

loading training data to CRFsuite: 100%|█| 215681/215681 [00:22<00:00, 9625.67it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1046651
Seconds required: 7.171

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=0.60  loss=839.55   feature_norm=0.61
Iter 2   time=0.56  loss=607.68   feature_norm=0.80
Iter 3   time=0.56  loss=535.41   feature_norm=0.94
Iter 4   time=0.56  loss=476.15   feature_norm=1.05
Iter 5   time=0.56  loss=414.48   feature_norm=1.15
Iter 6   time=0.56  loss=408.03   feature_norm=1.23
Iter 7   time=0.57  loss=396.53   feature_norm=1.30
Iter 8   time=0.57  loss=404.94   feature_norm=1.38
Iter 9   time=0.58  loss=379.43   feature_norm=1.45
Iter 10  time=0.58  loss=394.46   feature_norm=1.51
Iter 11  time=0.58  loss=356.05   feature_norm=1.56
Iter 12  time=0.57  loss=368.56   feature_norm=1.62
Iter 13  time=0.56  loss=371.28   feature_n

loading training data to CRFsuite: 100%|█| 208998/208998 [00:26<00:00, 7911.09it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1037365
Seconds required: 8.045

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=0.72  loss=849.78   feature_norm=0.61
Iter 2   time=0.72  loss=592.36   feature_norm=0.80
Iter 3   time=0.71  loss=529.79   feature_norm=0.94
Iter 4   time=0.71  loss=482.66   feature_norm=1.05
Iter 5   time=0.72  loss=458.97   feature_norm=1.15
Iter 6   time=0.71  loss=430.38   feature_norm=1.24
Iter 7   time=0.72  loss=413.56   feature_norm=1.31
Iter 8   time=0.71  loss=397.36   feature_norm=1.38
Iter 9   time=0.72  loss=348.78   feature_norm=1.44
Iter 10  time=0.71  loss=370.98   feature_norm=1.51
Iter 11  time=0.72  loss=387.36   feature_norm=1.57
Iter 12  time=0.72  loss=384.18   feature_norm=1.63
Iter 13  time=0.71  loss=382.71   feature_n

loading training data to CRFsuite: 100%|█| 210317/210317 [00:24<00:00, 8436.06it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1045136
Seconds required: 7.799

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=0.76  loss=843.31   feature_norm=0.59
Iter 2   time=0.75  loss=547.79   feature_norm=0.77
Iter 3   time=0.75  loss=483.35   feature_norm=0.91
Iter 4   time=0.76  loss=462.92   feature_norm=1.02
Iter 5   time=0.76  loss=450.17   feature_norm=1.12
Iter 6   time=0.75  loss=432.86   feature_norm=1.22
Iter 7   time=0.75  loss=377.68   feature_norm=1.29
Iter 8   time=0.76  loss=398.86   feature_norm=1.36
Iter 9   time=0.75  loss=350.97   feature_norm=1.41
Iter 10  time=0.76  loss=331.66   feature_norm=1.46
Iter 11  time=0.75  loss=365.07   feature_norm=1.53
Iter 12  time=0.76  loss=332.38   feature_norm=1.58
Iter 13  time=0.78  loss=331.80   feature_n

loading training data to CRFsuite: 100%|█| 204391/204391 [00:25<00:00, 7974.14it



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 976543
Seconds required: 7.144

Passive Aggressive
type: 2
c: 0.100000
error_sensitive: 1
averaging: 1
max_iterations: 100
epsilon: 0.000000

Iter 1   time=0.58  loss=782.47   feature_norm=0.59
Iter 2   time=0.57  loss=533.14   feature_norm=0.78
Iter 3   time=0.56  loss=454.03   feature_norm=0.90
Iter 4   time=0.56  loss=454.05   feature_norm=1.02
Iter 5   time=0.56  loss=415.94   feature_norm=1.11
Iter 6   time=0.56  loss=378.14   feature_norm=1.19
Iter 7   time=0.56  loss=390.55   feature_norm=1.26
Iter 8   time=0.56  loss=384.56   feature_norm=1.33
Iter 9   time=0.56  loss=358.54   feature_norm=1.40
Iter 10  time=0.56  loss=350.94   feature_norm=1.46
Iter 11  time=0.56  loss=339.81   feature_norm=1.51
Iter 12  time=0.56  loss=333.71   feature_norm=1.56
Iter 13  time=0.57  loss=317.07   feature_no

In [5]:
# # After the loop, report total counts
# print(f"Total rows in all_preds: {len(all_preds)}")
# print(f"Total rows in all_golds: {len(all_golds)}")

Total rows in all_preds: 262242
Total rows in all_golds: 262242


In [47]:
# Save combined predictions and gold labels to files
field = '1439_test_pa'

with open(f"{field}.pred.raw", "w") as pred_file:
    pred_file.write("\n".join(all_preds))

with open(f"{field}.gold.raw", "w") as gold_file:
    gold_file.write("\n".join(all_golds))

print("Post-Processing done.")

Post-Processing done.


In [57]:
# Load predictions from saved files
with open(f"{field}.pred.raw", "r") as file:
    y_pred_flat = [line.strip() for line in file.readlines()]

In [58]:
# Load gold from saved files
with open(f"{field}.gold.raw", "r") as file:
    y_test_flat = [line.strip() for line in file.readlines()]

In [59]:
len(y_pred_flat), len(y_test_flat)

(262242, 262242)

In [60]:
set(y_pred_flat)

{'1', 'B'}

In [61]:
set(y_test_flat)

{'1', 'B'}

In [62]:
B_string_count_pred = y_pred_flat.count('B')
B_string_count_gold = y_test_flat.count('B')

print("B - count in predictions:", B_string_count_pred)
print("B - count in Actual:", B_string_count_gold)

B - count in predictions: 261846
B - count in Actual: 261705


In [63]:
one_string_count_pred = y_pred_flat.count('1')
one_string_count_gold = y_test_flat.count('1')

print("1 - count in predictions:", one_string_count_pred)
print("1 - count in Actual:", one_string_count_gold)

1 - count in predictions: 396
1 - count in Actual: 537


In [15]:
tp, fp, fn = 0, 0, 0
for gold, pred in zip(y_test_flat, y_pred_flat):
    # Skip empty strings
    if gold == '' or pred == '':
        continue
    gold = 0 if gold == 'B' else int(gold)
    pred = 0 if pred == 'B' else int(pred)
    
    if gold == 1 and pred == 1:
        tp += 1
    elif gold != 1 and pred == 1:
        fp += 1
    elif gold == 1 and pred != 1:
        fn += 1

# Compute metrics
eps = 1e-6
precision = tp / (tp + fp + eps)
recall = tp / (tp + fn + eps)
f1 = 2 * (precision * recall) / (precision + recall + eps)

print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Precision: 0.8737373715309662, Recall: 0.6443202967517313, F1: 0.741692971780551


### Loading the files from directory where gold and pred files are saved after running CLI experiment for the same topic

In [36]:
# Load predictions from predictions.txt
with open(f"{dir_path}/1439_testing_CL_pa.pred.raw", "r") as file:
    y_pred_flat_1 = [line.strip() for line in file.readlines()]

In [37]:
# Load gold from gold.txt
with open(f"{dir_path}/1439_testing_CL_pa.gold.raw", "r") as file:
    y_test_flat_1 = [line.strip() for line in file.readlines()]

In [38]:
set(y_pred_flat_1)

{'', '1', 'B'}

In [39]:
set(y_test_flat_1)

{'', '1', 'B'}

In [40]:
empty_string_count = y_test_flat_1.count('')
empty_string_count

125

In [54]:
B_string_count_pred_1 = y_pred_flat_1.count('B')
B_string_count_gold_1 = y_test_flat_1.count('B')

print("B - count in predictions:", B_string_count_pred_1)
print("B - count in Actual:", B_string_count_gold_1)

B - count in predictions: 261712
B - count in Actual: 261705


In [55]:
one_string_count_pred_1 = y_pred_flat_1.count('1')
one_string_count_gold_1 = y_test_flat_1.count('1')

print("1 - count in predictions:", one_string_count_pred_1)
print("1 - count in Actual:", one_string_count_gold_1)

1 - count in predictions: 530
1 - count in Actual: 537


In [41]:
tp, fp, fn = 0, 0, 0
for gold, pred in zip(y_test_flat_1, y_pred_flat_1):
    # Skip empty strings
    if gold == '' or pred == '':
        continue
    gold = 0 if gold == 'B' else int(gold)
    pred = 0 if pred == 'B' else int(pred)
    
    if gold == 1 and pred == 1:
        tp += 1
    elif gold != 1 and pred == 1:
        fp += 1
    elif gold == 1 and pred != 1:
        fn += 1

# Compute metrics
eps = 1e-6
precision = tp / (tp + fp + eps)
recall = tp / (tp + fn + eps)
f1 = 2 * (precision * recall) / (precision + recall + eps)

print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

Precision: 0.9132075454467782, Recall: 0.9013035364966414, F1: 0.9072159931666566
