First, let us count the number of Males, Females and Unknowns recorded in the ground-truth annotations, both per text category and overall. These totals will be used later when discussing potential biases in news.

In [10]:
import pandas as pd

# 1. Read in your CSV
df = pd.read_csv('thesis_NLP_component.csv')

# 2. Strip any stray whitespace.
#    The headers are normalised first so that the column look-ups below keep
#    working even if the CSV header row carries leading/trailing spaces.
df.columns = df.columns.str.strip()
df['Ground truth'] = df['Ground truth'].str.strip()
df['Text-Category'] = df['Text-Category'].str.strip()

# 3. Extract the three counts into their own integer columns
gt = df['Ground truth'].str.extract(
    r'Male:\s*(\d+)\s*Female:\s*(\d+)\s*Unknown:\s*(\d+)',
    expand=True
)
gt.columns = ['Male', 'Female', 'Unknown']

# Rows where the pattern is not found come back as NaN. Report them explicitly
# instead of letting the integer cast fail with an opaque NaN conversion error.
unparsed = gt.isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} 'Ground truth' value(s) do not match "
        f"'Male: X Female: Y Unknown: Z' (row labels: {list(gt.index[unparsed])})"
    )
gt = gt.astype(int)

# 4. Combine with category and group: one row per category, summed counts
counts = pd.concat([df['Text-Category'], gt], axis=1)
per_category = counts.groupby('Text-Category').sum()

# 5. Compute overall totals (column-wise sum over all categories)
overall = per_category.sum()

print("Counts per category:")
print(per_category)

print("\nOverall totals:")
print(f"Male:    {overall['Male']}")
print(f"Female:  {overall['Female']}")
print(f"Unknown: {overall['Unknown']}")

Counts per category:
                              Male  Female  Unknown
Text-Category                                      
Business and Finance           134      20       13
Crime and Justice              597     192       63
Economy and Trade               22      13        5
Education and Research          32       6        4
Entertainment and Media        363     179       34
Environment and Climate         48       9        8
Health and Medicine            264      78       47
Infrastructure and Transport    44       8        8
Politics                        83      22        5
Society and Culture            229     106       62
Sports                          54      14       19
Technology and Science          29      15        4

Overall totals:
Male:    1899
Female:  662
Unknown: 272


MAE (Mean Absolute Error): the average absolute difference between predicted and true counts per class per article.

In [None]:
import re

# 1. Load the data; the header names are trimmed of stray whitespace on the way in
df = pd.read_csv('thesis_NLP_component.csv').rename(columns=str.strip)

# 2. Trim leading/trailing spaces from the values of each count column
count_columns = ('Deepseek:32b', 'Named entity recognition', 'Ground truth')
for name in count_columns:
    df[name] = df[name].str.strip()

# 3. Turn count strings such as "Male: 1 Female: 3 Unknown: 0" into integers
def parse_counts(series):
    """Extract the Male/Female/Unknown counts from a Series of count strings.

    Returns a DataFrame indexed like ``series`` with integer columns
    'Male', 'Female' and 'Unknown'. Entries in which the pattern is not
    found yield 0 in all three columns.
    """
    count_pattern = r'Male:\s*(\d+)\s*Female:\s*(\d+)\s*Unknown:\s*(\d+)'
    extracted = series.str.extract(count_pattern, expand=True)
    extracted.columns = ['Male', 'Female', 'Unknown']
    # Non-matching rows are NaN after extraction; zero them so the cast succeeds
    zero_filled = extracted.fillna(0)
    return zero_filled.astype(int)

# 4. Convert each count column into a Male/Female/Unknown integer frame
deepseek, ground_truth, ner = (
    parse_counts(df[column])
    for column in ('Deepseek:32b', 'Ground truth', 'Named entity recognition')
)

# 5. Function to compute MAE between two DataFrames of counts
def compute_mae(pred, true):
    """Return the mean absolute error between two count DataFrames.

    Parameters
    ----------
    pred, true : DataFrame
        Count frames with one row per article and the three class columns
        ('Male', 'Female', 'Unknown'); they are subtracted element-wise.

    Returns
    -------
    float
        Sum of absolute differences divided by (number of rows in ``pred``
        times 3 classes).
    """
    abs_error = (pred - true).abs()
    total_abs_error = abs_error.values.sum()
    # Normalise by the rows actually passed in rather than the module-level
    # ``df``, so the result is also correct for subsets of the articles.
    return total_abs_error / (len(pred) * 3)  # 3 classes per article

# 6. MAE for every pairing: each model against the ground truth, and the two models against each other
mae_deep_gt, mae_ner_gt, mae_deep_ner = (
    compute_mae(predicted, reference)
    for predicted, reference in (
        (deepseek, ground_truth),
        (ner, ground_truth),
        (deepseek, ner),
    )
)

# 7. Report the three values, one per line
print(
    f"MAE (Deepseek vs Ground Truth): {mae_deep_gt:.4f}",
    f"MAE (NER vs Ground Truth):      {mae_ner_gt:.4f}",
    f"MAE (Deepseek vs NER):          {mae_deep_ner:.4f}",
    sep="\n",
)

MAE (Deepseek vs Ground Truth): 0.3532
MAE (NER vs Ground Truth):      0.8670
MAE (Deepseek vs NER):          1.0115


Now the same MAE computation, broken down per text category:

In [None]:
# 1. Load the data and trim stray whitespace from the header names
df = pd.read_csv('thesis_NLP_component.csv')
df.columns = [header.strip() for header in df.columns]

# Trim whitespace from the category column and from the three count columns
for text_column in ('Text-Category', 'Deepseek:32b', 'Named entity recognition', 'Ground truth'):
    df[text_column] = df[text_column].str.strip()

# 2. Parse "Male: X Female: Y Unknown: Z" strings into integer columns
def parse_counts(series):
    """Return a DataFrame of integer 'Male', 'Female', 'Unknown' counts.

    The three numbers are extracted from each string of ``series``; the
    result keeps the index of ``series``. Strings in which the pattern is
    not found produce a row of zeros.
    """
    labels = ['Male', 'Female', 'Unknown']
    matched = series.str.extract(
        r'Male:\s*(\d+)\s*Female:\s*(\d+)\s*Unknown:\s*(\d+)',
        expand=True,
    )
    matched.columns = labels
    without_gaps = matched.fillna(0)  # unmatched rows are NaN before this
    return without_gaps.astype(int)

# 3. Mean absolute error between two count frames
def compute_mae(pred, true):
    """Return the MAE between ``pred`` and ``true``.

    The absolute element-wise differences are summed and divided by the
    number of rows in ``pred`` times 3 (one cell per class per article).
    """
    n_cells = len(pred) * 3
    total_abs_error = (pred - true).abs().values.sum()
    return total_abs_error / n_cells

# 4. Build one integer count frame per source column
deepseek, ner, ground_truth = map(
    parse_counts,
    (df['Deepseek:32b'], df['Named entity recognition'], df['Ground truth']),
)

# 5. One result row per category: the MAE of each pairing, restricted to
#    the rows (index labels) that belong to that category
category_groups = df.groupby('Text-Category').groups
results = [
    {
        'Text Category': category,
        'MAE (Deepseek vs GT)': compute_mae(deepseek.loc[rows], ground_truth.loc[rows]),
        'MAE (NER vs GT)': compute_mae(ner.loc[rows], ground_truth.loc[rows]),
        'MAE (Deepseek vs NER)': compute_mae(deepseek.loc[rows], ner.loc[rows]),
    }
    for category, rows in category_groups.items()
]

# 6. Tabulate the rows, indexed by category, and show the table
mae_by_category = pd.DataFrame(results).set_index('Text Category')
print(mae_by_category)


                              MAE (Deepseek vs GT)  MAE (NER vs GT)  \
Text Category                                                         
Business and Finance                      0.133333         0.287719   
Crime and Justice                         0.366667         1.269444   
Economy and Trade                         0.150000         0.666667   
Education and Research                    0.066667         0.450000   
Entertainment and Media                   0.431579         1.210526   
Environment and Climate                   0.250000         0.466667   
Health and Medicine                       0.936842         0.859649   
Infrastructure and Transport              0.105263         0.684211   
Politics                                  0.150000         0.850000   
Society and Culture                       0.196491         0.877193   
Sports                                    0.233333         1.083333   
Technology and Science                    0.066667         0.550000   

     

Next, the MAE broken down per gender class (Male, Female, Unknown):

In [3]:
# 1. Element-wise absolute errors for the three pairings
err_ds_gt, err_ner_gt, err_ds_ner = (
    (left - right).abs()
    for left, right in (
        (deepseek, ground_truth),
        (ner, ground_truth),
        (deepseek, ner),
    )
)

# 2. Per-class MAE: the mean of each class column of the error frames
mae_ds_gt_male, mae_ds_gt_female, mae_ds_gt_unknown = (
    err_ds_gt[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)
mae_ner_gt_male, mae_ner_gt_female, mae_ner_gt_unknown = (
    err_ner_gt[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)
mae_ds_ner_male, mae_ds_ner_female, mae_ds_ner_unknown = (
    err_ds_ner[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)

# 3. Report each pairing as a heading followed by its three class values;
#    the trailing "\n" on the Unknown line leaves a blank line between groups
print(
    "MAE (Deepseek vs Ground Truth):",
    f"  Male:    {mae_ds_gt_male:.4f}",
    f"  Female:  {mae_ds_gt_female:.4f}",
    f"  Unknown: {mae_ds_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "MAE (NER vs Ground Truth):",
    f"  Male:    {mae_ner_gt_male:.4f}",
    f"  Female:  {mae_ner_gt_female:.4f}",
    f"  Unknown: {mae_ner_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "MAE (Deepseek vs NER):",
    f"  Male:    {mae_ds_ner_male:.4f}",
    f"  Female:  {mae_ds_ner_female:.4f}",
    f"  Unknown: {mae_ds_ner_unknown:.4f}",
    sep="\n",
)


MAE (Deepseek vs Ground Truth):
  Male:    0.3067
  Female:  0.1440
  Unknown: 0.6088

MAE (NER vs Ground Truth):
  Male:    1.2676
  Female:  0.7700
  Unknown: 0.5634

MAE (Deepseek vs NER):
  Male:    1.2958
  Female:  0.7355
  Unknown: 1.0031


MSE (Mean Squared Error): the average of the squared differences between predicted and true counts, penalising larger errors more heavily.

In [18]:
# Each article contributes three cells (Male, Female, Unknown) to the mean
n_cells = len(df) * 3

# MSE per pairing: sum of squared differences over all cells, divided by the cell count
mse_deep_gt = (deepseek - ground_truth).pow(2).values.sum() / n_cells
mse_ner_gt = (ner - ground_truth).pow(2).values.sum() / n_cells
mse_deep_ner = (deepseek - ner).pow(2).values.sum() / n_cells

print(
    f"MSE (Deepseek vs Ground Truth): {mse_deep_gt:.4f}",
    f"MSE (NER vs Ground Truth):      {mse_ner_gt:.4f}",
    f"MSE (Deepseek vs NER):          {mse_deep_ner:.4f}",
    sep="\n",
)

MSE (Deepseek vs Ground Truth): 21.5206
MSE (NER vs Ground Truth):      4.4799
MSE (Deepseek vs NER):          25.5477


RMSE (Root Mean Squared Error): the square root of MSE, giving error in the same units as the original counts.

In [19]:
import math

# RMSE is the square root of the corresponding MSE computed earlier
rmse_deep_gt, rmse_ner_gt, rmse_deep_ner = map(
    math.sqrt,
    (mse_deep_gt, mse_ner_gt, mse_deep_ner),
)

print(
    f"RMSE (Deepseek vs Ground Truth): {rmse_deep_gt:.4f}",
    f"RMSE (NER vs Ground Truth):      {rmse_ner_gt:.4f}",
    f"RMSE (Deepseek vs NER):          {rmse_deep_ner:.4f}",
    sep="\n",
)

RMSE (Deepseek vs Ground Truth): 4.6390
RMSE (NER vs Ground Truth):      2.1166
RMSE (Deepseek vs NER):          5.0545


Now the MSE and RMSE of each model pairing, broken down per text category:

In [6]:
# 1. Load the data, then trim whitespace from the headers and from the
#    category column and the three count columns
df = pd.read_csv('thesis_NLP_component.csv')
df.columns = [name.strip() for name in df.columns]
text_columns = ['Text-Category', 'Deepseek:32b', 'Named entity recognition', 'Ground truth']
for name in text_columns:
    df[name] = df[name].str.strip()

# 2. Helper that parses "Male: X Female: Y Unknown: Z" strings
def parse_counts(series):
    """Return integer 'Male', 'Female', 'Unknown' counts for each entry.

    The result is a DataFrame indexed like ``series``. Entries in which
    the pattern is not found are reported as zeros.
    """
    counts = (
        series.str
        .extract(r'Male:\s*(\d+)\s*Female:\s*(\d+)\s*Unknown:\s*(\d+)', expand=True)
        .fillna(0)      # unmatched entries are NaN at this point
        .astype(int)
    )
    counts.columns = ['Male','Female','Unknown']
    return counts

# Parse the three count columns into integer frames
ground_truth = parse_counts(df['Ground truth'])
ner = parse_counts(df['Named entity recognition'])
deepseek = parse_counts(df['Deepseek:32b'])

# 3. Per-category MSE and RMSE for every pairing
results = []
for category, rows in df.groupby('Text-Category').groups.items():
    # Restrict each count frame to the articles of this category
    llm_counts = deepseek.loc[rows]
    ner_counts = ner.loc[rows]
    true_counts = ground_truth.loc[rows]
    cells = len(rows) * 3  # three classes per article

    # Mean squared error: squared differences summed over all cells
    mse_ds_gt = (llm_counts - true_counts).pow(2).values.sum() / cells
    mse_ner_gt = (ner_counts - true_counts).pow(2).values.sum() / cells
    mse_ds_ner = (llm_counts - ner_counts).pow(2).values.sum() / cells

    # RMSE values are the square roots of the MSE values above
    row = {
        'Text Category': category,
        'MSE (Deepseek vs GT)': mse_ds_gt,
        'MSE (NER vs GT)': mse_ner_gt,
        'MSE (Deepseek vs NER)': mse_ds_ner,
        'RMSE (Deepseek vs GT)': math.sqrt(mse_ds_gt),
        'RMSE (NER vs GT)': math.sqrt(mse_ner_gt),
        'RMSE (Deepseek vs NER)': math.sqrt(mse_ds_ner),
    }
    results.append(row)

# 4. Tabulate the rows, indexed by category, and show the table
metrics_by_category = pd.DataFrame(results).set_index('Text Category')
print(metrics_by_category)

                              MSE (Deepseek vs GT)  MSE (NER vs GT)  \
Text Category                                                         
Business and Finance                      0.217544         0.547368   
Crime and Justice                         0.783333         7.269444   
Economy and Trade                         0.183333         1.933333   
Education and Research                    0.066667         0.983333   
Entertainment and Media                   1.400000         7.484211   
Environment and Climate                   2.450000         1.733333   
Health and Medicine                     141.112281         3.715789   
Infrastructure and Transport              0.245614         1.280702   
Politics                                  0.150000         2.583333   
Society and Culture                       0.308772         6.561404   
Sports                                    0.300000         3.083333   
Technology and Science                    0.066667         1.016667   

     

MSE and RMSE broken down per gender class (Male, Female, Unknown):

In [5]:
import numpy as np

# 1. Element-wise squared errors for the three pairings
sq_err_ds_gt, sq_err_ner_gt, sq_err_ds_ner = (
    (left - right).pow(2)
    for left, right in (
        (deepseek, ground_truth),
        (ner, ground_truth),
        (deepseek, ner),
    )
)

# 2. Per-class MSE: the mean of each class column of the squared-error frames
mse_ds_gt_male, mse_ds_gt_female, mse_ds_gt_unknown = (
    sq_err_ds_gt[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)
mse_ner_gt_male, mse_ner_gt_female, mse_ner_gt_unknown = (
    sq_err_ner_gt[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)
mse_ds_ner_male, mse_ds_ner_female, mse_ds_ner_unknown = (
    sq_err_ds_ner[gender].mean() for gender in ('Male', 'Female', 'Unknown')
)

# 3. Per-class RMSE: square root of the matching MSE
rmse_ds_gt_male, rmse_ds_gt_female, rmse_ds_gt_unknown = map(
    np.sqrt, (mse_ds_gt_male, mse_ds_gt_female, mse_ds_gt_unknown)
)
rmse_ner_gt_male, rmse_ner_gt_female, rmse_ner_gt_unknown = map(
    np.sqrt, (mse_ner_gt_male, mse_ner_gt_female, mse_ner_gt_unknown)
)
rmse_ds_ner_male, rmse_ds_ner_female, rmse_ds_ner_unknown = map(
    np.sqrt, (mse_ds_ner_male, mse_ds_ner_female, mse_ds_ner_unknown)
)

# 4. Report: for each pairing an MSE group followed by an RMSE group; the
#    trailing "\n" on the Unknown line leaves a blank line between groups
print(
    "MSE (Deepseek vs Ground Truth):",
    f"  Male:    {mse_ds_gt_male:.4f}",
    f"  Female:  {mse_ds_gt_female:.4f}",
    f"  Unknown: {mse_ds_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "RMSE (Deepseek vs Ground Truth):",
    f"  Male:    {rmse_ds_gt_male:.4f}",
    f"  Female:  {rmse_ds_gt_female:.4f}",
    f"  Unknown: {rmse_ds_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "MSE (NER vs Ground Truth):",
    f"  Male:    {mse_ner_gt_male:.4f}",
    f"  Female:  {mse_ner_gt_female:.4f}",
    f"  Unknown: {mse_ner_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "RMSE (NER vs Ground Truth):",
    f"  Male:    {rmse_ner_gt_male:.4f}",
    f"  Female:  {rmse_ner_gt_female:.4f}",
    f"  Unknown: {rmse_ner_gt_unknown:.4f}\n",
    sep="\n",
)
print(
    "MSE (Deepseek vs NER):",
    f"  Male:    {mse_ds_ner_male:.4f}",
    f"  Female:  {mse_ds_ner_female:.4f}",
    f"  Unknown: {mse_ds_ner_unknown:.4f}\n",
    sep="\n",
)
print(
    "RMSE (Deepseek vs NER):",
    f"  Male:    {rmse_ds_ner_male:.4f}",
    f"  Female:  {rmse_ds_ner_female:.4f}",
    f"  Unknown: {rmse_ds_ner_unknown:.4f}",
    sep="\n",
)


MSE (Deepseek vs Ground Truth):
  Male:    0.7856
  Female:  0.2848
  Unknown: 63.4914

RMSE (Deepseek vs Ground Truth):
  Male:    0.8863
  Female:  0.5337
  Unknown: 7.9681

MSE (NER vs Ground Truth):
  Male:    6.2097
  Female:  2.8858
  Unknown: 4.3443

RMSE (NER vs Ground Truth):
  Male:    2.4919
  Female:  1.6988
  Unknown: 2.0843

MSE (Deepseek vs NER):
  Male:    6.2535
  Female:  2.4632
  Unknown: 67.9264

RMSE (Deepseek vs NER):
  Male:    2.5007
  Female:  1.5695
  Unknown: 8.2418


Extra check: this cell was run after the analysis to verify that every count string in the data follows the expected "Male: X Female: Y Unknown: Z" format.

In [None]:
# Columns holding the count strings, and the format they must start with
count_columns = ['Deepseek:32b', 'Named entity recognition', 'Ground truth']
pattern = r'Male:\s*(\d+)\s*Female:\s*(\d+)\s*Unknown:\s*(\d+)'

# Load the data and trim stray whitespace from the header names
df = pd.read_csv('thesis_NLP_component.csv')
df.columns = df.columns.str.strip()

# For each count column: trim its values, then collect the rows whose value
# does not match the pattern (missing values count as mismatches)
mismatches = {}
for col in count_columns:
    df[col] = df[col].str.strip()
    is_well_formed = df[col].str.match(pattern, na=False)
    issues = df.loc[~is_well_formed, ['Text-Link', col]]
    if len(issues):
        mismatches[col] = issues

# Report either the all-clear or the offending rows per column
if not mismatches:
    print("No format mismatches detected.")
else:
    for col, issues in mismatches.items():
        print(f"\nColumn '{col}' has {len(issues)} mismatched rows:")
        print(issues.to_string(index=False))


No format mismatches detected.
