In [1]:
# Analyze data/test.source
with open('data/test.source', 'r', encoding='utf-8') as f:
    # Iterating the file handle yields one string per line, exactly like readlines()
    lines = list(f)

print(f"Number of lines in test.source: {len(lines)}")
# Preview the first 5 lines, numbered from 1 and with surrounding whitespace removed
for number, text in enumerate(lines[:5], start=1):
    print(f"Source {number}: {text.strip()}")


Number of lines in test.source: 240
Source 3: Severnside Provisions, of Leeway Industrial Estate, was also ordered to pay £400,000 at the city's crown court on Friday. Directors Anthony O'Sullivan, 47, and Martin Lincoln, 46, were given 24-week terms after admitting 12 offences. An investigation was launched in December 2013. In a statement, Newport city council said the pair were warned in 2010 that their company was not permitted to carry out turkey processing. However, it said records showed that large quantities of meat was sold by Severnside Provisions in 2011, 2012 and 2013. After an investigation started, environmental health officers found large quantities of turkey defrosting outside in dirty water and being processed in a garage in "unhygienic conditions". Items seized were destroyed but the Food Standards Agency launched a national recall after finding turkeys had already been sold to butchers' shops and restaurants. The council said O'Sullivan and Lincoln had admitted 12 fo

In [2]:
import json

# Analyze data/test.json
with open('data/test.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Number of entries in test.json: {len(data)}")
# Preview the first 3 entries: a 100-character source excerpt plus the raw entity list
for number, record in enumerate(data[:3], start=1):
    print(f"Entry {number}:")
    excerpt = record['source'][:100]
    print(f"  Source: {excerpt}...")
    print(f"  Entities: {record['entities']}")


Number of entries in test.json: 240
Entry 1:
  Source: The greatest foreign policy disasters have tended to come when the UK has either ignored America - s...
  Entities: [{'start': 67, 'end': 69, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'UK'}, {'start': 89, 'end': 96, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'America'}, {'start': 122, 'end': 128, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'France'}, {'start': 176, 'end': 178, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'US'}, {'start': 214, 'end': 218, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'Iraq'}]
Entry 2:
  Source: "I am seriously considering emigrating to Australia, Jamaica or the US," says Faye Jones. The 32-yea...
  Entities: [{'start': 98, 'end': 109, 'label': 'Non-hallucinated', 'type': 'CARDINAL', 'ent': 'one million'}, {'start': 128, 'end': 130, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'UK'}]
Entry 3:
  Source: Severnside Provisions, of Leeway Industrial Estate, was al

In [3]:
# Analyze data/train.json
with open('data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of entries in train.json: {len(train_data)}")
# Preview the first 3 entries: a 100-character source excerpt plus the raw entity list
for number, record in enumerate(train_data[:3], start=1):
    print(f"Entry {number}:")
    excerpt = record['source'][:100]
    print(f"  Source: {excerpt}...")
    print(f"  Entities: {record['entities']}")


Number of entries in train.json: 460
Entry 1:
  Source: Ch Insp David Pettigrew believes the cameras would act as a deterrent to disorder and vandalism. Gla...
  Entities: [{'start': 39, 'end': 43, 'label': 'Non-hallucinated', 'type': 'ORG', 'ent': 'CCTV'}]
Entry 2:
  Source: Coleman senior died in 2014 after his son's first campaign in charge of Wales had ended with the fai...
  Entities: [{'start': 0, 'end': 5, 'label': 'Non-hallucinated', 'type': 'GPE', 'ent': 'Wales'}, {'start': 14, 'end': 19, 'label': 'Non-hallucinated', 'type': 'PERSON', 'ent': 'Chris'}, {'start': 20, 'end': 27, 'label': 'Non-hallucinated', 'type': 'PERSON', 'ent': 'Coleman'}, {'start': 49, 'end': 54, 'label': 'Non-hallucinated', 'type': 'PERSON', 'ent': 'Chris'}, {'start': 107, 'end': 118, 'label': 'Non-factual Hallucination', 'type': 'PERSON', 'ent': 'semi-finals'}]
Entry 3:
  Source: The 60-year-old man was scuba diving at Agincourt Reef in Far North Queensland when he was seen to b...
  Entities: [{'start': 9

In [4]:
# Cross-check alignment between test.source and test.json:
# line N of test.source (whitespace-stripped) should equal entry N's 'source' in test.json.
with open('data/test.source', 'r', encoding='utf-8') as f:
    source_lines = [line.strip() for line in f.readlines()]

with open('data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# zip() below stops at the shorter input, so a size difference would otherwise
# go completely unreported; surface it explicitly first.
if len(source_lines) != len(test_data):
    print(
        f"Length mismatch: test.source has {len(source_lines)} lines, "
        f"test.json has {len(test_data)} entries"
    )

mismatch_count = 0
for i, (source_line, json_entry) in enumerate(zip(source_lines, test_data)):
    if source_line != json_entry['source']:
        mismatch_count += 1
        print(f"Mismatch at line {i + 1}:")
        print(f"  test.source: {source_line}")
        print(f"  test.json: {json_entry['source']}")

# Always print a summary so a clean result is distinguishable from a cell that never ran.
compared = min(len(source_lines), len(test_data))
print(f"Compared {compared} line/entry pairs: {mismatch_count} mismatch(es)")


In [None]:
from collections import Counter

# Analyze entity label and type distribution
def analyze_entity_distribution(data, dataset_name):
    """Print how often each entity label and each entity type occurs.

    Args:
        data: Iterable of entries; each entry's 'entities' value is iterated
            and each entity is read with ``.get``.
        dataset_name: Name inserted into the printed headings.

    Entities without a 'label' or 'type' key are counted under 'Unknown'.
    Nothing is returned; both tallies are printed as Counter objects.
    """
    # Flatten every entity of every entry into one list, in encounter order
    entities = [entity for entry in data for entity in entry['entities']]

    label_counts = Counter(entity.get('label', 'Unknown') for entity in entities)
    type_counts = Counter(entity.get('type', 'Unknown') for entity in entities)

    print(f"Entity label distribution in {dataset_name}:")
    print(label_counts)
    print(f"Entity type distribution in {dataset_name}:")
    print(type_counts)

# Run the analysis on the test split first, then the train split
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_entity_distribution(split_data, split_name)


Entity label distribution in test.json:
Counter({'Non-hallucinated': 583, 'Non-factual Hallucination': 134, 'Factual Hallucination': 118, 'Intrinsic Hallucination': 8})
Entity type distribution in test.json:
Counter({'PERSON': 264, 'GPE': 173, 'ORG': 125, 'DATE': 98, 'CARDINAL': 61, 'NORP': 47, 'ORDINAL': 19, 'LOC': 15, 'EVENT': 11, 'MONEY': 10, 'PERCENT': 5, 'TIME': 4, 'PRODUCT': 3, 'WORK_OF_ART': 2, 'QUANTITY': 2, 'Unknown': 2, 'FAC': 2})
Entity label distribution in train.json:
Counter({'Non-hallucinated': 1093, 'Factual Hallucination': 270, 'Non-factual Hallucination': 230, 'Intrinsic Hallucination': 39})
Entity type distribution in train.json:
Counter({'PERSON': 483, 'GPE': 355, 'ORG': 252, 'DATE': 177, 'CARDINAL': 106, 'NORP': 99, 'ORDINAL': 42, 'LOC': 28, 'MONEY': 24, 'FAC': 21, 'EVENT': 15, 'WORK_OF_ART': 10, 'TIME': 8, 'QUANTITY': 4, 'PRODUCT': 4, 'PERCENT': 2, 'LAW': 2})


In [7]:
# Check for missing or empty data
def check_missing_data(data, dataset_name):
    """Report entries whose source text or entity list is absent or empty.

    Args:
        data: Iterable of dict entries expected to carry 'source' and
            'entities' keys.
        dataset_name: Name inserted into the printed messages.

    One line is printed per problem, using 1-based entry numbers. A key that
    is absent altogether is reported the same way as an empty value instead
    of aborting the scan with a KeyError. Nothing is returned.
    """
    for position, entry in enumerate(data, start=1):
        # .get() treats an absent key like an empty value (both are falsy)
        if not entry.get('source'):
            print(f"Missing source in {dataset_name} at entry {position}")
        if not entry.get('entities'):
            print(f"No entities in {dataset_name} at entry {position}")

# Scan the test split first, then the train split
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    check_missing_data(split_data, split_name)


In [8]:
# Analyze length of source and number of entities
def analyze_lengths(data, dataset_name):
    """Print average/max/min of source word counts and per-entry entity counts.

    Args:
        data: Iterable of entries with a string 'source' and a sized
            'entities' collection.
        dataset_name: Name inserted into the printed headings.

    Source length is the number of whitespace-separated tokens. When `data`
    has no entries a single notice is printed instead of the statistics,
    because the averages would otherwise divide by zero. Nothing is returned.
    """
    # Single pass so one-shot iterables (e.g. generators) are handled too
    source_lengths = []
    entity_counts = []
    for entry in data:
        source_lengths.append(len(entry['source'].split()))
        entity_counts.append(len(entry['entities']))

    if not source_lengths:
        print(f"{dataset_name}: no entries to analyze")
        return

    print(f"{dataset_name} Source Lengths:")
    print(f"  Average: {sum(source_lengths)/len(source_lengths):.2f}")
    print(f"  Max: {max(source_lengths)}, Min: {min(source_lengths)}")
    print(f"{dataset_name} Entity Counts:")
    print(f"  Average: {sum(entity_counts)/len(entity_counts):.2f}")
    print(f"  Max: {max(entity_counts)}, Min: {min(entity_counts)}")

# Report length statistics for the test split first, then the train split
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_lengths(split_data, split_name)


test.json Source Lengths:
  Average: 353.32
  Max: 1725, Min: 36
test.json Entity Counts:
  Average: 3.51
  Max: 11, Min: 1
train.json Source Lengths:
  Average: 345.24
  Max: 1472, Min: 49
train.json Entity Counts:
  Average: 3.55
  Max: 12, Min: 1


In [9]:
# Search for specific keywords in source texts.
# keyword_search (defined below) tests each keyword with `keyword in entry['source']`,
# i.e. a case-sensitive substring match: "Trump" does not match "trump",
# but it does match when embedded in a longer word.
keywords = ["Trump", "Brexit", "globalisation"]

def keyword_search(data, dataset_name, keywords):
    """Print, for each keyword, the number of entries whose source contains it.

    Args:
        data: Iterable of entries with a 'source' value supporting ``in``.
        dataset_name: Name inserted into the printed heading.
        keywords: Keywords to look for; containment is tested with ``in``,
            so matching is case-sensitive.

    An entry counts at most once per keyword occurrence in `keywords`,
    however many times the keyword appears in its source. Nothing is returned.
    """
    hits = dict.fromkeys(keywords, 0)
    for record in data:
        for term in keywords:
            # int(True) == 1, int(False) == 0
            hits[term] += int(term in record['source'])

    print(f"Keyword search results in {dataset_name}:")
    for term, total in hits.items():
        print(f"  {term}: {total}")

# Search the test split first, then the train split, with the same keyword list
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    keyword_search(split_data, split_name, keywords)


Keyword search results in test.json:
  Trump: 7
  Brexit: 6
  globalisation: 1
Keyword search results in train.json:
  Trump: 7
  Brexit: 6
  globalisation: 0


In [10]:
# Check for overlaps between train.json and test.json
# (exact string equality of the 'source' text; near-duplicates are not detected)
train_sources = set(entry['source'] for entry in train_data)
test_sources = set(entry['source'] for entry in test_data)

overlaps = train_sources & test_sources
if not overlaps:
    print("No overlaps found between train.json and test.json.")
else:
    print("Overlapping sources found between train.json and test.json:")
    for overlap in overlaps:
        print(overlap)


No overlaps found between train.json and test.json.


In [12]:
# Save analyzed data
# Each in-memory dataset is written to its own file as UTF-8 JSON,
# keeping non-ASCII characters unescaped and indenting by 4 spaces.
for out_path, payload in (
    ('data/analyzed_test.json', test_data),
    ('data/analyzed_train.json', train_data),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


In [13]:
# Cross-analyze type and label patterns
def analyze_type_label_patterns(data, dataset_name):
    """Print how often each (type, label) combination occurs among entities.

    Args:
        data: Iterable of entries; each entry's 'entities' value is iterated
            and each entity is read with ``.get``.
        dataset_name: Name inserted into the printed heading.

    A missing 'type' or 'label' is counted as 'Unknown'. Combinations are
    printed from most to least frequent. Nothing is returned.
    """
    pair_counts = Counter(
        (entity.get('type', 'Unknown'), entity.get('label', 'Unknown'))
        for entry in data
        for entity in entry['entities']
    )

    print(f"--- {dataset_name} Type-Label Pair Patterns ---")
    for combo, total in pair_counts.most_common():
        print(f"  {combo}: {total}")

# Run for test.json first, then train.json
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_type_label_patterns(split_data, split_name)


--- test.json Type-Label Pair Patterns ---
  ('PERSON', 'Non-hallucinated'): 197
  ('GPE', 'Non-hallucinated'): 136
  ('ORG', 'Non-hallucinated'): 90
  ('DATE', 'Non-hallucinated'): 43
  ('DATE', 'Non-factual Hallucination'): 41
  ('PERSON', 'Factual Hallucination'): 38
  ('CARDINAL', 'Non-hallucinated'): 37
  ('NORP', 'Non-hallucinated'): 35
  ('PERSON', 'Non-factual Hallucination'): 28
  ('ORG', 'Factual Hallucination'): 23
  ('GPE', 'Factual Hallucination'): 22
  ('CARDINAL', 'Non-factual Hallucination'): 20
  ('ORDINAL', 'Non-hallucinated'): 14
  ('DATE', 'Factual Hallucination'): 14
  ('GPE', 'Non-factual Hallucination'): 13
  ('ORG', 'Non-factual Hallucination'): 12
  ('LOC', 'Non-hallucinated'): 9
  ('NORP', 'Factual Hallucination'): 9
  ('EVENT', 'Non-hallucinated'): 7
  ('MONEY', 'Non-factual Hallucination'): 6
  ('ORDINAL', 'Non-factual Hallucination'): 4
  ('CARDINAL', 'Factual Hallucination'): 3
  ('PRODUCT', 'Non-hallucinated'): 3
  ('EVENT', 'Factual Hallucination'): 3
  

In [14]:
# Analyze source text lengths
def analyze_source_lengths(data, dataset_name):
    """Print average, maximum and minimum source length in words.

    Args:
        data: Iterable of entries with a string 'source'.
        dataset_name: Name inserted into the printed heading.

    Length is the number of whitespace-separated tokens. When `data` has no
    entries a single notice is printed instead of the statistics, because the
    average would otherwise divide by zero. Nothing is returned.
    """
    source_lengths = [len(entry['source'].split()) for entry in data]
    if not source_lengths:
        print(f"{dataset_name}: no entries to analyze")
        return

    print(f"--- {dataset_name} Source Length Analysis ---")
    print(f"  Average length: {sum(source_lengths)/len(source_lengths):.2f} words")
    print(f"  Max length: {max(source_lengths)}, Min length: {min(source_lengths)}")

# Run for test.json first, then train.json
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_source_lengths(split_data, split_name)


--- test.json Source Length Analysis ---
  Average length: 353.32 words
  Max length: 1725, Min length: 36
--- train.json Source Length Analysis ---
  Average length: 345.24 words
  Max length: 1472, Min length: 49


In [15]:
# Analyze entity lengths and positions
def analyze_entity_lengths(data, dataset_name):
    """Print statistics on entity text length and on start/end offsets.

    Args:
        data: Iterable of entries; each entity in an entry's 'entities' must
            carry 'start' and 'end', and may carry 'ent' (a missing 'ent' is
            treated as an empty string, i.e. length 0).
        dataset_name: Name inserted into the printed heading.

    Entity length is ``len`` of the 'ent' value. When the data contains no
    entities at all, a single notice is printed instead of the statistics,
    because the average would otherwise divide by zero. Nothing is returned.
    """
    entity_lengths = []
    start_positions = []
    end_positions = []

    for entry in data:
        for entity in entry['entities']:
            entity_lengths.append(len(entity.get('ent', '')))
            start_positions.append(entity['start'])
            end_positions.append(entity['end'])

    if not entity_lengths:
        print(f"{dataset_name}: no entities to analyze")
        return

    print(f"--- {dataset_name} Entity Length Analysis ---")
    print(f"  Average length: {sum(entity_lengths)/len(entity_lengths):.2f}")
    print(f"  Max length: {max(entity_lengths)}, Min length: {min(entity_lengths)}")
    print("--- Start Position ---")
    print(f"  Max: {max(start_positions)}, Min: {min(start_positions)}")
    print("--- End Position ---")
    print(f"  Max: {max(end_positions)}, Min: {min(end_positions)}")

# Run for test.json first, then train.json
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_entity_lengths(split_data, split_name)


--- test.json Entity Length Analysis ---
  Average length: 8.50
  Max length: 50, Min length: 2
--- Start Position ---
  Max: 214, Min: 0
--- End Position ---
  Max: 218, Min: 2
--- train.json Entity Length Analysis ---
  Average length: 8.59
  Max length: 49, Min length: 1
--- Start Position ---
  Max: 231, Min: 0
--- End Position ---
  Max: 252, Min: 2


In [16]:
from collections import Counter

# Analyze label and type distributions
def analyze_label_type_distribution(data, dataset_name):
    """Print the entity label tally and the entity type tally for a dataset.

    Args:
        data: Iterable of entries; each entry's 'entities' value is a
            collection of entities read with ``.get``.
        dataset_name: Name inserted into the printed headings.

    A missing 'label' or 'type' is counted as 'Unknown'. Nothing is returned.

    Note: this produces the same two tallies as analyze_entity_distribution
    defined earlier in this notebook; only the headings differ.
    """
    label_counter, type_counter = Counter(), Counter()
    for entry in data:
        entities = entry['entities']
        label_counter.update(entity.get('label', 'Unknown') for entity in entities)
        type_counter.update(entity.get('type', 'Unknown') for entity in entities)

    print(f"--- {dataset_name} Label Distribution ---")
    print(label_counter)
    print(f"--- {dataset_name} Type Distribution ---")
    print(type_counter)

# Run for test.json first, then train.json
for split_data, split_name in ((test_data, "test.json"), (train_data, "train.json")):
    analyze_label_type_distribution(split_data, split_name)


--- test.json Label Distribution ---
Counter({'Non-hallucinated': 583, 'Non-factual Hallucination': 134, 'Factual Hallucination': 118, 'Intrinsic Hallucination': 8})
--- test.json Type Distribution ---
Counter({'PERSON': 264, 'GPE': 173, 'ORG': 125, 'DATE': 98, 'CARDINAL': 61, 'NORP': 47, 'ORDINAL': 19, 'LOC': 15, 'EVENT': 11, 'MONEY': 10, 'PERCENT': 5, 'TIME': 4, 'PRODUCT': 3, 'WORK_OF_ART': 2, 'QUANTITY': 2, 'Unknown': 2, 'FAC': 2})
--- train.json Label Distribution ---
Counter({'Non-hallucinated': 1093, 'Factual Hallucination': 270, 'Non-factual Hallucination': 230, 'Intrinsic Hallucination': 39})
--- train.json Type Distribution ---
Counter({'PERSON': 483, 'GPE': 355, 'ORG': 252, 'DATE': 177, 'CARDINAL': 106, 'NORP': 99, 'ORDINAL': 42, 'LOC': 28, 'MONEY': 24, 'FAC': 21, 'EVENT': 15, 'WORK_OF_ART': 10, 'TIME': 8, 'QUANTITY': 4, 'PRODUCT': 4, 'PERCENT': 2, 'LAW': 2})
