# Code to Process LOCNESS Corpus data

In [105]:
import pandas as pd
import numpy as np
import os
import pickle
import re
import json

In [None]:
data_root = 'LOCNESS-corpus-files'
txt_files = [f for f in os.listdir(data_root) if f.endswith('.txt')]
txt_files.sort()

['BRSUR1.txt', 'BRSUR2.txt', 'BRSUR3.txt', 'USARG.txt', 'USMIXED.txt', 'alevels1.txt', 'alevels2.txt', 'alevels3.txt', 'alevels4.txt', 'alevels5.txt', 'alevels6.txt', 'alevels7.txt', 'alevels8.txt', 'alevels9.txt']


In [108]:
def pattern_map(file_name):
    if 'alevels' in file_name:
        if '1' in file_name:
            return r'(Transport \d{2})\s*(.*?)\s*(?=Transport|$)'
        if '2' in file_name:
            return r'(Parliamentary system \d{2})\s*(.*?)\s*(?=Parliamentary system|$)'
        if '3' in file_name:
            return r'(Fox hunting - FH\d{2})\s*(.*?)\s*(?=Fox hunting - FH|$)'
        if '4' in file_name:
            return r'(Boxing - B\d{2})\s*(.*?)\s*(?=Boxing - B|$)'
        if '5' in file_name:
            return r'(0\d{2})\s*(.*?)\s*(?=0\d{2})'
    return r'(<ICLE[^>]*>)\s*(.*?)\s*(?=<ICLE|$)'

In [None]:
papers = []
for file_name in txt_files:
    file_path = os.path.join(data_root, file_name)
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        content = file.read()
    # print(content)
    # pattern = re.compile(, re.DOTALL)
    pat = pattern_map(file_name)
    pattern = re.compile(pat, re.DOTALL)
    current_papers = [{"Filename": file_name, "essay_id": match.group(1), "Human": match.group(2) + '\n'} for match in pattern.finditer(content)]
    papers.extend(current_papers)
    
    n = len(current_papers)
    words = sum([len(paper['Human'].split()) for paper in current_papers])
json_data = json.dump(papers, open("LOCNESS.json", "w"), indent=4)


33 papers in BRSUR1.txt
59535 words in BRSUR1.txt

24 papers in BRSUR2.txt
17084 words in BRSUR2.txt

33 papers in BRSUR3.txt
18986 words in BRSUR3.txt

175 papers in USARG.txt
149520 words in USARG.txt

33 papers in USMIXED.txt
18626 words in USMIXED.txt

16 papers in alevels1.txt
7921 words in alevels1.txt

2 papers in alevels2.txt
1482 words in alevels2.txt

6 papers in alevels3.txt
2891 words in alevels3.txt

14 papers in alevels4.txt
7283 words in alevels4.txt

12 papers in alevels5.txt
7372 words in alevels5.txt

10 papers in alevels6.txt
4653 words in alevels6.txt

10 papers in alevels7.txt
5103 words in alevels7.txt

30 papers in alevels8.txt
15404 words in alevels8.txt

13 papers in alevels9.txt
6481 words in alevels9.txt

