# Read Reuters XML news data

This code must be run in Python 3. It reads a Reuters news data set, which consists of a set of files containing articles and topic labels. The data can be used to train and test a classifier that categorizes articles by topic.

The code extracts the article text and topic (for simplicity, excluding articles with no or multiple topics) and outputs a list of dictionaries in JSON format.

The XML-formatted data originates from: https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection

In [None]:
import xml.etree.ElementTree as et
import re
import os

def get_text(element):
    """ Collect the element's own text, the text of all nested tags, and its tail text """
    pieces = [element.text or '']
    for child in element:
        # Each child contributes its own text, its descendants' text and its tail
        pieces.append(get_text(child))
    pieces.append(element.tail or '')
    return ''.join(pieces)

def read_news(filename):
    """ Extract news article texts and topic labels from Reuters XML file

    The file is read line by line. Lines are accumulated until one contains
    the closing ``</REUTERS>`` tag; the accumulated chunk is then parsed as a
    single XML document. Anything left after the last closing tag is ignored.

    Only articles whose TOPICS tag has exactly one child (one topic label)
    are kept. Articles without a TOPICS or TEXT tag are skipped.

    Args:
        filename: Path of the file to read (decoded as latin-1).

    Returns:
        A list of dictionaries ``{'class': topic, 'text': text}``, one per
        kept article, in file order.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If an article chunk is still not
            well-formed XML after the ampersand escaping.
    """
    output = []
    chunk = []  # Lines of the article currently being collected
    # The context manager guarantees the file is closed, even on a parse error
    with open(filename, encoding="latin-1") as f:
        for line in f:
            chunk.append(line)
            if "</REUTERS>" not in line:
                continue
            # Escape every '&' so it cannot break the XML parser. Note that
            # already-escaped entities (e.g. '&lt;') therefore come through
            # as literal text in the result.
            buffer = "".join(chunk).replace("&", "&amp;") # Fix XML
            chunk = []
            root = et.fromstring(buffer) # Parse XML
            topic_tag = root.find('TOPICS')
            # Extract only articles with exactly one topic label, for simplicity
            if topic_tag is None or len(topic_tag) != 1:
                continue
            text_tag = root.find('TEXT')
            if text_tag is None: # No article body to extract
                continue
            topic = get_text(topic_tag).strip()
            # Collapse each newline plus the whitespace following it into a single newline
            text = re.sub(r"\n\s+", "\n", get_text(text_tag).strip())
            output.append({'class': topic, 'text': text}) # Save as JSON entry
    return output


In [None]:
data = []
path = "data/"
# Read all XML (sgm) files in directory.
# os.listdir returns names in arbitrary order, so sort them to make the
# order of the resulting data reproducible between runs and machines.
for filename in sorted(os.listdir(path)):
    # Match on the file extension only, not on '.sgm' anywhere in the name
    if filename.endswith('.sgm'):
        print("Reading", filename)
        data += read_news(os.path.join(path, filename))

## Below is some code for inspecting and cleaning the data further

In [None]:
# Check number of articles (total single-topic articles read from all files)
len(data)

In [None]:
# Check number of classes (distinct topic labels among the articles)
len({article['class'] for article in data})

In [None]:
# Check number of articles per class
import collections
counter = collections.defaultdict(int)  # Missing classes start at 0
for datum in data:
    counter[datum['class']] += 1
# Most frequent classes first; ties keep their first-seen order
by_frequency = sorted(counter.items(), key=lambda item: item[1], reverse=True)
for topic, count in by_frequency:
    print(count, topic)
    

In [None]:
# Keep only articles whose class occurs at least 5 times (drop rarer classes)
data = list(filter(lambda d: counter[d['class']] >= 5, data))


In [None]:
# Check number of articles remaining after filtering out the rare classes
len(data)

In [None]:
# Check number of classes (distinct topic labels still present in the data)
len({article['class'] for article in data})