# Read Reuters XML news data

This code must be run in Python 3. It reads a Reuters news data set, which consists of a set of files containing articles and topic labels. The data can be used to train and test a classifier that categorizes articles by topic.

The code extracts the article text and topic (for simplicity, excluding articles with no or multiple topics) and outputs a list of dictionaries in JSON format.

The XML-formated data originates from: https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection

In [1]:
import xml.etree.ElementTree as et
import re
import os

def get_text(element):
    """ Get text within tag and within nested tags """
    return ((element.text or '') + ''.join(map(get_text, element)) + (element.tail or ''))

def read_news(filename):
    """ Extract news article texts and topic labels from Reuters XML file """
    f = open(filename, encoding="latin-1")
    output = []
    while True: # Loop through file
        buffer = ""
        while True: # Loop through article, fill buffer
            line = f.readline()
            buffer += line
            if "</REUTERS>" in line:
                break
            if not line: # End of file
                return output
        buffer = buffer.replace("&", "&amp;") # Fix XML
        root = et.fromstring(buffer) # Parse XML
        topic_tag = root.find('TOPICS')
        if len(topic_tag) != 1: # Extract only articles with exactly one topic label, for simplicity
            continue
        topic = get_text(topic_tag).strip()
        text = re.sub("\n\s+", "\n", get_text(root.find('TEXT')).strip())
        output.append({'class': topic, 'text': text}) # Save as JSON entry


In [5]:
data = []
path = "data/reuters"
# Read all XML (sgm) files in directory
for filename in os.listdir(path):
    if '.sgm' in filename:
        print("Reading", filename)
        data += read_news(os.path.join(path,filename))

Reading reut2-005.sgm
Reading reut2-016.sgm
Reading reut2-007.sgm
Reading reut2-003.sgm
Reading reut2-020.sgm
Reading reut2-019.sgm
Reading reut2-001.sgm
Reading reut2-021.sgm
Reading reut2-013.sgm
Reading reut2-004.sgm
Reading reut2-011.sgm
Reading reut2-008.sgm
Reading reut2-018.sgm
Reading reut2-012.sgm
Reading reut2-000.sgm
Reading reut2-014.sgm
Reading reut2-002.sgm
Reading reut2-009.sgm
Reading reut2-015.sgm
Reading reut2-017.sgm
Reading reut2-006.sgm
Reading reut2-010.sgm


## Below is some code for inspecting and cleaning the data further

In [6]:
# Check number of articles
len(data)

9494

In [7]:
# Check number of classes
len(set([x['class'] for x in data]))

66

In [8]:
# Check number of articles per class
import collections
counter = collections.defaultdict(lambda: 0)
for datum in data:
    counter[datum['class']] += 1
for topic, count in sorted(counter.items(), key=lambda x:x[1], reverse=True):
    print(count, topic)
    

3945 earn
2362 acq
408 crude
361 trade
307 money-fx
285 interest
161 money-supply
158 ship
143 sugar
116 coffee
99 gold
83 gnp
79 cpi
63 cocoa
55 jobs
54 copper
53 reserves
51 grain
50 alum
49 ipi
47 iron-steel
45 nat-gas
41 rubber
37 veg-oil
32 bop
30 tin
26 cotton
26 wpi
22 retail
22 orange
22 gas
21 pet-chem
20 livestock
19 strategic-metal
18 housing
16 zinc
14 lei
14 heat
12 income
12 lumber
11 fuel
11 carcass
11 silver
9 oilseed
8 lead
7 instal-debt
7 meal-feed
6 tea
6 dlr
6 yen
5 potato
4 nickel
4 cpu
4 jet
3 inventories
3 platinum
2 groundnut
1 l-cattle
1 wool
1 hog
1 rand
1 rice
1 stg
1 propane
1 naphtha
1 coconut


In [9]:
# Filter out classes with less than 5 occurrences
data = [d for d in data if counter[d['class']] >= 5]


In [10]:
# Check length
len(data)

9465

In [11]:
# Check number of classes
len(set([x['class'] for x in data]))

51

In [14]:
import json
with open("data/reuters_51cls.json","w") as f:
    json.dump(data,f,indent=2)