# Parsing of a .txt file
## Naive Implementation

In [1]:
# reading line to line
txt_naive = ""
# example txt from reuters corpus
with open('files/Reut_1.txt') as file:
    lines = file.readlines()
    for line in lines:
        txt_naive += line
print(txt_naive)

bahia cocoa review 13 showers continued throughout the week in the bahia cocoa zone alleviating the drought since early january and improving prospects for the coming temporao although normal humidity levels have not been restored comissaria smith said in its weekly review the dry period means the temporao will be late this year arrivals for the week ended february 22 were 155 221 bags of 60 kilos making a cumulative total for the season of 5 93 mln against 5 81 at the same stage last year again it seems that cocoa delivered earlier on consignment was included in the arrivals figures comissaria smith said there is still some doubt as to how much old crop cocoa is still available as harvesting has practically come to an end with total bahia crop estimates around 6 4 mln bags and sales standing at almost 6 2 mln there are a few hundred thousand bags still in the hands of farmers middlemen exporters and processors there are doubts as to how much of this cocoa would be fit for export as sh

## More advanced variation

In [2]:
# some basic parsing
import re

# remove unnecessary punctuations such as ",", "|", etc.
txt_naive = txt_naive\
    .replace(":", ".")\
    .replace(".", ")")\
    .replace(")", "(")\
    .replace("(", "-")\
    .replace("-", "•")\
    .replace("•", "|")\
    .replace("|", ";")\
    .replace(";", ",")\
    .replace(",", "")
# remove any numbers
txt_naive = re.sub('\d+', ' ', txt_naive)
# turn multi spaces into single spaces
txt_naive = re.sub('\s+', ' ', txt_naive)

tokens = txt_naive.split(" ")

print(tokens)

['bahia', 'cocoa', 'review', 'showers', 'continued', 'throughout', 'the', 'week', 'in', 'the', 'bahia', 'cocoa', 'zone', 'alleviating', 'the', 'drought', 'since', 'early', 'january', 'and', 'improving', 'prospects', 'for', 'the', 'coming', 'temporao', 'although', 'normal', 'humidity', 'levels', 'have', 'not', 'been', 'restored', 'comissaria', 'smith', 'said', 'in', 'its', 'weekly', 'review', 'the', 'dry', 'period', 'means', 'the', 'temporao', 'will', 'be', 'late', 'this', 'year', 'arrivals', 'for', 'the', 'week', 'ended', 'february', 'were', 'bags', 'of', 'kilos', 'making', 'a', 'cumulative', 'total', 'for', 'the', 'season', 'of', 'mln', 'against', 'at', 'the', 'same', 'stage', 'last', 'year', 'again', 'it', 'seems', 'that', 'cocoa', 'delivered', 'earlier', 'on', 'consignment', 'was', 'included', 'in', 'the', 'arrivals', 'figures', 'comissaria', 'smith', 'said', 'there', 'is', 'still', 'some', 'doubt', 'as', 'to', 'how', 'much', 'old', 'crop', 'cocoa', 'is', 'still', 'available', 'as',

Applying Porter Stemmer:

In [3]:
from nltk.stem import PorterStemmer

# apply stemming on every word
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print(stemmed_tokens)

['bahia', 'cocoa', 'review', 'shower', 'continu', 'throughout', 'the', 'week', 'in', 'the', 'bahia', 'cocoa', 'zone', 'allevi', 'the', 'drought', 'sinc', 'earli', 'januari', 'and', 'improv', 'prospect', 'for', 'the', 'come', 'temporao', 'although', 'normal', 'humid', 'level', 'have', 'not', 'been', 'restor', 'comissaria', 'smith', 'said', 'in', 'it', 'weekli', 'review', 'the', 'dri', 'period', 'mean', 'the', 'temporao', 'will', 'be', 'late', 'thi', 'year', 'arriv', 'for', 'the', 'week', 'end', 'februari', 'were', 'bag', 'of', 'kilo', 'make', 'a', 'cumul', 'total', 'for', 'the', 'season', 'of', 'mln', 'against', 'at', 'the', 'same', 'stage', 'last', 'year', 'again', 'it', 'seem', 'that', 'cocoa', 'deliv', 'earlier', 'on', 'consign', 'wa', 'includ', 'in', 'the', 'arriv', 'figur', 'comissaria', 'smith', 'said', 'there', 'is', 'still', 'some', 'doubt', 'as', 'to', 'how', 'much', 'old', 'crop', 'cocoa', 'is', 'still', 'avail', 'as', 'harvest', 'ha', 'practic', 'come', 'to', 'an', 'end', 'wi

Removing stopwords:

In [4]:
import nltk
from nltk.corpus import stopwords
# update/download database if necessary
nltk.download('stopwords')

# filter stop words
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in stemmed_tokens if word not in stop_words]

print(str(filtered_words) + "\n")
print(str(len(filtered_words)) + " (filtered) vs. " + str(len(stemmed_tokens)) + " (unfiltered)")

['bahia', 'cocoa', 'review', 'shower', 'continu', 'throughout', 'week', 'bahia', 'cocoa', 'zone', 'allevi', 'drought', 'sinc', 'earli', 'januari', 'improv', 'prospect', 'come', 'temporao', 'although', 'normal', 'humid', 'level', 'restor', 'comissaria', 'smith', 'said', 'weekli', 'review', 'dri', 'period', 'mean', 'temporao', 'late', 'thi', 'year', 'arriv', 'week', 'end', 'februari', 'bag', 'kilo', 'make', 'cumul', 'total', 'season', 'mln', 'stage', 'last', 'year', 'seem', 'cocoa', 'deliv', 'earlier', 'consign', 'wa', 'includ', 'arriv', 'figur', 'comissaria', 'smith', 'said', 'still', 'doubt', 'much', 'old', 'crop', 'cocoa', 'still', 'avail', 'harvest', 'ha', 'practic', 'come', 'end', 'total', 'bahia', 'crop', 'estim', 'around', 'mln', 'bag', 'sale', 'stand', 'almost', 'mln', 'hundr', 'thousand', 'bag', 'still', 'hand', 'farmer', 'middlemen', 'export', 'processor', 'doubt', 'much', 'thi', 'cocoa', 'would', 'fit', 'export', 'shipper', 'experienc', 'dificulti', 'obtain', 'bahia', 'superio

[nltk_data] Downloading package stopwords to /home/av11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
