# Text preprocessing

In [1]:
# Load the raw text into a variable to be used and analyzed.
# The context manager closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
# NOTE(review): assumes tram_raw.txt is UTF-8 encoded; an explicit encoding
# avoids depending on the platform's locale default. Confirm against the file.
with open("tram_raw.txt", encoding="utf-8") as raw_file:
    raw_text = raw_file.read()

In [2]:
# Import necessary modules
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import regex as re

### Sentences and tokens

In [3]:
# Break the raw text down two ways: into word-level tokens for the whole
# document, and into individual sentences
tokens = word_tokenize(raw_text)
sentences = sent_tokenize(raw_text)

# Reduce the token list to the set of distinct tokens in the document
unique_tokens = set(tokens)

# Show the distinct tokens, followed on the next line by how many there are
print(unique_tokens, "\n", len(unique_tokens))

 3504


### Creating a bag-of-words including stop words

In [4]:
# Counter is the counting container from the standard library's collections module
from collections import Counter

# Lowercase every token so differently-cased spellings share one count: lower_tokens
lower_tokens = [token.lower() for token in tokens]

# Build the bag-of-words by feeding the lowercase tokens into an empty Counter
bow_text = Counter()
bow_text.update(lower_tokens)

# Show the 20 tokens with the highest counts (stop words and punctuation included)
print(bow_text.most_common(20))

[('the', 800), (',', 763), ('in', 560), ('.', 532), ('and', 315), ('of', 304), ('tram', 259), ('a', 236), ('to', 217), ('[', 189), (']', 189), ('(', 182), (')', 182), ('was', 113), ('as', 107), ('trams', 102), ('on', 99), ('by', 93), ('system', 90), ('for', 89)]


### Excluding stop words

In [5]:
# NLTK supplies both the English stop-word list and the WordNet lemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, held in a set for membership tests
stop_words = set(stopwords.words("english"))

# Keep only tokens made up entirely of alphabetic characters
alpha_only = list(filter(str.isalpha, lower_tokens))

# Drop every token that appears in the stop-word set
stop_w_removed = [word for word in alpha_only if word not in stop_words]

# Create the lemmatizer and map each remaining token to its lemma; no
# part-of-speech argument is passed, so the lemmatizer's default applies
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = list(map(wordnet_lemmatizer.lemmatize, stop_w_removed))

# Count the lemmas to get the bag-of-words without stop words
bow_stop_w_removed = Counter(lemmatized)

# Show the 20 most frequent lemmas
print(bow_stop_w_removed.most_common(20))

[('tram', 361), ('system', 173), ('line', 98), ('tramway', 84), ('rail', 82), ('city', 82), ('streetcar', 63), ('track', 55), ('km', 54), ('mi', 53), ('cable', 53), ('network', 44), ('first', 44), ('new', 44), ('railway', 43), ('car', 43), ('light', 41), ('service', 41), ('electric', 39), ('used', 38)]
