# AIG 230 - Assignment 5
## Tatum Soward

## Part A - Text Preprocessing

In [2]:
#Installing NLTK
!pip install -q nltk

In [3]:
#Importing necessary libraries
import nltk
import numpy as np
import pandas as pd
import sklearn as sk
import gensim
import matplotlib.pyplot as plt

### A1 - Load the Corpus
I have chosen to use the NLTK Gutenberg `'austen-emma.txt'` as my corpus.

In [4]:
#Downloading the Gutenberg set from NLTK
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\sowar\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [5]:
#Importing Emma by Jane Austen from NLTK Gutenberg
from nltk.corpus import gutenberg
corpus = 'austen-emma.txt'
raw_txt = gutenberg.raw(corpus)

# Total number of characters
from nltk import RegexpTokenizer
tokenizer = RegexpTokenizer(r'.')
emma_chars = tokenizer.tokenize(raw_txt)
print(f"Number of Characters: {len(emma_chars)}")

# Total number of tokens (before preprocessing)
tokens = gutenberg.words(corpus)
print(f"Number of Tokens: {len(tokens)}")

Number of Characters: 887071
Number of Tokens: 192427


### A2 - Preprocess Function

In [None]:
# Function that preprocesses raw text
def preprocess(raw: str, stp_wrds: bool = False, stemlem: str = 'stem'):
    # raw -> the raw text of the corpus
    # stp_wrds -> T/F stop words in final tokens
    # stemlem -> choose stemming 'stem' or lemmatization 'lem' or neither
    # returns preprocessed list of tokens -> List[str]

    # Word tokenizing the raw text
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(raw)

    # Removing Punctuation
    import string
    tokens = [t for t in tokens if t not in string.punctuation]

    # Lowercase Normalization
    tokens = [t.lower() for t in tokens]

    #Stop Word Removal
    if stp_wrds: #if keeping stop words = True
        tokens = tokens
    else:
        from nltk.corpus import stopwords
        nltk.download("stopwords")
        stop_words = set(stopwords.words("english")) #assumes english text
        tokens = [t for t in tokens if t not in stop_words]

    # Stemming vs. Lemmatization
    if stemlem == 'stem':
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
        tokens = [ps.stem(t) for t in tokens]
    elif stemlem == 'lem':
        from nltk.stem import WordNetLemmatizer
        from nltk import pos_tag
        lem = WordNetLemmatizer()
        tagged_tokens = pos_tag(tokens) #getting the tags of the tokens
        def get_wordnet_pos(tag): # defining a f'n to get the proper word type per token
            if tag.startswith('J'):
                return 'a'
            elif tag.startswith('V'):
                return 'v'
            elif tag.startswith('N'):
                return 'n'
            elif tag.startswith('R'):
                return 'r'
            else:
                return 'n'
        tokens = [lem.lemmatize(t, get_wordnet_pos(tag)) for t, tag in tagged_tokens]
    else:
        tokens = tokens  

    return tokens

#### Statistics

In [19]:
# Do not keep stop words for bag of words, would keep stop words if we were summarizing the corpus
# Stemming, faster and better for count vector similarity
tokens = preprocess(raw_txt, stp_wrds=False, stemlem='stem') #clean tokens

#Total number of tokens
print(f"Total Number of Tokens: {len(tokens)}")

from collections import Counter
counts = Counter(tokens) #counts word frequencies of tokens

# Vocabulary
vocab = {w for w, c in counts.items()}
print("Vocabulary size:", len(vocab))

# 20 Most Frequent Tokens
print(f"20 Most Common Tokens: {counts.most_common(20)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sowar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total Number of Tokens: 81388
Vocabulary size: 5216
20 Most Common Tokens: [('--', 3100), ("''", 2452), ('``', 1735), ('mr.', 1091), ("'s", 928), ('emma', 860), ('could', 836), ('would', 818), ('mrs.', 668), ('miss', 611), ('must', 566), ('harriet', 500), ('much', 484), ('said', 483), ('think', 466), ('thing', 456), ('one', 451), ('weston', 445), ('everi', 435), ('elton', 405)]


### A3 - Reflection
In 5-8 sentences, explain how your preprocessing choices could affect downstream tasks (vectorization, language modeling, and embeddings).