# Tokenization, Stopwords, punctuation, Lowercasing, Stemming

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from string import punctuation

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Read the corpus from the file
with open("corpus.txt", "r", encoding="utf-8") as file:
    corpus = file.read()

# Tokenization
tokens = word_tokenize(corpus)

# Lowercasing
tokens = [token.lower() for token in tokens]

# Removing punctuation
tokens = [token for token in tokens if token not in punctuation]

# Removing stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

# Lemmatization (optional)
# Lemmatization requires POS tagging
# You may need to install the averaged_perceptron_tagger resource using nltk.download()


print("Preprocessed tokens:", stemmed_tokens)


[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed tokens: ['sitemap', 'arizona', 'depart', 'child', 'safeti', 'skip', 'main', 'content', 'arizona', 'depart', 'child', 'safeti', '0', 'home', 'aboutabout', 'dc', 'administrationdavid', 'lujan', 'dc', 'ceo', 'engag', 'field', 'offic', 'polici', 'proceduredc', 'polici', 'procedur', 'dc', 'rule', 'rulemak', 'dc', 'equal', 'employ', 'opportun', 'polici', 'dc', 'polici', 'dc', 'limit', 'english', 'profici', 'polici', 'procur', 'contract', 'strateg', 'plan', 'commun', 'advisori', 'committe', 'volunteerget', 'involv', 'commun', 'volunt', 'interest', 'form', 'give', 'tree', 'luggag', 'love', 'commun', 'screen', 'partner', 'fact', 'program', 'contact', 'us', 'careerscar', 'opportunitieschild', 'safeti', 'specialist', 'child', 'welfar', 'investig', 'specialist', 'ocwi', 'search', 'posit', 'compassion', 'news', 'reportsnew', 'releasesquarterli', 'newslett', 'dc', 'report', 'perform', 'measuresdc', 'monthli', 'report', 'child', 'fatal', 'near', 'fatal', 'process', 'releas', 'info', 'set

# part-of-speech (POS) tagging

In [2]:
from nltk import pos_tag
tokens_with_pos = pos_tag(tokens)
print(tokens_with_pos)

[('sitemap', 'NN'), ('arizona', 'NN'), ('department', 'NN'), ('child', 'NN'), ('safety', 'NN'), ('skip', 'NN'), ('main', 'JJ'), ('content', 'NN'), ('arizona', 'NN'), ('department', 'NN'), ('child', 'NN'), ('safety', 'NN'), ('0', 'CD'), ('home', 'NN'), ('aboutabout', 'NN'), ('dcs', 'NN'), ('administrationdavid', 'NN'), ('lujan', 'NN'), ('dcs', 'NN'), ('ceo', 'JJ'), ('engage', 'NN'), ('field', 'NN'), ('offices', 'NNS'), ('policy', 'NN'), ('proceduredcs', 'NN'), ('policy', 'NN'), ('procedure', 'NN'), ('dcs', 'NN'), ('rules', 'NNS'), ('rulemaking', 'VBG'), ('dcs', 'NN'), ('equal', 'JJ'), ('employment', 'NN'), ('opportunity', 'NN'), ('policy', 'NN'), ('dcs', 'NN'), ('policy', 'NN'), ('dcs', 'NN'), ('limited', 'VBD'), ('english', 'JJ'), ('proficiency', 'NN'), ('policy', 'NN'), ('procurement', 'NN'), ('contracts', 'NNS'), ('strategic', 'JJ'), ('plan', 'NN'), ('community', 'NN'), ('advisory', 'JJ'), ('committee', 'NN'), ('volunteerget', 'NN'), ('involved', 'VBN'), ('community', 'NN'), ('volunt

# Lemmitization

In [3]:
from nltk.stem import WordNetLemmatizer

# Create an instance of WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define a mapping from NLTK POS tags to WordNet POS tags
pos_mapping = {
    'N': 'n',  # Noun
    'V': 'v',  # Verb
    'R': 'r',  # Adverb
    'J': 'a'   # Adjective
}

# Lemmatize each token with its corresponding POS tag
lemmatized_tokens = []
for token, pos in tokens_with_pos:
    wordnet_pos = pos_mapping.get(pos[0], 'n')  # Default to 'n' if not found
    lemmatized_token = lemmatizer.lemmatize(token, pos=wordnet_pos)
    lemmatized_tokens.append(lemmatized_token)

print(lemmatized_tokens)


['sitemap', 'arizona', 'department', 'child', 'safety', 'skip', 'main', 'content', 'arizona', 'department', 'child', 'safety', '0', 'home', 'aboutabout', 'dc', 'administrationdavid', 'lujan', 'dc', 'ceo', 'engage', 'field', 'office', 'policy', 'proceduredcs', 'policy', 'procedure', 'dc', 'rule', 'rulemaking', 'dc', 'equal', 'employment', 'opportunity', 'policy', 'dc', 'policy', 'dc', 'limit', 'english', 'proficiency', 'policy', 'procurement', 'contract', 'strategic', 'plan', 'community', 'advisory', 'committee', 'volunteerget', 'involve', 'community', 'volunteer', 'interest', 'form', 'give', 'tree', 'luggage', 'love', 'community', 'screen', 'partner', 'fact', 'program', 'contact', 'u', 'careerscareer', 'opportunitieschild', 'safety', 'specialist', 'child', 'welfare', 'investigative', 'specialist', 'ocwi', 'search', 'position', 'compassioneers', 'news', 'reportsnews', 'releasesquarterly', 'newsletter', 'dcs', 'report', 'performance', 'measuresdcs', 'monthly', 'report', 'child', 'fatalit

In [7]:
from transformers import TextDataset
import os

# Assuming 'stemmed_tokens' contains your preprocessed tokens
# Convert the list of tokens into a single string
tokenized_text = ' '.join(stemmed_tokens)

# Define the file path
file_path = "corpus.txt"  # You can change the file name if needed

# Save the tokenized text to the file
with open(file_path, "w", encoding="utf-8") as file:
    file.write(tokenized_text)

# Check if the file was created successfully
if os.path.exists(file_path):
    print("Tokenized corpus saved to:", file_path)
else:
    print("Failed to save tokenized corpus.")

# Specify the block size
block_size = 128  # Adjust the block size as needed

# Create a TextDataset from the tokenized corpus
dataset = TextDataset(file_path=file_path, block_size=block_size)


Tokenized corpus saved to: corpus.txt


TypeError: TextDataset.__init__() missing 1 required positional argument: 'tokenizer'