In [31]:
import pandas as pd
import spacy
import nltk 
from nltk.corpus import stopwords

In [43]:
# Load spaCy's small English pipeline; it is used for lemmatization.
nlp = spacy.load('en_core_web_sm')

# NLTK's English stop-word list, held in a frozenset so that the per-word
# `word in STOPWORDS` membership test is a hash lookup instead of a list scan.
# NOTE: the NLTK 'stopwords' corpus must already be available locally.
STOPWORDS = frozenset(stopwords.words('english'))

debate_2016_csv = '2016uspresdebate.csv'

# pandas decodes CSV files as UTF-8 by default. This file is read as 'latin1'
# instead ('iso-8859-1' is the same codec; 'cp1252' is a close alternative),
# presumably because it is not valid UTF-8 -- confirm against the data source.
df_csv = pd.read_csv(debate_2016_csv, encoding='latin1')
text_column = df_csv['Text']
text = text_column.tolist()

In [53]:
# Single NLTK Porter stemmer instance, reused for every word that is stemmed.
porter_stemmer = nltk.PorterStemmer() 

def is_number(word):
    """Return True if ``word`` represents a number, otherwise False.

    Args:
        word: Usually a string token; real numeric values (int/float) are
            also accepted and reported as numbers.

    Returns:
        bool: True when ``float(word)`` succeeds and, for string input, the
        text contains at least one digit. False for everything else,
        including ``None`` and other types ``float()`` cannot convert.
    """
    # float() also parses the words 'nan', 'inf' and 'infinity' (any case,
    # optionally signed). Those are ordinary words in running text, not
    # numbers, so a string must contain at least one digit to qualify.
    if isinstance(word, str) and not any(ch.isdigit() for ch in word):
        return False

    try:
        float(word)
        return True
    except (TypeError, ValueError):
        # ValueError: unparseable string; TypeError: unsupported type (e.g. None).
        return False

# Build one record per remaining word (stop words and numbers are skipped),
# holding the normalized word and its tokenized, stemmed and lemmatized forms.
processed_data = []

# Per-word memo of (tokens, stem, lemma). Words repeat heavily in running
# text, and each result depends only on the word itself, so caching avoids
# re-running the tokenizer, stemmer and spaCy pipeline on every repeat.
_word_cache = {}

for sentence in text_column:
    if isinstance(sentence, list):
        sentence = ' '.join(sentence)

    # Missing cells come back from pandas as float NaN, which has no
    # .lower(); skip anything that is not text instead of crashing.
    if not isinstance(sentence, str):
        continue

    # Normalize: lower-case, then split on any run of whitespace. Unlike
    # split(" "), this never yields empty-string tokens for repeated spaces
    # and also separates words divided by tabs or newlines.
    normalized_words = sentence.lower().split()

    for word in normalized_words:
        # Skip stop words and numbers
        if word in STOPWORDS or is_number(word):
            continue

        if word not in _word_cache:
            # Lemmatize using spaCy: only the first token's lemma is kept,
            # falling back to the word itself if spaCy yields no tokens.
            doc = nlp(word)
            _word_cache[word] = (
                nltk.word_tokenize(word),       # Tokenize
                porter_stemmer.stem(word),      # Stem using NLTK's PorterStemmer
                doc[0].lemma_ if doc else word,
            )

        tokenized_word, root, lemmatized_word = _word_cache[word]

        processed_data.append({
            'Original / Normalized' : word,
            # Copy so that rows never share the cached list object.
            'Tokenized' : list(tokenized_word),
            'Stemmed' : root,
            'Lemmatized' : lemmatized_word
        })


In [54]:
# Destination workbook for the per-word preprocessing table.
output_file = 'Preprocessing Dataset (No stop words and numbers).xlsx'

# One row per record collected in processed_data; the positional index is
# left out of the sheet.
processed_df = pd.DataFrame(data=processed_data)
processed_df.to_excel(excel_writer=output_file, index=False)

print(f"Processed words have been written to {output_file}")


Processed words have been written to Preprocessing Dataset (No stop words and numbers).xlsx


In [52]:
# Template for preprocessing
# Walks one sample sentence through each stage (tokenize, normalize, stem,
# lemmatize) and prints the result of every stage for side-by-side comparison.
print("Template for preprocessing")

sentence = "Universal we're recognition become being happened"

# Tokenization runs on the raw sentence; normalization lower-cases it and
# splits on single spaces.
normalized_list = sentence.lower().split(" ")
tokenized_words = nltk.word_tokenize(sentence)

print("Tokenized words: ", tokenized_words)
print("Normalized words: ", normalized_list)

# Stemming (NLTK Porter stemmer) is applied to the normalized words.
porter_stemmer = nltk.PorterStemmer()
roots = list(map(porter_stemmer.stem, normalized_list))

print("Roots (Stemming)", roots)

# Lemmatization (spaCy) is applied to the original, un-normalized sentence.
doc = nlp(sentence)
lemmatized_words = [tok.lemma_ for tok in doc]

print("Lemmatized words: ", lemmatized_words)

Template for preprocessing
Tokenized words:  ['Universal', 'we', "'re", 'recognition', 'become', 'being', 'happened']
Normalized words:  ['universal', "we're", 'recognition', 'become', 'being', 'happened']
Roots (Stemming) ['univers', "we'r", 'recognit', 'becom', 'be', 'happen']
Lemmatized words:  ['Universal', 'we', 'be', 'recognition', 'become', 'be', 'happen']
