In [1]:
!pip install pandas textblob nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.5 MB/s[0m  [33m0:00:00[0m[31m56.8 MB/s[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import time
import re
import os
from collections import Counter
from textblob import TextBlob
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import spacy

# Download the NLTK data packages used by this notebook: 'punkt_tab'
# (tokenizer data) and 'stopwords' (stop-word lists).
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words held in a set; the process_* functions test tokens against it.
stop_words = set(stopwords.words('english'))
# Shared spaCy pipeline (the model downloaded in the first cell); used by process_spacy.
spacy_nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt_tab to /Users/win/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/win/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean(text):
    """Normalise ``text`` for tokenization.

    The text is lowercased, newlines and tabs are turned into spaces, every
    character that is neither a word character nor whitespace is removed, and
    runs of whitespace are squeezed to a single space with the ends trimmed.

    Returns the normalised string.
    """
    lowered = text.lower()

    # Newlines and tabs become plain spaces
    for whitespace_char in ('\n', '\t'):
        lowered = lowered.replace(whitespace_char, ' ')

    # Drop punctuation (anything outside \w and \s; underscores count as \w and stay)
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)

    # Squeeze whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def process_textblob(text):
    """Run the TextBlob variant of the pipeline on ``text``.

    Sentences are split from the raw text, while punctuation is still
    present, and each one is then normalised with ``clean``. Words are taken
    from a TextBlob built on the fully cleaned text, with entries found in
    ``stop_words`` dropped.

    Returns a 4-tuple: (cleaned text, cleaned sentences, filtered words,
    the ten most common filtered words with their counts).
    """
    # Sentence splitting needs the original punctuation, so split first and clean afterwards
    cleaned_sentences = [clean(str(sentence)) for sentence in TextBlob(text).sentences]

    # Word tokens come from the cleaned text
    cleaned_text = clean(text)
    filtered_words = []
    for word in TextBlob(cleaned_text).words:
        if word in stop_words:
            continue
        filtered_words.append(word)

    # Ten most frequent remaining words
    top_words = Counter(filtered_words).most_common(10)

    return cleaned_text, cleaned_sentences, filtered_words, top_words

def process_nltk(text):
    """Run the NLTK variant of the pipeline on ``text``.

    Sentences come from ``nltk.sent_tokenize`` on the raw text and are each
    normalised with ``clean``. Words come from ``nltk.word_tokenize`` on the
    cleaned text, with entries found in ``stop_words`` dropped.

    Returns a 4-tuple: (cleaned text, cleaned sentences, filtered words,
    the ten most common filtered words with their counts).
    """
    # Whole-document cleaned text, used for word tokenization
    cleaned_text = clean(text)

    # Sentences are split from the raw text, then cleaned one by one
    cleaned_sentences = list(map(clean, nltk.sent_tokenize(text)))

    # Tokenize words and drop stop words
    tokens = nltk.word_tokenize(cleaned_text)
    kept_words = [token for token in tokens if token not in stop_words]

    return cleaned_text, cleaned_sentences, kept_words, Counter(kept_words).most_common(10)

def process_spacy(text):
    """Run the spaCy variant of the pipeline on ``text``.

    Sentences are taken from the spaCy document built on the raw text, while
    punctuation is still present, and each one is normalised with ``clean``.
    Words are the tokens of a second document built on the cleaned text, with
    entries found in ``stop_words`` dropped.

    Returns a 4-tuple: (cleaned text, cleaned sentences, filtered words,
    the ten most common filtered words with their counts), matching
    ``process_textblob`` and ``process_nltk``.
    """
    # Run spaCy on the raw text so sentence segmentation still sees . and ,
    raw_doc = spacy_nlp(text)
    cleaned_sentences = [clean(sent.text.strip()) for sent in raw_doc.sents]

    cleaned_text = clean(text)
    cleaned_doc = spacy_nlp(cleaned_text)

    # Compare the token's *text* with the stop-word set. A spaCy Token object
    # is never equal to a plain string, so testing the Token itself would let
    # every stop word through.
    filtered_words = []
    for token in cleaned_doc:
        word = token.text.strip()
        # Skip whitespace-only tokens as well as stop words
        if word and word not in stop_words:
            filtered_words.append(word)

    top_words = Counter(filtered_words).most_common(10)

    # Return the cleaned sentences, as the other strategies do
    return cleaned_text, cleaned_sentences, filtered_words, top_words

In [4]:
def save_reports(strategy_name, clean_text, sentences, words, top_words, elapsed):
    """Write the report files for one strategy run into the ``output`` directory.

    Files written (relative to the current working directory):
      * ``output/cleaned_<strategy_name>.txt``    - the cleaned text.
      * ``output/words_<strategy_name>.txt``      - the sentences and the words,
        one per line, each section under a header that includes its count.
      * ``output/top10words_<strategy_name>.txt`` - ``top_words`` as a
        Word/Count table.
      * ``output/time_compares.txt``              - CSV of Strategy/Time(s);
        a row for this run is appended to whatever the file already holds.

    Args:
        strategy_name: Label used in the file names and in the timing row.
        clean_text: Cleaned text to save as-is.
        sentences: Sequence of sentences.
        words: Sequence of word tokens.
        top_words: Sequence of (word, count) pairs.
        elapsed: Run time in seconds; stored rounded to 6 decimals.
    """
    # Create output directory (no error if it is already there)
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    # Save cleaned text
    with open(f"{output_dir}/cleaned_{strategy_name}.txt", "w", encoding="utf-8") as f:
        f.write(clean_text)

    # Save tokenized sentences and words with their counts
    with open(f"{output_dir}/words_{strategy_name}.txt", "w", encoding="utf-8") as f:
        f.write(f"----Tokenized Sentences ({len(sentences)})----\n")
        f.write("\n".join(str(s) for s in sentences))
        # Blank line, then a header in the same ----...---- form as the first
        # one, so the separator is not glued onto the last sentence.
        f.write(f"\n\n----Tokenized Words ({len(words)})----\n")
        f.write("\n".join(str(w) for w in words))

    # Convert the (word, count) pairs to a dataframe and save it as text
    df_top = pd.DataFrame(top_words, columns=["Word", "Count"])
    with open(f"{output_dir}/top10words_{strategy_name}.txt", "w", encoding="utf-8") as f:
        f.write(df_top.to_string(index=False))

    time_file = f"{output_dir}/time_compares.txt"

    # One-row dataframe for the current run
    new_row = {"Strategy": strategy_name, "Time(s)": round(elapsed, 6)}
    df_new = pd.DataFrame([new_row])

    if os.path.exists(time_file):
        # Append the new row to the rows already recorded
        df_existing = pd.read_csv(time_file)
        df_final = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        # First run: the file will hold just this row
        df_final = df_new

    # Save without the row index
    df_final.to_csv(time_file, index=False)

In [5]:
# Strategy pattern at home: each entry pairs a processing function with the
# label used in its report file names.
strategies = [
    (process_textblob, "TextBlob"),
    (process_nltk, "NLTK"),
    (process_spacy, "spaCy")
]

# Remove any previous time_compares.txt so the timings of this run are not
# appended to results from an older run
time_file_path = "output/time_compares.txt"
if os.path.exists(time_file_path):
    os.remove(time_file_path)

input_file = "resource/alice29.txt"
with open(input_file, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Loop through each strategy
for strategy_func, name in strategies:
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() is a wall clock that can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # Run the strategy
    cleaned_text, sentences, final_words_str, top = strategy_func(raw_text)

    end_time = time.perf_counter()
    # Record run time in seconds
    elapsed = end_time - start_time

    # Create the report files for this strategy
    save_reports(name, cleaned_text, sentences, final_words_str, top, elapsed)