In [1]:
import pandas as pd
import nltk
import time
from nltk.tokenize import word_tokenize
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/noam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def tokenize_nltk(text):
    return word_tokenize(text)


def tokenize_spacy(text):
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    tokens = tokenizer(text)

    return list(tokens)


In [4]:
def analyze_tokenization_time(df, tokenizer, tokenizer_name):
    start_time = time.time()
    col_name = f'{tokenizer_name}_tokens'
    df[col_name] = df['v2'].apply(tokenizer)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Tokenization with {tokenizer_name} took {elapsed_time:.4f} seconds")


In [5]:
def load_data_and_tokenize(csv_file_path):
    df = pd.read_csv(csv_file_path, encoding='latin-1')
    df.columns = ['v1', 'v2', 'v3', 'v4', 'v5']

    analyze_tokenization_time(df, tokenize_nltk, 'nltk')

    analyze_tokenization_time(df, tokenize_spacy, 'spaCy')
    df.to_csv('spam_tokenized.csv')

NLTK Tokenization:

    The output of the NLTK tokenization is a list of words for each sentence, it is a simple tokenization process that splits the text by space and punctuation, for example it will split "I'm" into "I" and "'m".
    The proccessing speed is very fast, in this case it took 0.3 seconds. 
    it suppport various languages.
    The complexity for tokenizing each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑛 is the average length of the text and 𝑚 is the number of rows.
    
spaCy Tokenization:

    The output of the spaCy tokenization is a list of tokens for each sentence, it is a more complex tokenization process that takes into account the context of the words, for example it will not split "I'm" into "I" and "'m" and will tokenize it into "I'm".

In [6]:
load_data_and_tokenize('spam.csv')

Tokenization with nltk took 0.3683 seconds
Tokenization with spaCy took 306.5064 seconds
