In [58]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import time
from nltk.stem import WordNetLemmatizer
import spacy


# *Statistics:*

In [59]:
def load_data(csv_file_path):
    df = pd.read_csv(csv_file_path, encoding='latin-1')
    df.columns = ['v1', 'v2', 'v3', 'v4', 'v5']
    return df

In [60]:
def print_basic_statistics(df: pd.DataFrame):

    total_messages = df.shape[0]

    spam_count = df[df['v1'] == 'spam'].shape[0]
    ham_count = df[df['v1'] == 'ham'].shape[0]

    df['word_count'] = df['v2'].apply(lambda x: len(x.split()))
    avg_words_per_message = df['word_count'].mean()

    all_words = ' '.join(df['v2']).split()
    most_common_words = Counter(all_words).most_common(5)

    word_counts = Counter(all_words)
    words_once = sum(1 for count in word_counts.values() if count == 1)

    print(f'Total number of SMS messages: {total_messages}')
    print(f'Number of spam messages: {spam_count}')
    print(f'Number of ham messages: {ham_count}')
    print(f'Average number of words per message: {avg_words_per_message:.2f}')
    print('5 most frequent words:', most_common_words)
    print(f'Number of words that only appear once: {words_once}')


# *NLP Tokenization and Lemmatization:*


In [61]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /Users/noam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/noam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/noam/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# *NLTK:*

In [62]:
def tokenize_nltk(text):
    return word_tokenize(text)

def lemmatize_nltk(text):
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens]


# *spaCy:*

In [63]:
def tokenize_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

def lemmatize_spacy(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]


# *Time Analysis:*

In [64]:
def analyze_tokenization_time(df, tokenizer, tokenizer_name):
    start_time = time.time()
    col_name = f'{tokenizer_name}_tokens'
    df[col_name] = df['v2'].apply(tokenizer)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Tokenization with {tokenizer_name} took {elapsed_time:.4f} seconds")


In [65]:
def analyze_lemmatization_time(df, lemmatization, lemmatization_name):
    start_time = time.time()
    col_name = f'{lemmatization_name}_lemmatization'
    df[col_name] = df['v2'].apply(lemmatization)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Lemmatization_name with {lemmatization_name} took {elapsed_time:.4f} seconds")


In [66]:
def load_data_and_tokenize(df: pd.DataFrame):
    analyze_tokenization_time(df, tokenize_nltk, 'nltk')
    analyze_tokenization_time(df, tokenize_spacy, 'spaCy')
    df.to_csv('spam_tokenized.csv')

In [67]:
def load_data_and_lemmatization(df: pd.DataFrame):
    analyze_lemmatization_time(df, lemmatize_nltk, 'nltk')
    analyze_lemmatization_time(df, lemmatize_spacy, 'spaCy')
    df.to_csv('spam_lemmatize.csv')

### *NLTK Analyze:*

    The output of the NLTK tokenization is a list of tokens for each sentence, it is a simple tokenization process that splits the text by space and punctuation, for example it will split "I'm" into "I" and "'m".
    The output of the NLTK lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, In nltk it didnt success to reduce the word to its form, for example "searching" stay "searching", It didnt preform well.
    The proccessing speed is very fast, in this case it took 0.3 seconds. 
    It is primarily designed for english.
    The complexity for tokenizing each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.
    
### *spaCy Analyze:*

    The output of the spaCy tokenization is a list of tokens for each sentence, it is a more complex tokenization process that takes into account the context of the words, for example it will not split "I'm" into "I" and "'m" and will tokenize it into "I'm".
    The output of the spaCy lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, for example it will reduce "running" to "run".
    The proccessing speed is slower than NLTK, in this case it took 313 seconds.
    it suppport various languages.
    The complexity for tokenizing each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.

In [68]:
spam_df = load_data('spam.csv')

In [69]:
print_basic_statistics(spam_df)

Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Average number of words per message: 15.49
5 most frequent words: [('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of words that only appear once: 9268


In [70]:
load_data_and_tokenize(spam_df)

Tokenization with nltk took 0.3706 seconds
Tokenization with spaCy took 29.2516 seconds


In [71]:
load_data_and_lemmatization(spam_df)

Lemmatization_name with nltk took 0.5819 seconds
Lemmatization_name with spaCy took 28.2079 seconds
