In [211]:
import pandas as pd
import nltk
from collections import Counter
import time
from nltk.stem import WordNetLemmatizer
import spacy
from nltk.stem import PorterStemmer
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

## *All the statics are printed to the console and all the data manipulation of nlp are stored as csv and xlsx files*

## *Statistics:*

In [212]:
def load_data(csv_file_path):
    df = pd.read_csv(csv_file_path, encoding='latin-1')
    df.columns = ['v1', 'v2', 'v3', 'v4', 'v5']
    return df

In [213]:
def print_total_sms(df: pd.DataFrame):
    total_messages = df.shape[0]
    print(f'Total number of SMS messages: {total_messages}')
    
def print_spam_ham_ratio(df: pd.DataFrame):
    spam_count = df[df['v1'] == 'spam'].shape[0]
    ham_count = df[df['v1'] == 'ham'].shape[0]
    print(f'Spam to ham ratio: {spam_count / ham_count:.2f}')

def average_word_length(df: pd.DataFrame, column_name):
    df['word_length'] = df[column_name].apply(lambda x: len(x))
    avg_word_length = df['word_length'].mean()
    print(f'Average word length: {avg_word_length:.2f}')
    
def most_common_words(df: pd.DataFrame, column_name):
    all_words = ' '.join(df[column_name]).split()
    most_common = Counter(all_words).most_common(5)
    print('5 most frequent words:', most_common)

def number_of_words_once(df: pd.DataFrame, column_name):
    all_words = ' '.join(df[column_name]).split()
    word_counts = Counter(all_words)
    words_once = sum(1 for count in word_counts.values() if count == 1)
    print(f'Number of words that only appear once: {words_once}')

def print_basic_statistics(df: pd.DataFrame):
    print_total_sms(df)
    print_spam_ham_ratio(df)
    average_word_length(df, 'v2')
    most_common_words(df, 'v2')
    number_of_words_once(df, 'v2')

def print_statistics_after_applying_technique(df: pd.DataFrame, column_name):
    average_word_length(df, column_name)
    most_common_words(df, column_name)
    number_of_words_once(df, column_name)


# *NLP Tokenization and Lemmatization:*


In [214]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# *NLTK:*

In [215]:
def tokenize_nltk(text):
    return word_tokenize(text)

def lemmatize_nltk(text):
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens]

def stem_nltk(text):
    tokens = word_tokenize(text)
    return [stemmer.stem(token) for token in tokens]


# *spaCy:*

In [216]:
def tokenize_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

def lemmatize_spacy(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]


# *Time Analysis:*

In [217]:
def analyze_tokenization_time(df, tokenizer, tokenizer_name):
    start_time = time.time()
    col_name = f'{tokenizer_name}_tokens'
    df[col_name] = df['v2'].apply(tokenizer)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Tokenization with {tokenizer_name} took {elapsed_time:.4f} seconds")


In [218]:
def analyze_lemmatization_time(df, lemmatization, lemmatization_name):
    start_time = time.time()
    col_name = f'{lemmatization_name}_lemmatization'
    df[col_name] = df['v2'].apply(lemmatization)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Lemmatization with {lemmatization_name} took {elapsed_time:.4f} seconds")


In [219]:
def analyze_stemming_time(df, stemming, stemming_name):
    start_time = time.time()
    col_name = f'{stemming_name}_stemming'
    df[col_name] = df['v2'].apply(stemming)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Stemming with {stemming_name} took {elapsed_time:.4f} seconds")


In [220]:
def load_data_and_tokenize(df: pd.DataFrame):
    analyze_tokenization_time(df, tokenize_nltk, 'nltk')
    analyze_tokenization_time(df, tokenize_spacy, 'spaCy')
    printed_df = df.copy()
    printed_df['nltk_tokens'] = printed_df['nltk_tokens'].apply(lambda x: ' '.join(x))
    printed_df['spaCy_tokens'] = printed_df['spaCy_tokens'].apply(lambda x: ' '.join(x))
    print_statistics_after_applying_technique(printed_df, 'nltk_tokens')
    print_statistics_after_applying_technique(printed_df, 'spaCy_tokens')
    df.to_csv('spam_tokenized.csv')

In [221]:
def load_data_and_lemmatization(df: pd.DataFrame):
    analyze_lemmatization_time(df, lemmatize_nltk, 'nltk')
    analyze_lemmatization_time(df, lemmatize_spacy, 'spaCy')
    printed_df = df.copy()
    printed_df['nltk_lemmatization'] = printed_df['nltk_lemmatization'].apply(lambda x: ' '.join(x))
    printed_df['spaCy_lemmatization'] = printed_df['spaCy_lemmatization'].apply(lambda x: ' '.join(x))

    print_statistics_after_applying_technique(printed_df, 'nltk_lemmatization')
    print_statistics_after_applying_technique(printed_df, 'spaCy_lemmatization')

    df.to_csv('spam_lemmatize.csv')

In [222]:
def load_data_and_stemming(df: pd.DataFrame):
    analyze_stemming_time(df, stem_nltk, 'nltk')
    analyze_stemming_time(df, lemmatize_spacy, 'spaCy')
    printed_df = df.copy()
    printed_df['nltk_stemming'] = printed_df['nltk_stemming'].apply(lambda x: ' '.join(x))
    printed_df['spaCy_stemming'] = printed_df['spaCy_stemming'].apply(lambda x: ' '.join(x))

    print_statistics_after_applying_technique(printed_df, 'nltk_stemming')
    print_statistics_after_applying_technique(printed_df, 'spaCy_stemming')

    df.to_csv('spam_stemming.csv')

### *NLTK Analyze:*

    The output of the NLTK tokenization is a list of tokens for each sentence, it is a simple tokenization process that splits the text by space and punctuation, for example it will split "I'm" into "I" and "'m".
    The output of the NLTK lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, In nltk it didnt success to reduce the word to its form, for example "searching" stay "searching", It didnt preform well.
    we used the porter stemmer for stemming the text.  stemming algorithms are known for their simplicity and effectiveness. It applies a series of rules to iteratively strip suffixes from words.
    The proccessing speed is very fast, in this case it took 0.3 seconds but the result is not accurate. 
    It is primarily designed for english.
    The complexity for tokenizing/lemmatization/stemming each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.
    
### *spaCy Analyze:*

    The output of the spaCy tokenization is a list of tokens for each sentence, it is a more complex tokenization process that takes into account the context of the words, for example it will not split "I'm" into "I" and "'m" and will tokenize it into "I'm".
    The output of the spaCy lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, for example it will reduce "running" to "run".
    SpaCy doesn't contain any function for stemming as it relies on lemmatization only
    The proccessing speed is slower than NLTK, in this case it took 313 seconds and the result is more accurate than NLTK.
    it suppport various languages.
    The complexity for tokenizing/lemmatization/stemming each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.

In [223]:
spam_df = load_data('spam.csv')

In [224]:
print_basic_statistics(spam_df)

Total number of SMS messages: 5572
Spam to ham ratio: 0.15
Average word length: 80.12
5 most frequent words: [('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of words that only appear once: 9268


In [225]:
load_data_and_tokenize(spam_df)

Tokenization with nltk took 0.8406 seconds
Tokenization with spaCy took 35.3593 seconds
Average word length: 83.26
5 most frequent words: [('.', 4886), ('to', 2148), ('I', 1956), ('you', 1888), (',', 1871)]
Number of words that only appear once: 6187
Average word length: 83.17
5 most frequent words: [('.', 4945), ('to', 2148), ('I', 1988), ('you', 1878), (',', 1857)]
Number of words that only appear once: 6272


In [226]:
load_data_and_lemmatization(spam_df)

Lemmatization with nltk took 0.8883 seconds
Lemmatization with spaCy took 33.2866 seconds
Average word length: 82.68
5 most frequent words: [('.', 4886), ('to', 2148), ('I', 1956), ('you', 1888), (',', 1871)]
Number of words that only appear once: 5903
Average word length: 81.10
5 most frequent words: [('.', 4945), ('I', 3722), ('be', 3260), ('to', 2309), ('you', 2217)]
Number of words that only appear once: 5359


In [227]:
load_data_and_stemming(spam_df)

Stemming with nltk took 1.6674 seconds
Stemming with spaCy took 30.7978 seconds
Average word length: 79.26
5 most frequent words: [('.', 4886), ('i', 2900), ('to', 2241), ('you', 2228), (',', 1871)]
Number of words that only appear once: 4179
Average word length: 81.10
5 most frequent words: [('.', 4945), ('I', 3722), ('be', 3260), ('to', 2309), ('you', 2217)]
Number of words that only appear once: 5359


# 

In [228]:
url = 'https://news.sky.com/story/warning-to-uk-politicians-over-risk-of-audio-deepfakes-that-could-derail-the-general-election-13146573'

def scrape_text_from_profile(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    text_data = []
    
    scraped_data = soup.find_all('p')
    
    for text in scraped_data:
        text_data.append(text.get_text())
        
    return text_data
      
scraped_text = scrape_text_from_profile(url)


#### The website that we chose to scrape is sky news
#### We chose to perform tokenization and lemmatization using Spacy and stemming from nltk

### *Statistics before text proccesing*   

In [229]:
df_scraped_text = pd.DataFrame({"Scraped Text": scraped_text})
print_statistics_after_applying_technique(df_scraped_text, 'Scraped Text')

Average word length: 150.46
5 most frequent words: [('the', 45), ('to', 43), ('and', 33), ('of', 28), ('in', 17)]
Number of words that only appear once: 448


In [230]:
df_scraped_text['scraped_tokens'] = df_scraped_text['Scraped Text'].apply(tokenize_spacy)

In [231]:
df_scraped_text['scraped_lemmatization'] = df_scraped_text['Scraped Text'].apply(lemmatize_spacy)

In [232]:
df_scraped_text['scraped_stemming'] = df_scraped_text['Scraped Text'].apply(stem_nltk)

In [233]:
scraped_df = df_scraped_text.copy()
scraped_df['scraped_tokens'] = scraped_df['scraped_tokens'].apply(lambda x: ' '.join(x))
scraped_df['scraped_stemming'] = scraped_df['scraped_stemming'].apply(lambda x: ' '.join(x))
scraped_df['scraped_lemmatization'] = scraped_df['scraped_lemmatization'].apply(lambda x: ' '.join(x))

In [234]:
print_statistics_after_applying_technique(scraped_df, 'scraped_tokens')

Average word length: 154.65
5 most frequent words: [(',', 58), ('the', 45), ('.', 44), ('to', 43), ('and', 33)]
Number of words that only appear once: 373


In [235]:
print_statistics_after_applying_technique(scraped_df, 'scraped_stemming')

Average word length: 135.98
5 most frequent words: [(',', 58), ('the', 52), ('.', 44), ('to', 43), ('and', 34)]
Number of words that only appear once: 303


In [236]:
print_statistics_after_applying_technique(scraped_df, 'scraped_lemmatization')

Average word length: 148.57
5 most frequent words: [(',', 58), ('be', 54), ('the', 52), ('.', 44), ('to', 43)]
Number of words that only appear once: 310


## *WhatsApp Analysis*

In [237]:
with open('_chat.txt', 'r', encoding='utf-8') as file:
    content = file.read()

data = []
for row in content.split('\n'):
    data.append(row[row.rfind(':') + 1:])

whatsapp_df = pd.DataFrame({"whatsapp_text":data})


In [238]:
print_statistics_after_applying_technique(whatsapp_df, 'whatsapp_text')

Average word length: 25.33
5 most frequent words: [('אני', 30), ('את', 29), ('לא', 26), ('לי', 22), ('זה', 18)]
Number of words that only appear once: 501


In [239]:
whatsapp_df['scraped_tokens'] = whatsapp_df['whatsapp_text'].apply(tokenize_nltk)

In [240]:
whatsapp_df['scraped_lemmatization'] = whatsapp_df['whatsapp_text'].apply(lemmatize_nltk)

In [241]:
whatsapp_df['scraped_stemming'] = whatsapp_df['whatsapp_text'].apply(stem_nltk)

In [242]:
whatsapp_df_copy = whatsapp_df.copy()
whatsapp_df_copy['scraped_tokens'] = whatsapp_df_copy['scraped_tokens'].apply(lambda x: ' '.join(x))
whatsapp_df_copy['scraped_stemming'] = whatsapp_df_copy['scraped_stemming'].apply(lambda x: ' '.join(x))
whatsapp_df_copy['scraped_lemmatization'] = whatsapp_df_copy['scraped_lemmatization'].apply(lambda x: ' '.join(x))

In [243]:
print_statistics_after_applying_technique(whatsapp_df_copy, 'scraped_tokens')

Average word length: 24.64
5 most frequent words: [('?', 43), ('אני', 30), ('את', 29), ('לא', 26), ('לי', 22)]
Number of words that only appear once: 479


In [244]:
print_statistics_after_applying_technique(whatsapp_df_copy, 'scraped_stemming')

Average word length: 24.51
5 most frequent words: [('?', 43), ('אני', 30), ('את', 29), ('לא', 26), ('לי', 22)]
Number of words that only appear once: 476


In [245]:
print_statistics_after_applying_technique(whatsapp_df_copy, 'scraped_lemmatization')

Average word length: 24.63
5 most frequent words: [('?', 43), ('אני', 30), ('את', 29), ('לא', 26), ('לי', 22)]
Number of words that only appear once: 479


In [246]:
whatsapp_df_copy.drop(columns="word_length", inplace=True)
whatsapp_df_copy.to_excel('whatsapp_df.xlsx')

### After analyzing the results of the whatsapp scraped text we can infer that both nltk and spacy don't support the hebrew language