In [106]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import time
from nltk.stem import WordNetLemmatizer
import spacy
from nltk.stem import PorterStemmer
import requests
from bs4 import BeautifulSoup


# *Statistics:*

In [107]:
def load_data(csv_file_path):
    df = pd.read_csv(csv_file_path, encoding='latin-1')
    df.columns = ['v1', 'v2', 'v3', 'v4', 'v5']
    return df

In [108]:
def print_total_sms(df: pd.DataFrame):
    total_messages = df.shape[0]
    print(f'Total number of SMS messages: {total_messages}')
    
def print_spam_ham_ratio(df: pd.DataFrame):
    spam_count = df[df['v1'] == 'spam'].shape[0]
    ham_count = df[df['v1'] == 'ham'].shape[0]
    print(f'Spam to ham ratio: {spam_count / ham_count:.2f}')

def average_word_length(df: pd.DataFrame, column_name):
    df['word_length'] = df[column_name].apply(lambda x: len(x))
    avg_word_length = df['word_length'].mean()
    print(f'Average word length: {avg_word_length:.2f}')
    
def most_common_words(df: pd.DataFrame, column_name):
    all_words = ' '.join(df[column_name]).split()
    most_common = Counter(all_words).most_common(5)
    print('5 most frequent words:', most_common)

def number_of_words_once(df: pd.DataFrame, column_name):
    all_words = ' '.join(df[column_name]).split()
    word_counts = Counter(all_words)
    words_once = sum(1 for count in word_counts.values() if count == 1)
    print(f'Number of words that only appear once: {words_once}')

def print_basic_statistics(df: pd.DataFrame):
    print_total_sms(df)
    print_spam_ham_ratio(df)
    average_word_length(df, 'v2')
    most_common_words(df, 'v2')
    number_of_words_once(df, 'v2')

def print_statistics_after_applying_technique(df: pd.DataFrame, column_name):
    average_word_length(df, column_name)
    most_common_words(df, column_name)


# *NLP Tokenization and Lemmatization:*


In [109]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /Users/noam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/noam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/noam/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# *NLTK:*

In [110]:
def tokenize_nltk(text):
    return word_tokenize(text)

def lemmatize_nltk(text):
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens]

def stem_nltk(text):
    tokens = word_tokenize(text)
    return [stemmer.stem(token) for token in tokens]


# *spaCy:*

In [111]:
def tokenize_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

def lemmatize_spacy(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]


# *Time Analysis:*

In [112]:
def analyze_tokenization_time(df, tokenizer, tokenizer_name):
    start_time = time.time()
    col_name = f'{tokenizer_name}_tokens'
    df[col_name] = df['v2'].apply(tokenizer)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Tokenization with {tokenizer_name} took {elapsed_time:.4f} seconds")


In [113]:
def analyze_lemmatization_time(df, lemmatization, lemmatization_name):
    start_time = time.time()
    col_name = f'{lemmatization_name}_lemmatization'
    df[col_name] = df['v2'].apply(lemmatization)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Lemmatization with {lemmatization_name} took {elapsed_time:.4f} seconds")


In [114]:
def analyze_stemming_time(df, stemming, stemming_name):
    start_time = time.time()
    col_name = f'{stemming_name}_stemming'
    df[col_name] = df['v2'].apply(stemming)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Stemming with {stemming_name} took {elapsed_time:.4f} seconds")


In [115]:
def load_data_and_tokenize(df: pd.DataFrame):
    analyze_tokenization_time(df, tokenize_nltk, 'nltk')
    analyze_tokenization_time(df, tokenize_spacy, 'spaCy')
    print_statistics_after_applying_technique(df, 'nltk_tokens')
    print_statistics_after_applying_technique(df, 'spaCy_tokens')
    df.to_csv('spam_tokenized.csv')

In [116]:
def load_data_and_lemmatization(df: pd.DataFrame):
    analyze_lemmatization_time(df, lemmatize_nltk, 'nltk')
    analyze_lemmatization_time(df, lemmatize_spacy, 'spaCy')
    print_statistics_after_applying_technique(df, 'nltk_lemmatization')
    print_statistics_after_applying_technique(df, 'spaCy_lemmatization')

    df.to_csv('spam_lemmatize.csv')

In [117]:
def load_data_and_stemming(df: pd.DataFrame):
    analyze_stemming_time(df, stem_nltk, 'nltk')
    analyze_stemming_time(df, lemmatize_spacy, 'spaCy')
    print_statistics_after_applying_technique(df, 'nltk_stemming')
    print_statistics_after_applying_technique(df, 'spaCy_stemming')

    df.to_csv('spam_stemming.csv')

### *NLTK Analyze:*

    The output of the NLTK tokenization is a list of tokens for each sentence, it is a simple tokenization process that splits the text by space and punctuation, for example it will split "I'm" into "I" and "'m".
    The output of the NLTK lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, In nltk it didnt success to reduce the word to its form, for example "searching" stay "searching", It didnt preform well.
    we used the porter stemmer for stemming the text.  stemming algorithms are known for their simplicity and effectiveness. It applies a series of rules to iteratively strip suffixes from words.
    The proccessing speed is very fast, in this case it took 0.3 seconds but the result is not accurate. 
    It is primarily designed for english.
    The complexity for tokenizing/lemmatization/stemming each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.
    
### *spaCy Analyze:*

    The output of the spaCy tokenization is a list of tokens for each sentence, it is a more complex tokenization process that takes into account the context of the words, for example it will not split "I'm" into "I" and "'m" and will tokenize it into "I'm".
    The output of the spaCy lemmatization is a list of lemmatized tokens for each sentence, it is a simple lemmatization process that reduces the words to their base form, for example it will reduce "running" to "run".
    SpaCy doesn't contain any function for stemming as it relies on lemmatization only
    The proccessing speed is slower than NLTK, in this case it took 313 seconds and the result is more accurate than NLTK.
    it suppport various languages.
    The complexity for tokenizing/lemmatization/stemming each row is 𝑂(𝑛) therefore tokenizing the entire file would be 𝑂(𝑚⋅𝑛), where 𝑚 is the average length of the text and 𝑛 is the number of rows.

In [118]:
spam_df = load_data('spam.csv')

In [119]:
print_basic_statistics(spam_df)

Total number of SMS messages: 5572
Spam to ham ratio: 0.15
Average word length: 80.12
5 most frequent words: [('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of words that only appear once: 9268


In [120]:
load_data_and_tokenize(spam_df)

Tokenization with nltk took 0.3815 seconds
Tokenization with spaCy took 29.1611 seconds
Average word length: 18.70


TypeError: sequence item 0: expected str instance, list found

In [None]:
load_data_and_lemmatization(spam_df)

In [None]:
load_data_and_stemming(spam_df)

# 

In [None]:

# Replace this with the URL of the public page you want to scrape
url = 'https://example.com/public-profile'

def scrape_text_from_profile(url):
    # Fetch the HTML content of the page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text data
    # Adjust the tag and class names based on the structure of the page you are scraping
    # For example, to extract all paragraphs:
    text_data = []
    for paragraph in soup.find_all('p'):
        text_data.append(paragraph.get_text())

    # Print the extracted text data
    for text in text_data:
        print(text)

# Example usage
scrape_text_from_profile(url)
