## Importing standard libraries

In [None]:
!pip install datasets # - if using colab
import pandas as pd
import numpy as np



## Loading in a dataset for checking templates

In [None]:
from datasets import load_dataset

# Optional alternative: the IMDb dataset, for exercising the templates on full-text reviews.
# imdb_dataset = load_dataset("imdb")
# print(imdb_dataset)
# print(imdb_dataset['train'].column_names)

# Amazon polarity reviews: the dataset used by every test cell in this notebook.
# Its rows combine an integer label with two text columns.
amazon_dataset = load_dataset("amazon_polarity")

# Show the split summary first, then the column names of the train split.
print(amazon_dataset, amazon_dataset["train"].column_names, sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})
['label', 'title', 'content']


### Viewing datasets contents

In [None]:
# IMDb preview, kept disabled inside a string literal (the IMDb load is commented out as well)
'''for i in range(2):
    print(imdb_dataset['train'][i])'''

# Print the first two records of the Amazon train split, one dict per line
for i in (0, 1):
    print(amazon_dataset["train"][i])


{'label': 1, 'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}
{'label': 1, 'title': 'The best soundtrack ever to anything.', 'content': "I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth ev

## General Text Cleaning Template - Word Only - May Be Deprecated - NLTK (Slow) Version

In [None]:
from joblib import Parallel, delayed
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Use one NLTK data directory (the Colab working area) for both the downloads
# below and every later resource lookup.
os.environ['NLTK_DATA'] = '/content/nltk_data'
nltk.data.path = [os.environ['NLTK_DATA']]

# Download each resource the cleaning template relies on, in a fixed order.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, download_dir=os.environ['NLTK_DATA'])

def clean_text(df, columns, lowercase=True, remove_stopwords=True, stemming=False, lemmatization=True, remove_punctuation=True, n_jobs=-1):
  '''
   Cleans text data in the specified columns with NLTK by applying optional steps:
   lowercasing, punctuation removal, stopword removal, stemming, and lemmatization.

   Parameters:
    df: The pandas DataFrame to clean. The selected columns are overwritten in place.
    columns: Type-str or list of str, The column name(s) to clean. A single name is treated as a one-item list.
    lowercase: Converts the text to lowercase
    remove_stopwords: Removes English stopwords from the columns.
    stemming: Applies Porter stemming to the tokens
    lemmatization: Applies WordNet lemmatization to the tokens (runs after stemming when both are enabled)
    remove_punctuation: Removes punctuation from the text columns. Defaulted to True but for tasks like sentiment analysis can change to False.
    n_jobs: Type-int, Default=-1, The number of parallel jobs to run (-1 uses all available CPUs) - Much faster preprocessing

   Info:
    Stopwords - Stop words are common words that are removed from text before analysis because they don't add much meaning. (a, and, the)
    Stemming - Stemming reduces words to their root form, or stem. ((Swimming, Swims) - Swim)
    Lemmatization - Lemmatization reduces words to their root form, or lemma. (better - good) ((running, ran, runs) - run)

    Which to use: Stemming just removes common suffixes from the end of word tokens, lemmatization ensures the output word is an existing normalized form of the word.
    Stemming - Use when speed is more important than accuracy, and you can tolerate some inaccuracy
    Lemmatization - Use when precision and meaningfulness of words are more important than speed

   Returns:
    df: The same DataFrame object that was passed in, with the selected columns cleaned

   Raises:
    ValueError: If a requested column does not exist or is not of string type.
                All columns are checked before any column is modified.
  '''
  # A bare string would otherwise be iterated character by character
  if isinstance(columns, str):
    columns = [columns]

  # Validate everything up front so a bad column never leaves df half-cleaned
  for column in columns:
    if column not in df.columns:
      raise ValueError(f"Column '{column}' does not exist in the dataframe.")

    if not pd.api.types.is_string_dtype(df[column]):
      raise ValueError(f"Column '{column}' is not of string type and cannot be processed.")

  # Make sure NLTK searches the configured data directory (falls back to the Colab
  # location) without discarding any search paths that are already registered
  nltk_data_dir = os.environ.get('NLTK_DATA', '/content/nltk_data')
  if nltk_data_dir not in nltk.data.path:
    nltk.data.path.insert(0, nltk_data_dir)

  # Initializing the nltk tools
  stop_words = set(stopwords.words('english'))
  ps = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  # Built once and reused for every text instead of once per text
  punctuation_table = str.maketrans('', '', string.punctuation)

  def process_text(text):
    # Lowercasing the text
    if lowercase:
      text = text.lower()

    # Removing punctuation
    if remove_punctuation:
      text = text.translate(punctuation_table)

    # Tokenizing the text
    tokens = nltk.word_tokenize(text)

    # Removing stopwords
    if remove_stopwords:
      tokens = [word for word in tokens if word not in stop_words]

    # Applying stemming
    if stemming:
      tokens = [ps.stem(word) for word in tokens]

    # Applying lemmatization
    if lemmatization:
      tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

  # Parallelize the cleaning process, one column at a time
  for column in columns:
    df.loc[:, column] = Parallel(n_jobs=n_jobs)(delayed(process_text)(text) for text in df[column])

  return df


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /content/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Testing the general cleaning template

In [None]:
'''# Converting the IMDb train data into a pandas DataFrame for compatibility
imdb_df = pd.DataFrame(imdb_dataset['train'])

# Applying the cleaning function to the "text" and "content" columns
cleaned_imdb_df = clean_text(imdb_df, columns=['text']) # Leaving everything else as defaults

# Viewing the cleaned row
print("\nCleaned imdb Text:")
print(cleaned_imdb_df['text'].iloc[0])'''

# Draw 10,000 random rows of the Amazon train split to keep the run short.
amazon_sample = pd.DataFrame(amazon_dataset['train']).sample(n=10000)

# Clean both text columns with the default options.
cleaned_amazon_sample = clean_text(
    amazon_sample,
    columns=['title', 'content'],
)

# Preview the cleaned columns.
print("\nCleaned Amazon Text:", cleaned_amazon_sample[['title', 'content']].head(), sep="\n")



Cleaned Amazon Text:
                                 title  \
3465105  shockingly full ghastly error   
1548950                 turandot stink   
3530811                       dont buy   
162235              hellblazers review   
2595775                  great product   

                                                   content  
3465105  good thing doesnt use hebrew writing inside co...  
1548950  somebody sue alexander rahbari omitting import...  
3530811  suck hard dried throw mony trash never use ser...  
162235   great song cd would say evolution theme line s...  
2595775  product delievered promised picture matched pr...  


## Advanced Cleaning Text Template - Word + Sentence Choice - NLTK (Slow) Version

In [None]:
import os
import pandas as pd
import nltk
import string
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Use one NLTK data directory for both the downloads below and later lookups
# (adjust the path if you are not running on Colab).
os.environ['NLTK_DATA'] = '/content/nltk_data'
nltk.data.path = [os.environ['NLTK_DATA']]

# Download each resource the template relies on, in a fixed order.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, download_dir=os.environ['NLTK_DATA'])

def preprocess_text(df, cleaning_type='word', columns=None, text_column=None,
                    lowercase=True, remove_stopwords=True, stemming=False,
                    lemmatization=True, remove_punctuation=True, n_jobs=-1):
    '''
    Preprocess text data in a Pandas DataFrame with NLTK using either word-level or sentence-level cleaning.

    Parameters:
        df: Pandas DataFrame containing the dataset.
        cleaning_type: str, either 'word' or 'sentence'.
            - 'word': Applies cleaning to entire text in the specified columns.
            - 'sentence': Splits text in a column into sentences and cleans each sentence.
        columns: str or list of str, required if cleaning_type is 'word'. The column name(s) to clean.
            A single name is treated as a one-item list.
        text_column: str, required if cleaning_type is 'sentence'. The column containing full articles.
        lowercase: bool, default True. Converts text to lowercase.
        remove_stopwords: bool, default True. Removes English stopwords from the text.
        stemming: bool, default False. Applies Porter stemming to tokens.
        lemmatization: bool, default True. Applies WordNet lemmatization to tokens
            (runs after stemming when both are enabled).
        remove_punctuation: bool, default True. Removes punctuation from the text.
        n_jobs: int, default -1. Number of parallel jobs (-1 uses all available CPUs).

    Returns:
        If cleaning_type is 'word': Returns the DataFrame that was passed in, with the specified
                                    columns overwritten in place by their cleaned text.
        If cleaning_type is 'sentence': Returns a new DataFrame with columns ['article_id', 'sentence'],
                                       one row per cleaned sentence. 'article_id' is the 0-based
                                       position of the source row within df (not its index label).
                                       The columns are present even when no sentence is produced.

    Raises:
        ValueError: If cleaning_type is invalid, the required column argument is missing,
                    a column does not exist, or (word-level only) a column is not of string type.
                    All arguments are validated before any text is processed.

    Notes:
        - Stopwords are common words (like 'a', 'the') that might be removed to focus on meaningful words.
        - Stemming reduces words to their root form (e.g., 'running' becomes 'run'), though it may yield non-dictionary forms.
        - Lemmatization reduces words to their dictionary form (e.g., 'better' becomes 'good') for more precise output.
    '''

    # Validate the request first so bad input fails before corpora are loaded
    # and before any column of df has been overwritten.
    if cleaning_type == 'word':
        if columns is None:
            raise ValueError("For word-level cleaning, please provide a list of column names in 'columns'.")
        # A bare string would otherwise be iterated character by character
        if isinstance(columns, str):
            columns = [columns]
        for col in columns:
            if col not in df.columns:
                raise ValueError(f"Column '{col}' does not exist in the DataFrame.")
            if not pd.api.types.is_string_dtype(df[col]):
                raise ValueError(f"Column '{col}' must be of string type.")
    elif cleaning_type == 'sentence':
        if text_column is None:
            raise ValueError("For sentence-level cleaning, please provide the 'text_column' parameter.")
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' does not exist in the DataFrame.")
    else:
        raise ValueError("Invalid cleaning_type. Choose either 'word' or 'sentence'.")

    # Make sure NLTK searches the configured data directory (falls back to the Colab
    # location) without discarding any search paths that are already registered.
    nltk_data_dir = os.environ.get('NLTK_DATA', '/content/nltk_data')
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.insert(0, nltk_data_dir)

    # Initialize common NLP tools
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Built once and reused for every text instead of once per text
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean_fragment(fragment):
        # Shared cleaning pipeline for one piece of text (a whole cell or one sentence).
        if lowercase:
            fragment = fragment.lower()
        if remove_punctuation:
            fragment = fragment.translate(punctuation_table)
        tokens = word_tokenize(fragment)
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]
        if stemming:
            tokens = [ps.stem(word) for word in tokens]
        if lemmatization:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)

    def _clean_sentences(text):
        # Sentences are split on the raw text, before lowercasing/punctuation removal.
        return [_clean_fragment(sentence) for sentence in sent_tokenize(text)]

    if cleaning_type == 'word':
        # Apply cleaning in parallel on each column's text
        for col in columns:
            df.loc[:, col] = Parallel(n_jobs=n_jobs)(
                delayed(_clean_fragment)(text) for text in df[col]
            )
        return df

    # Sentence-level: each row's text becomes a list of cleaned sentences
    cleaned_lists = Parallel(n_jobs=n_jobs)(
        delayed(_clean_sentences)(text) for text in df[text_column]
    )
    sentence_data = [
        {'article_id': article_id, 'sentence': sentence}
        for article_id, sentences in enumerate(cleaned_lists)
        for sentence in sentences
    ]
    # Explicit columns keep the schema stable even when there are no sentences
    return pd.DataFrame(sentence_data, columns=['article_id', 'sentence'])

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data] Downloading package omw-1.4 to /content/nltk_data...


### Testing the Advanced Cleaning Template

In [None]:
# Draw 10,000 random rows of the Amazon train split to keep the run short.
amazon_sample = pd.DataFrame(amazon_dataset['train']).sample(n=10000)

# Sentence-level cleaning of the review body with the default options.
cleaned_amazon_sample = preprocess_text(
    amazon_sample,
    cleaning_type='sentence',
    text_column='content',
)

# Preview the first cleaned sentences.
print("\nCleaned Amazon Text:", cleaned_amazon_sample.head(), sep="\n")


Cleaned Amazon Text:
   article_id                                          sentence
0           0  bought unit christmas last year worked fine june
1           0                              spent 15 send repair
2           0                             sent refurbished unit
3           0                              new unit barely work
4           0                              2 unit 1 year broken


## Advanced Cleaning Text Template - Word + Sentence Choice - spaCy (Fast) Version

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import pandas as pd
import string
from joblib import Parallel, delayed

def get_nlp():
    """
    Return the shared spaCy English pipeline, loading it on first use.

    The pipeline is `en_core_web_sm` with the "ner" and "parser" components
    disabled and a "sentencizer" component added so that `doc.sents` is
    available. It is cached as an attribute on this function, so the model
    is loaded once per Python process.

    Returns:
        The cached spaCy pipeline object.
    """
    cached = getattr(get_nlp, "nlp", None)
    if cached is None:
        pipeline = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        # Add the sentencizer to enable sentence splitting
        pipeline.add_pipe("sentencizer")
        # Cache only after configuration succeeded, so a failure above can never
        # leave a pipeline without sentence boundaries behind for later calls.
        get_nlp.nlp = cached = pipeline
    return cached

# Note for Colab/local usage:
# - Ensure spaCy is installed: pip install spacy
# - Ensure the English model is downloaded:
#    Local: python -m spacy download en_core_web_sm
#    Colab: !python -m spacy download en_core_web_sm

def clean_text(df, columns, lowercase=True, remove_stopwords=True,
                    lemmatization=True, remove_punctuation=True, batch_size=50):
    '''
    Cleans text data in the specified columns by applying optional steps like lowercasing,
    punctuation removal, stopword removal, and lemmatization using spaCy.

    Parameters:
        df: The pandas dataframe containing the data. The selected columns are overwritten in place.
        columns: Type-str or list of str, The name(s) of the columns to apply the template to.
                 A single name is treated as a one-item list.
        lowercase: bool, Converts the text to lowercase before it is passed to spaCy.
        remove_stopwords: bool, Removes tokens that spaCy flags as stopwords.
        lemmatization: bool, Emits each token's lemma instead of its original text.
        remove_punctuation: bool, Removes tokens that spaCy flags as punctuation. Default True.
        batch_size: int, Default=50, Number of texts handed to the spaCy pipeline per batch.

    Info:
        Stopwords - Common words (e.g., 'a', 'and', 'the') that may be removed to focus on meaningful words.
        Lemmatization - Converts words to their dictionary form.

        Which to use:
            - Use stemming if speed is key and minor inaccuracies are acceptable (not available here).
            - Use lemmatization when you need precise, dictionary-based forms.

    Returns:
        df: The same DataFrame object that was passed in, with the selected columns cleaned.

    Raises:
        ValueError: If a requested column does not exist. All columns are checked
                    before any column is modified.
    '''
    # A bare string would otherwise be iterated character by character
    if isinstance(columns, str):
        columns = [columns]

    # Validate everything up front so a bad column never leaves df half-cleaned
    for column in columns:
        if column not in df.columns:
            raise ValueError(f"Column '{column}' does not exist in the dataframe.")

    def keep(token):
        # A token is dropped when its category is switched off by the caller
        if remove_punctuation and token.is_punct:
            return False
        if remove_stopwords and token.is_stop:
            return False
        return True

    def process_texts(texts):
        # Honour `lowercase` the same way preprocess_text's word-level path does:
        # lowercase the raw text before it reaches the pipeline.
        if lowercase:
            texts = [text.lower() for text in texts]
        nlp = get_nlp()
        cleaned = []
        for doc in nlp.pipe(texts, batch_size=batch_size):
            kept = [token.lemma_ if lemmatization else token.text
                    for token in doc if keep(token)]
            cleaned.append(' '.join(kept))
        return cleaned

    # Process each column in batches
    for column in columns:
        df.loc[:, column] = process_texts(df[column].tolist())

    return df

def preprocess_text(df, cleaning_type='word', columns=None, text_column=None,
                    lowercase=True, remove_stopwords=True, lemmatization=True,
                    remove_punctuation=True, n_jobs=-1):
    '''
    Preprocess text data in a Pandas DataFrame using either word-level or sentence-level cleaning using spaCy.

    Parameters:
        df: Pandas DataFrame containing the dataset.
        cleaning_type: str, either 'word' or 'sentence'.
            - 'word': Applies cleaning to entire text in the specified columns.
            - 'sentence': Splits text in a column into sentences and cleans each sentence.
        columns: str or list of str, required if cleaning_type is 'word'. The column name(s) to clean.
            A single name is treated as a one-item list.
        text_column: str, required if cleaning_type is 'sentence'. The column containing full articles.
        lowercase: bool, default True. Converts text to lowercase before it is passed to spaCy
            (applies to both cleaning types).
        remove_stopwords: bool, default True. Removes tokens that spaCy flags as stopwords.
        lemmatization: bool, default True. Emits each token's lemma instead of its original text.
        remove_punctuation: bool, default True. Removes tokens that spaCy flags as punctuation.
        n_jobs: int, default -1. Accepted for signature compatibility only; this implementation
            processes rows sequentially and does not use it.

    Returns:
        If cleaning_type is 'word': Returns the DataFrame that was passed in, with the specified
                                    columns overwritten in place by their cleaned text.
        If cleaning_type is 'sentence': Returns a new DataFrame with columns ['article_id', 'sentence'],
                                       one row per cleaned sentence. 'article_id' is the 0-based
                                       position of the source row within df (not its index label).
                                       The columns are present even when no sentence is produced.

    Raises:
        ValueError: If cleaning_type is invalid, the required column argument is missing,
                    or a column does not exist.

    Notes:
        - Stopwords are common words (like 'a', 'the') that might be removed to focus on meaningful words.
        - Lemmatization reduces words to their dictionary form for more precise output.

    Note:
        - For local usage, ensure the spaCy model is downloaded: python -m spacy download en_core_web_sm
        - In Colab, you might need to run: !python -m spacy download en_core_web_sm
    '''

    def parse(text):
        # Lowercasing happens on the raw text for BOTH cleaning types, so the
        # `lowercase` flag behaves the same in word and sentence mode.
        return get_nlp()(text.lower() if lowercase else text)

    def render(tokens):
        # Join the surviving tokens of a doc or sentence span into one string.
        kept = []
        for token in tokens:
            if remove_punctuation and token.is_punct:
                continue
            if remove_stopwords and token.is_stop:
                continue
            kept.append(token.lemma_ if lemmatization else token.text)
        return ' '.join(kept)

    # Helper for word-level processing using spaCy
    def process_text_word(text):
        return render(parse(text))

    # Helper for sentence-level processing using spaCy (runs sequentially, no joblib)
    def process_text_sentence(text):
        return [render(sent) for sent in parse(text).sents]

    if cleaning_type == 'word':
        if columns is None:
            raise ValueError("For word-level cleaning, please provide a list of column names in 'columns'.")
        # A bare string would otherwise be iterated character by character
        if isinstance(columns, str):
            columns = [columns]
        for col in columns:
            if col not in df.columns:
                raise ValueError(f"Column '{col}' does not exist in the DataFrame.")
        # Series.map per column replaces the deprecated DataFrame.applymap
        for col in columns:
            df.loc[:, col] = df[col].map(process_text_word).tolist()
        return df

    elif cleaning_type == 'sentence':
        if text_column is None:
            raise ValueError("For sentence-level cleaning, please provide the 'text_column' parameter.")
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' does not exist in the DataFrame.")

        cleaned_lists = df[text_column].apply(process_text_sentence)

        # One output row per cleaned sentence, tagged with the position of its source row
        sentence_data = [
            {'article_id': article_id, 'sentence': sentence}
            for article_id, sentences in enumerate(cleaned_lists)
            for sentence in sentences
        ]
        # Explicit columns keep the schema stable even when there are no sentences
        return pd.DataFrame(sentence_data, columns=['article_id', 'sentence'])

    else:
        raise ValueError("Invalid cleaning_type. Choose either 'word' or 'sentence'.")


### Testing the Advanced Cleaning Template - Fast Version

In [None]:
# Convert the train split to a pandas DataFrame
amazon_train_df = amazon_dataset['train'].to_pandas()

# Convert the string columns explicitly to avoid read-only memory issues
amazon_train_df["title"] = amazon_train_df["title"].astype(str).copy()
amazon_train_df["content"] = amazon_train_df["content"].astype(str).copy()

# Take a sample for faster processing
amazon_train_df = amazon_train_df.sample(10000)

# Word level cleaning
# clean_text overwrites the columns of the frame it receives, so it is given a copy.
# Passing amazon_train_df itself would make the sentence-level step below run on
# already-cleaned text (punctuation gone, so every review collapses to one sentence).
word_cleaned_train_df = clean_text(
    amazon_train_df.copy(),
    columns=['title', 'content'],
)

print('Word leveling cleaning:\n')
print(word_cleaned_train_df.head())


# Sentence leveling cleaning (runs on the untouched raw sample)
sentence_cleaned_sentence_df = preprocess_text(
    amazon_train_df,
    cleaning_type='sentence',
    text_column='content',
)

# The result is a new DataFrame where each row corresponds to a cleaned sentence
# along with an 'article_id' giving the position of the source row in the sample.
print('\nSentence leveling cleaning:\n')
print(sentence_cleaned_sentence_df.head())



Word leveling cleaning:

         label            title  \
841664       1   Pink Floyd God   
2824140      1            Prime   
2485286      0   Thunderbird U.   
3421104      1  promise new duo   
3022752      0             stop   

                                                   content  
841664   Pink Floyd master commander duplicate musical ...  
2824140  try save money water conditioner stress coat w...  
2485286  Thunderbird American Graduate School Internati...  
3421104  newcomer duo nice sound voice blend predict go...  
3022752  stop make Medal Honor game stupid a.i idiotic ...  

Sentence leveling cleaning:

   article_id                                           sentence
0           0  pink Floyd master commander duplicate musical ...
1           1  try save money water conditioner stress coat w...
2           2  Thunderbird American Graduate School Internati...
3           3  newcomer duo nice sound voice blend predict go...
4           4  stop Medal Honor game stupid

## Ranking Feature Extraction - Word + Sentence

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_features(df, feature_level='sentence', text_column=None):
    '''
    Extracts features for ranking from text data at either the sentence or word level.

    For sentence-level data (each row is a sentence), it computes:
      - Normalized position of the sentence (row position divided by total number of rows).
      - Sentence length (number of whitespace-separated words).
      - Average TF-IDF score of the sentence (mean over the whole fitted vocabulary).

    For word-level data (each row is a word), it computes:
      - Normalized position of the word (row position divided by total number of rows).
      - Word length (number of characters).
      - TF-IDF score of the word (sum of the row's TF-IDF vector).

    Parameters:
        df: Pandas DataFrame containing the text data.
        feature_level: str, either 'sentence' or 'word'.
        text_column: str, name of the column containing the text (sentences or words).

    Returns:
        feature_matrix: NumPy array of shape (n, 3) with columns:
                        [normalized_position, length, tfidf_feature]
                        where 'length' is word count (for sentences) or character count (for words),
                        and 'tfidf_feature' is the average TF-IDF (sentence) or sum TF-IDF (word).

    Raises:
        ValueError: If text_column is not provided or feature_level is not 'sentence' or 'word'.
                    Both are checked before the TF-IDF model is fitted.
    '''
    if text_column is None:
        raise ValueError("Please provide the 'text_column' parameter.")
    # Reject a bad feature_level before paying for the TF-IDF fit
    if feature_level not in ('sentence', 'word'):
        raise ValueError("Invalid feature_level. Choose either 'sentence' or 'word'.")

    texts = df[text_column]
    n = len(df)
    # Compute normalized positions for each row.
    positions = np.arange(n) / n

    # Fit TF-IDF on the provided text column. The result stays sparse: a dense
    # (rows x vocabulary) array does not fit in memory for corpora of this size,
    # and the row-wise reductions below work directly on the sparse matrix.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    if feature_level == 'sentence':
        # Number of words per sentence and mean TF-IDF per row.
        lengths = texts.str.split().str.len()
        row_scores = tfidf_matrix.mean(axis=1)
    else:
        # Number of characters per word and summed TF-IDF per row.
        lengths = texts.str.len()
        row_scores = tfidf_matrix.sum(axis=1)

    # Sparse reductions return an (n, 1) matrix; flatten it to a 1-D array.
    tfidf_feature = np.asarray(row_scores).ravel()

    # Combine the features into a single matrix.
    return np.column_stack((positions, lengths, tfidf_feature))

### Testing the Feature Extraction

In [None]:
# Build the (n, 3) ranking-feature matrix from the sentence-level frame;
# `cleaned_amazon_sample` must contain a 'sentence' column at this point.
features_sentences = extract_features(
    df=cleaned_amazon_sample,
    feature_level='sentence',
    text_column='sentence',
)

print("Sentence-level features:", features_sentences, sep="\n")

Sentence-level features:
[[0.00000000e+00 8.00000000e+00 7.28027722e-05]
 [2.14905871e-05 4.00000000e+00 5.24589235e-05]
 [4.29811742e-05 3.00000000e+00 4.46189938e-05]
 ...
 [9.99935528e-01 7.00000000e+00 6.77166189e-05]
 [9.99957019e-01 2.00000000e+00 3.71077019e-05]
 [9.99978509e-01 2.00000000e+00 3.68929111e-05]]


## TF-IDF Vectorizer Template - For Smaller Datasets

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(df, columns, max_features=1000, ngram_range=(1, 1), max_df=1.0, min_df=1, stop_words='english', return_vectorizer=True):
    '''
    Applies TF-IDF vectorization to the specified text columns in a DataFrame. - Meant for simpler datasets

    The selected columns are joined row by row (separated by a single space) into one
    document per row, and a single TfidfVectorizer is fitted on those documents.

    Parameters:
        df: Pandas DataFrame containing the dataset
        columns: Type-str or list of str, The column (or list of columns) to vectorize
        max_features: Type-int, Default=1000, Maximum number of features to keep
        ngram_range: Type-tuple, Default=(1, 1), The range of n-grams to consider (e.g., (1, 2) for unigrams and bigrams)
        max_df: Type-float, Default=1.0, Ignore terms that appear in more than max_df proportion of documents
        min_df: Type-int, Default=1, Ignore terms that appear in fewer than min_df documents
        stop_words: Type-str or list, Default='english', Stop words to remove (e.g., 'english' or custom list)
        return_vectorizer: Default=True, Whether to return the fitted vectorizer along with the feature matrix

    Info:
        Matrix Only (return_vectorizer=False):
          Useful when you are working with a single dataset (e.g., train data) and will not need the vectorizer for new data.

        Matrix + Vectorizer (return_vectorizer=True):
          Needed when you will later process unseen data (e.g., test or validation sets):
          calling vectorizer.transform(new_text) reuses the vocabulary learned here.

        ngram_range:
          An n-gram is a sequence of n consecutive words or tokens from a text.
            Unigram: Single word (e.g., "machine," "learning")
            Bigram: Two consecutive words (e.g., "machine learning," "learning models")
            Etc...

        Non-string cells:
          Every cell is cast with str() before joining, so numbers are kept as text and
          missing values become the literal token 'nan' - fill or drop them beforehand
          if that token is unwanted.

    Returns:
        X_tfidf: Sparse matrix of TF-IDF features - ready to be fed directly into the model
        vectorizer: The fitted TfidfVectorizer object (only if return_vectorizer=True)
    '''
    # A bare column name would select a Series; wrapping it keeps df[columns] a DataFrame
    # so the row-wise apply below works for one column as well as for many
    if isinstance(columns, str):
        columns = [columns]

    # Combining all selected columns into a single text column for vectorization.
    # astype(str) keeps str.join from raising TypeError on numeric or missing cells
    combined_text = df[columns].apply(lambda row: ' '.join(row.astype(str)), axis=1)

    # Initializing TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        max_df=max_df,
        min_df=min_df,
        stop_words=stop_words
    )

    # Fitting and transforming the combined text data
    X_tfidf = vectorizer.fit_transform(combined_text)

    if return_vectorizer:
        return X_tfidf, vectorizer
    return X_tfidf


### Testing the TF-IDF Template

In [None]:
# Applying TF-IDF vectorization to the "text" column
# NOTE(review): cleaned_imdb_df must already exist in the kernel. The IMDb load in the
# first cell of this notebook is commented out - confirm an earlier cell still builds it.
X_tfidf, vectorizer = tfidf_vectorize(
    cleaned_imdb_df,
    columns=['text'],
    max_features=1000,
    ngram_range=(1, 2),  # unigrams and bigrams
)

# Check feature names
print("TF-IDF Features:", vectorizer.get_feature_names_out()[:10])
# Check TF-IDF matrix shape
print("TF-IDF Matrix Shape:", X_tfidf.shape)


TF-IDF Features: ['10' '12' '15' '20' '30' '50' '70' '80' '90' 'ability']
TF-IDF Matrix Shape: (25000, 1000)


## Keras Tokenizer + Padding Template - Word Only - May Be Deprecated

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def word_tokenizer_and_padding(df, columns, max_vocab_size=10000, max_seq_length=100, padding_type='post', truncating_type='post', oov_token="<OOV>"):
    '''
    Tokenizes and pads text data for neural network models.

    The selected columns are joined row by row (separated by a single space), a Keras
    Tokenizer is fitted on the joined text, and every row is converted to a sequence
    of word indices that is padded or truncated to max_seq_length.

    Parameters:
        df: Pandas DataFrame containing the dataset
        columns: Type-str or list of str, The column (or list of columns) to tokenize
        max_vocab_size: Type-int, Default=10000, Maximum size of the vocabulary (most frequent words)
        max_seq_length: Type-int, Default=100, Maximum length of sequences after padding
        padding_type: Type-str, Default='post', Padding type ('post' or 'pre')
        truncating_type: Type-str, Default='post', Truncating type ('post' or 'pre')
        oov_token: Type-str, Default='<OOV>', Token to use for out-of-vocabulary words

    Info:
        Non-string cells:
          Every cell is cast with str() before joining, so numbers are kept as text and
          missing values become the literal token 'nan' - fill or drop them beforehand
          if that token is unwanted.

    Returns:
        padded_sequences: NumPy array of padded sequences
        tokenizer: The fitted Keras Tokenizer object
    '''
    # A bare column name would select a Series; wrapping it keeps df[columns] a DataFrame
    # so the row-wise apply below works for one column as well as for many
    if isinstance(columns, str):
        columns = [columns]

    # Combining text from multiple columns into a single string for tokenization.
    # astype(str) keeps str.join from raising TypeError on numeric or missing cells
    combined_text = df[columns].apply(lambda row: ' '.join(row.astype(str)), axis=1)

    # Initializing the tokenizer
    tokenizer = Tokenizer(num_words=max_vocab_size, oov_token=oov_token)

    # Fitting the tokenizer on the text
    tokenizer.fit_on_texts(combined_text)

    # Transforming the text into sequences
    sequences = tokenizer.texts_to_sequences(combined_text)

    # Padding the sequences to the desired length
    padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding=padding_type, truncating=truncating_type)

    return padded_sequences, tokenizer


### Testing the Keras Tokenizer template

In [None]:
# Applying the tokenizer and padding to "title" and "content" columns
# (each row's title and content are joined into one text before tokenizing)
padded_sequences, tokenizer = word_tokenizer_and_padding(
    cleaned_amazon_sample,
    columns=['title', 'content'],
    max_vocab_size=10000,
    max_seq_length=100,
    padding_type='post',
    truncating_type='post'
)

# View results
# Shape is (number of rows, max_seq_length); with 'post' padding the zeros sit at the end
print("Padded Sequences Shape:", padded_sequences.shape)
print("First Padded Sequence:", padded_sequences[0])
# word_index maps each word to its integer id; only the first 10 entries are shown
print("Word Index Sample:", dict(list(tokenizer.word_index.items())[:10]))

Padded Sequences Shape: (10000, 100)
First Padded Sequence: [   1  225 7611  545    6   33   82   27 2710  188  475  151    1 2214
   20 1006  188  545    1  402   20  545  986  168  595    1 9019    1
  354   66  579  234  116    2   73 7612   38 7611  545  616  166 1591
  874   39 2710    1  168    1 1391  105  226    1  118   32  529   97
  536  140 2820  185   66  372    1 3197 1293   31  722   50   87  435
 2215   32  446 1014 5911   12 1147    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
Word Index Sample: {'<OOV>': 1, 'book': 2, 'one': 3, 'great': 4, 'like': 5, 'good': 6, 'time': 7, 'would': 8, 'get': 9, 'read': 10}


## Keras Tokenization + Padding - Word + Sentence

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

def multi_tokenizer_and_padding(df, tokenization_type='sentence', columns=None, sentence_column=None,
                          max_vocab_size=10000, max_seq_length=100,
                          padding_type='post', truncating_type='post', oov_token="<OOV>"):
    '''
    Fits a Keras Tokenizer on text from a DataFrame and returns fixed-length index sequences.

    Two modes select which text is tokenized:

    - 'word': the columns listed in `columns` are cast to str and joined with a space,
      giving one combined text per row.
    - 'sentence': the text in `sentence_column` is used as-is, one sequence per row.

    Parameters:
        df: Pandas DataFrame containing the dataset.
        tokenization_type: str, 'word' or 'sentence' (Default='sentence').
        columns: list of str, required when tokenization_type is 'word'.
        sentence_column: str, required when tokenization_type is 'sentence'.
        max_vocab_size: int, Default=10000. Passed to the Tokenizer as num_words.
        max_seq_length: int, Default=100. Length of every returned sequence.
            (Sentence-level input is usually short, so a lower value such as 50 may fit better.)
        padding_type: str, Default='post'. Where padding is added ('post' or 'pre').
        truncating_type: str, Default='post'. Where long sequences are cut ('post' or 'pre').
        oov_token: str, Default='<OOV>'. Token to use for out-of-vocabulary words.

    Returns:
        padded_sequences: NumPy array of padded sequences.
        tokenizer: The fitted Keras Tokenizer object.

    Raises:
        ValueError: if tokenization_type is not 'word' or 'sentence', or if the
            argument required by the chosen mode is None.
    '''
    # Reject unknown modes before doing any work
    if tokenization_type != 'word' and tokenization_type != 'sentence':
        raise ValueError("Invalid tokenization_type. Choose either 'word' or 'sentence'.")

    tokenizer = Tokenizer(num_words=max_vocab_size, oov_token=oov_token)

    # Step 1: pick the texts for the requested mode
    if tokenization_type == 'word':
        if columns is None:
            raise ValueError("For word-level tokenization, please provide a list of column names in 'columns'.")
        texts = df[columns].apply(lambda row: ' '.join(row.astype(str)), axis=1)
    else:
        if sentence_column is None:
            raise ValueError("For sentence-level tokenization, please provide the 'sentence_column' parameter.")
        texts = df[sentence_column]

    # Step 2: fit, convert and pad - identical for both modes
    tokenizer.fit_on_texts(texts)
    padded_sequences = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_seq_length,
        padding=padding_type,
        truncating=truncating_type,
    )

    return padded_sequences, tokenizer


### Testing the Advanced Tokenizer Template

In [None]:
# Applying sentence-level tokenization and padding to the "sentence" column
# (max_seq_length is small here because each row holds a single sentence).
# NOTE(review): this rebinds `padded_sequences` and `tokenizer` from the earlier
# word-level test cell - later cells see this sentence-level tokenizer if run in order.
padded_sequences, tokenizer = multi_tokenizer_and_padding(
    cleaned_amazon_sample,
    tokenization_type='sentence',
    sentence_column='sentence',
    max_vocab_size=1000,
    max_seq_length=10,
)

# View results
# Shape is (number of sentences, max_seq_length)
print("Padded Sequences Shape:", padded_sequences.shape)
print("First Padded Sequence:", padded_sequences[0])
# word_index maps each word to its integer id; only the first 10 entries are shown
print("Word Index Sample:", dict(list(tokenizer.word_index.items())[:10]))

Padded Sequences Shape: (46532, 10)
First Padded Sequence: [ 29 192 407 105  22 302 233   1   0   0]
Word Index Sample: {'<OOV>': 1, 'book': 2, 'one': 3, 'like': 4, 'good': 5, 'would': 6, 'great': 7, 'time': 8, 'get': 9, 'read': 10}


## GloVe Embedding Loader Template

In [None]:
def load_glove_embeddings(filepath, embedding_dim=50, verbose=False):
    '''
    Loads GloVe word embeddings from a text file into a dictionary.

    Each line of the file is expected to hold a word followed by its embedding values,
    separated by whitespace. Lines that do not fit that layout are skipped rather than
    aborting the load: blank lines, lines whose values are not all numeric (e.g. a token
    that itself contains whitespace), and lines whose vector length is not embedding_dim.

    Parameters:
        filepath: Type-str, Path to the GloVe file
        embedding_dim: Type-int, Default=50, Dimensionality of the embeddings (e.g., 50, 100, 200, 300)
        verbose: Default=False, Whether to print progress information during loading

    Returns:
        embeddings_dict: dict, A dictionary mapping words to their embedding vectors (lists of float)

    Note:
        If embedding_dim does not match the file, every line is skipped and an empty
        dictionary is returned - run with verbose=True to see the skipped words.
    '''
    embeddings_dict = {}

    # Open and read the GloVe file
    with open(filepath, 'r', encoding='utf-8') as f:
        if verbose:
            print(f"Loading GloVe embeddings from {filepath}...")
        for line in f:
            # Each line contains a word followed by its embedding values
            values = line.split()

            # A blank line (e.g. a trailing newline at the end of the file) has no word to read
            if not values:
                continue

            word = values[0]
            try:
                vector = [float(x) for x in values[1:]]
            except ValueError:
                # Happens when the token itself contains whitespace, so part of it
                # lands among the numeric fields
                if verbose:
                    print(f"Skipping word '{word}' due to non-numeric embedding values.")
                continue

            # Verify embedding dimensions match
            if len(vector) == embedding_dim:
                embeddings_dict[word] = vector
            elif verbose:
                print(f"Skipping word '{word}' due to mismatched embedding size.")

    if verbose:
        print(f"Loaded {len(embeddings_dict)} word embeddings.")

    return embeddings_dict


### Testing the GloVe Embeddings - Local file loading

In [None]:
# Path to GloVe embeddings (local path)
# Placeholder - replace with the real location of the file before running this cell.
filepath = "path/to/glove.6B.300d.txt"

# Load embeddings
# embedding_dim must match the file (300 for the 300d file): the loader skips any
# line whose vector has a different length.
glove_embeddings = load_glove_embeddings(filepath, embedding_dim=300)

# Example usage
# dict.get returns None for a word that is not in the embeddings instead of raising KeyError.
print(f"Vector for 'king': {glove_embeddings.get('king')}")


### Testing the GloVe Embeddings - Colab file loading

In [None]:
# Downloading GloVe in Colab
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# Load embeddings
glove_embeddings = load_glove_embeddings("glove.6B.50d.txt")

# Example usage
print(f"Vector for 'queen': {glove_embeddings.get('queen')}")


--2025-02-03 13:48:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-03 13:48:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-03 13:48:13--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

## Embedding Matrix Setup Template

### Word Embeddings

In [None]:
def word_embedding_matrix(glove_embeddings, tokenizer, embedding_dim=50, verbose=False):
    '''
    Builds a matrix whose row i holds the GloVe vector of the word with tokenizer index i.

    Parameters:
        glove_embeddings: Mapping from word to embedding vector (e.g. the result of load_glove_embeddings)
        tokenizer: Fitted tokenizer exposing a word_index dict (word -> integer index)
        embedding_dim: Type-int, Default=50, Length of each embedding vector (must match the GloVe vectors)
        verbose: Default=False, Whether to print progress and summary information

    Returns:
        embedding_matrix: NumPy array, Shape (len(word_index) + 1, embedding_dim).
            Rows for words without a GloVe vector stay all-zero, as does any row
            that no word maps to.
    '''
    word_to_index = tokenizer.word_index

    # One row per word plus one extra row, so an index equal to len(word_index) still fits
    vocab_size = len(word_to_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    if verbose:
        print(f"Building embedding matrix for {vocab_size - 1} words...")

    missing_words = 0
    for token, row_idx in word_to_index.items():
        # Defensive guard: only indices that fit inside the matrix are processed
        if row_idx < vocab_size:
            vector = glove_embeddings.get(token)
            if vector is None:
                # No GloVe vector for this word: leave its row as zeros and count it
                missing_words += 1
            else:
                embedding_matrix[row_idx] = vector

    if verbose:
        print(f"Embedding matrix created with shape: {embedding_matrix.shape}")
        print(f"Missing words: {missing_words}/{vocab_size - 1}")

    return embedding_matrix


### Sentence Embeddings

In [None]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

def get_sentence_embeddings(df, text_column, glove_embeddings, embedding_dim=50, aggregation='average', verbose=False):
    '''
    Aggregates word-level GloVe embeddings to create sentence-level embeddings.

    Each sentence is lower-cased and split with word_tokenize; the vectors of the tokens
    found in glove_embeddings are averaged or summed. Tokens without a vector are ignored,
    and a sentence with no known token gets an all-zero vector.

    Parameters:
        df: Pandas DataFrame containing sentence data.
        text_column: str, name of the column with sentences (values must be strings).
        glove_embeddings: dict, GloVe embeddings loaded using load_glove_embeddings.
        embedding_dim: int, dimensionality of the embeddings.
        aggregation: str, 'average' (default) or 'sum' to aggregate word vectors.
        verbose: bool, whether to print warnings for sentences with no known words.

    Returns:
        sentence_embeddings: NumPy array of shape (num_sentences, embedding_dim) containing
            the aggregated embeddings. An empty input gives shape (0, embedding_dim).

    Raises:
        ValueError: if aggregation is neither 'average' nor 'sum'.
    '''
    # Checked once up front: inside the loop a bad value would only be noticed when a
    # sentence with at least one known word is reached, and could slip through otherwise
    if aggregation not in ('average', 'sum'):
        raise ValueError("Invalid aggregation method. Use 'average' or 'sum'.")

    sentence_embeddings = []

    for idx, sentence in df[text_column].items():
        # Tokenize sentence into words
        tokens = word_tokenize(sentence.lower())

        # Keep only the vectors of tokens that have a GloVe entry
        vectors = [vec for vec in map(glove_embeddings.get, tokens) if vec is not None]

        if vectors:
            stacked = np.array(vectors)
            agg_vec = stacked.mean(axis=0) if aggregation == 'average' else stacked.sum(axis=0)
        else:
            # If no word in the sentence has a glove vector, use a zero vector.
            agg_vec = np.zeros(embedding_dim)
            if verbose:
                print(f"Warning: No glove embeddings found for sentence at index {idx}.")
        sentence_embeddings.append(agg_vec)

    # np.array([]) would have shape (0,); keep the documented 2-D shape for empty input
    if not sentence_embeddings:
        return np.zeros((0, embedding_dim))

    return np.array(sentence_embeddings)


### Testing the Word Embedding Matrix Template

In [None]:
# Uses whichever `glove_embeddings` and `tokenizer` were bound last in the kernel.
# embedding_dim is left at its default of 50, so glove_embeddings must hold 50-d vectors.
# The matrix gets one row per entry of tokenizer.word_index plus one extra row.
embedding_matrix = word_embedding_matrix(glove_embeddings, tokenizer, verbose=True) # Verbose set to true for testing

print("Embedding matrix shape:", embedding_matrix.shape)

Building embedding matrix for 39260 words...
Embedding matrix created with shape: (39261, 50)
Missing words: 14189/39260
Embedding matrix shape: (39261, 50)


### Testing the Sentence Embedding Matrix Template

In [None]:
# Sample DataFrame with sentences
import pandas as pd

# Three short example sentences, one per row, in a single 'sentence' column
data_sentences = {
    'sentence': [
        "This is the first sentence.",
        "Here is another sentence, with more content.",
        "Short sentence."
    ]
}
df_sentences = pd.DataFrame(data_sentences)

# embedding_dim and aggregation are left at their defaults (50, 'average'),
# so glove_embeddings must hold 50-d vectors here
sentence_embeddings = get_sentence_embeddings(df_sentences, 'sentence', glove_embeddings)
# Expected shape: (number of sentences, embedding_dim)
print("Sentence embeddings shape:", sentence_embeddings.shape)


Sentence embeddings shape: (3, 50)
