## Loading the Datasets for testing

In [1]:
from datasets import load_dataset

# Load IMDb (Sentiment Analysis)
imdb_dataset = load_dataset("imdb")

# Load AG News (News Classification)
ag_news_dataset = load_dataset("ag_news")

# Load Sentiment140 (Twitter Sentiment)
twitter_dataset = load_dataset("sentiment140", trust_remote_code=True)

# Print Dataset Samples
print("\nIMDb Example:", imdb_dataset["train"][0])
print("\nAG News Example:", ag_news_dataset["train"][0])
print("\nTwitter Example:", twitter_dataset["train"][0])



IMDb Example: {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are

## Cleaning the Data

In [2]:
import re
import spacy
import contractions
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from joblib import Parallel, delayed

# Load SpaCy English model (disable unnecessary components for speed)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")  # Enable sentence segmentation

# Define common informal words & slang
INFORMAL_WORDS = {
    "gonna": "going to", "wanna": "want to", "gotta": "got to",
    "shoulda": "should have", "coulda": "could have", "woulda": "would have",
    "lemme": "let me", "gimme": "give me", "outta": "out of",
    "dunno": "do not know", "kinda": "kind of", "sorta": "sort of",
    "ain't": "is not", "ya": "you", "tho": "though", "til": "until",
    "cuz": "because", "coz": "because", "idk": "I do not know",
    "tbh": "to be honest", "btw": "by the way", "u": "you", "ur": "your",
    "r": "are"
}

def expand_informal(text):
    """Replaces informal words using our predefined dictionary."""
    words = text.split()
    words = [INFORMAL_WORDS[word] if word in INFORMAL_WORDS else word for word in words]
    return " ".join(words)

# Stopword list
STOPWORDS = nlp.Defaults.stop_words

def clean_text(text, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False):
    """
    Cleans raw text by applying several preprocessing steps:
    - Lowercasing
    - Removing extra spaces
    - Informal word expansion
    - Punctuation removal (optional)
    - URL, email, HTML removal
    - Special character normalization
    - Stopword removal (optional)
    - Lemmatization (optional)
    - Sentence segmentation (optional)

    Parameters:
        text (str): Raw text input.
        remove_punctuation (bool): Whether to remove punctuation.
        remove_stopwords (bool): Whether to remove stopwords.
        lemmatize (bool): Whether to lemmatize tokens.
        sentence_level (bool): If True, cleans text at sentence level.

    Returns:
        str: Cleaned text.
    """
    if text is None or text.strip() == "":
        return ""

    # Lowercase text
    text = text.lower().strip()

    # Remove URLs, emails, and HTML tags
    text = re.sub(r"http[s]?://\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"<.*?>", "", text)

    # Expand contractions
    text = contractions.fix(text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Expand informal words
    text = expand_informal(text)

    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # Remove punctuation (if enabled)
    if remove_punctuation:
        text = re.sub(r"[^\w\s]", "", text)

    # Tokenization using spaCy (Batch Processing)
    doc = nlp(text)

    if sentence_level:
        # Split into sentences and process each one
        sentences = []
        for sent in doc.sents:
            tokens = [
                token.lemma_ if lemmatize else token.text
                for token in sent
                if not (remove_stopwords and token.text in STOPWORDS)
            ]
            sentences.append(" ".join(tokens))
        return " ".join(sentences)  # Join cleaned sentences back together
    else:
        # Process whole text as a single unit
        tokens = [
            token.lemma_ if lemmatize else token.text
            for token in doc
            if not (remove_stopwords and token.text in STOPWORDS)
        ]
        return " ".join(tokens)

def process_text_batch(texts, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False, max_jobs=-1):
    """
    Applies text cleaning in parallel to a batch of texts.
    """
    return Parallel(n_jobs=max_jobs)(
        delayed(clean_text)(text, remove_punctuation, remove_stopwords, lemmatize, sentence_level) for text in texts
    )

def apply_cleaning(dataset, columns, batch_size=1000, sentence_level=False, max_jobs=-1):
    """
    Applies text cleaning to either a Hugging Face dataset or a pandas DataFrame.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        columns (str or list): Column(s) to clean.
        batch_size (int): Batch size for processing.
        sentence_level (bool or dict): If True, applies sentence-level cleaning for all columns.
                                       If a dict, specify per-column settings (e.g., {"text": True, "summary": False}).
        max_jobs (int): Number of parallel jobs (-1 for max cores).

    Returns:
        Dataset, DatasetDict, or DataFrame with cleaned text.
    """
    if isinstance(columns, str):
        columns = [columns]  # Convert to list if a single column is given

    if isinstance(dataset, pd.DataFrame):
        for column in columns:
            if column not in dataset.columns:
                raise ValueError(f"Column '{column}' not found in DataFrame.")

        cleaned_texts = {col: [] for col in columns}
        
        for i in tqdm(range(0, len(dataset), batch_size), desc="Cleaning DataFrame"):
            batch = dataset.iloc[i:i+batch_size]
            for column in columns:
                col_sentence_level = sentence_level[column] if isinstance(sentence_level, dict) else sentence_level
                cleaned_batch = process_text_batch(batch[column].tolist(), sentence_level=col_sentence_level, max_jobs=max_jobs)
                cleaned_texts[column].extend(cleaned_batch)

        for column in columns:
            dataset[column] = cleaned_texts[column]
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        return dataset.map(
            lambda batch: {col: [clean_text(text, sentence_level=(sentence_level[col] if isinstance(sentence_level, dict) else sentence_level)) for text in batch[col]] for col in columns}, 
            batched=True
        )

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")
        

### Testing the cleaning function

In [3]:
# Load Sample Data
sample_imdb = imdb_dataset["train"].select(range(5))
sample_ag_news = ag_news_dataset["train"].select(range(5))
sample_twitter = twitter_dataset["train"].select(range(5))

# Apply Text Cleaning (Word-Level)
sample_imdb = apply_cleaning(sample_imdb, columns="text")
sample_ag_news = apply_cleaning(sample_ag_news, columns="text")
sample_twitter = apply_cleaning(sample_twitter, columns="text")

# Apply Text Cleaning to Multiple Columns (Sentence-Level Example)
sample_imdb_multi = apply_cleaning(sample_imdb, columns=["text"], sentence_level={"text": True})

# Print Cleaned Examples
print("\nIMDb Cleaned Example:", sample_imdb[0]["text"])
print("\nAG News Cleaned Example:", sample_ag_news[0]["text"])
print("\nTwitter Cleaned Example:", sample_twitter[0]["text"])
print("\nIMDb (Sentence-Level) Cleaned Example:", sample_imdb_multi[0]["text"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


IMDb Cleaned Example: rent curiousyellow video store controversy surround release 1967 hear seize yous custom try enter country fan film consider controversial myselfthe plot center young swedish drama student name lena want learn life particular want focus attention make sort documentary average swede think certain political issue vietnam war race issue united states ask politician ordinary denizen stockholm opinion politic sex drama teacher classmate marry menwhat kill curiousyellow 40 year ago consider pornographic sex nudity scene far shoot like cheaply porno countryman mind find shock reality sex nudity major staple swedish cinema ingmar bergman arguably answer good old boy john ford sex scene filmsi commend filmmaker fact sex show film show artistic purpose shock people money show pornographic theater america curiousyellow good film want study meat potato pun intend swedish cinema film plot

AG News Cleaned Example: wall st bears claw black reuters reuters   shortsellers wall st

## Tokenization

In [15]:
import re
import spacy
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

def basic_tokenize(text):
    """Basic whitespace-based tokenizer as a fallback."""
    return re.findall(r"\b\w+\b", text.lower())  # Simple word tokenization

def tokenize_text(dataset, column="text", model=None, max_length=128, sentence_level=False):
    """
    Tokenizes text using either a Hugging Face pre-trained model or a simple tokenizer.
    Now includes padding & truncation.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): Dataset to tokenize.
        column (str): The text column to process.
        model (str or None): Pre-trained tokenizer model (e.g., 'bert-base-uncased'). 
                             If None, uses a basic tokenizer.
        max_length (int): Maximum sequence length for padding/truncation (default: 128).
        sentence_level (bool): If True, returns sentence segmentation using spaCy.

    Returns:
        Dataset, DatasetDict, or DataFrame with tokenized output.
    """
    if model:
        tokenizer = AutoTokenizer.from_pretrained(model)  # Load HF tokenizer
    else:
        tokenizer = None  # Use basic tokenization instead

    if isinstance(dataset, pd.DataFrame):
        if column not in dataset.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame.")

        def process_text(text):
            if tokenizer:
                tokenized_output = tokenizer(
                    text,
                    padding="max_length",  # Pads all sequences to max_length
                    truncation=True,       # Cuts off longer sequences
                    max_length=max_length, # Defines max sequence length
                )
                return tokenized_output["input_ids"]  # Returns token IDs
            else:
                return basic_tokenize(text)  # Simple whitespace tokenization
        
        dataset[column + "_tokens"] = dataset[column].apply(process_text)
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        def process_batch(batch):
            """Handles batch processing to avoid TypeErrors."""
            texts = batch[column]
            if tokenizer:
                tokenized_outputs = tokenizer(
                    texts,
                    padding="max_length",
                    truncation=True,
                    max_length=max_length,
                    return_tensors=None  # Ensure we return Python lists
                )
                token_ids = tokenized_outputs["input_ids"]
                tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in token_ids]
            else:
                tokens = [basic_tokenize(text) for text in texts]
                token_ids = None  # Not needed for basic tokenization

            return {
                column + "_tokens": tokens,
                column + "_token_ids": token_ids if token_ids else [None] * len(tokens),
                column + "_sentences": [[sent.text for sent in nlp(text).sents] if sentence_level else None for text in texts]
            }

        return dataset.map(process_batch, batched=True)

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Testing the Tokenizer

In [16]:
# Apply tokenization with the updated function
sample_imdb_tokenized_model = tokenize_text(sample_imdb, column="text", model="bert-base-uncased")
sample_imdb_tokenized_simple = tokenize_text(sample_imdb, column="text", model=None)
sample_ag_news_tokenized = tokenize_text(sample_ag_news, column="text")
sample_twitter_tokenized = tokenize_text(sample_twitter, column="text", sentence_level=True)

# Print results
print("\nIMDb Tokenized Example BERT:", sample_imdb_tokenized_model[0]["text_tokens"])
print("\nIMDb Tokenized Example Simple:", sample_imdb_tokenized_simple[0]["text_tokens"])
print("\nAG News Tokenized Example:", sample_ag_news_tokenized[0]["text_tokens"])
print("\nTwitter (Sentence-Level) Example:", sample_twitter_tokenized[0]["text_sentences"])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


IMDb Tokenized Example BERT: ['[CLS]', 'rent', 'curious', '##ye', '##llo', '##w', 'video', 'store', 'controversy', 'surround', 'release', '1967', 'hear', 'seize', 'you', '##s', 'custom', 'try', 'enter', 'country', 'fan', 'film', 'consider', 'controversial', 'myself', '##the', 'plot', 'center', 'young', 'swedish', 'drama', 'student', 'name', 'lena', 'want', 'learn', 'life', 'particular', 'want', 'focus', 'attention', 'make', 'sort', 'documentary', 'average', 'sw', '##ede', 'think', 'certain', 'political', 'issue', 'vietnam', 'war', 'race', 'issue', 'united', 'states', 'ask', 'politician', 'ordinary', 'den', '##ize', '##n', 'stockholm', 'opinion', 'pol', '##itic', 'sex', 'drama', 'teacher', 'classmate', 'marry', 'men', '##w', '##hat', 'kill', 'curious', '##ye', '##llo', '##w', '40', 'year', 'ago', 'consider', 'pornographic', 'sex', 'nu', '##dity', 'scene', 'far', 'shoot', 'like', 'cheap', '##ly', 'porn', '##o', 'country', '##man', 'mind', 'find', 'shock', 'reality', 'sex', 'nu', '##dity

## Embeddings

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SentenceTransformer Model (small but powerful)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


### Word Embeddings

In [9]:
def encode_words(dataset, column="text", batch_size=32):
    """
    Encodes words individually into embeddings using SentenceTransformers.

    Parameters:
        dataset (Dataset, DatasetDict, or DataFrame): The dataset to process.
        column (str): The column containing text.
        batch_size (int): Number of sentences per batch.

    Returns:
        Dataset, DatasetDict, or DataFrame with word embeddings stored.
    """
    if isinstance(dataset, pd.DataFrame):
        word_embeddings_dict = {}

        def get_word_embeddings(text):
            words = text.split()  # Tokenize
            word_embeddings = embedding_model.encode(words, batch_size=batch_size, convert_to_numpy=True)
            for word, emb in zip(words, word_embeddings):
                word_embeddings_dict[word] = emb
            return word_embeddings
        
        # Apply to DataFrame
        dataset[column + "_word_embeddings"] = dataset[column].apply(get_word_embeddings)
        return dataset, word_embeddings_dict  # Return dataset + dictionary for word reuse
    
    elif isinstance(dataset, (Dataset, DatasetDict)):
        word_embeddings_dict = {}

        def process_batch(batch):
            word_embeddings_batch = []
            for text in batch[column]:
                words = text.split()
                word_embeddings = embedding_model.encode(words, batch_size=batch_size, convert_to_numpy=True)
                word_embeddings_batch.append(word_embeddings)
                for word, emb in zip(words, word_embeddings):
                    word_embeddings_dict[word] = emb
            return {column + "_word_embeddings": word_embeddings_batch}

        dataset = dataset.map(process_batch, batched=True)
        return dataset, word_embeddings_dict

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Sentence Embeddings

In [19]:
def encode_sentences(dataset, column="text", model=embedding_model, batch_size=32, num_proc=None):
    """
    Encodes sentences into embeddings using SentenceTransformers.
    Now supports parallel processing for Hugging Face datasets.

    Parameters:
        dataset (Dataset, DatasetDict, or DataFrame): The dataset to process.
        column (str): The column containing text.
        model: Pre-loaded SentenceTransformer model (default: embedding_model).
        batch_size (int): Number of sentences per batch.
        num_proc (int, optional): Number of processes to use for multiprocessing (Only for Hugging Face datasets).
                                  Set `num_proc=-1` to use all available CPU cores.

    Returns:
        Dataset, DatasetDict, or DataFrame with embeddings stored.
    """
    if isinstance(dataset, pd.DataFrame):
        # Convert column to list
        sentences = dataset[column].tolist()
        # Generate embeddings in batches using the provided model
        embeddings = model.encode(sentences, batch_size=batch_size, convert_to_numpy=True)
        # Store embeddings
        dataset[column + "_embeddings"] = list(embeddings)
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        # Parallel Processing enabled for Hugging Face Datasets
        num_proc = num_proc if num_proc else 1  # Default to 1 process if not set

        return dataset.map(
            lambda x: {column + "_embeddings": model.encode(x[column], batch_size=batch_size, convert_to_numpy=True)},
            batched=True,
            num_proc=num_proc  # This enables multiprocessing
        )

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Testing the Embeddings

In [None]:
# Apply sentence-level embeddings
sample_imdb_embed = encode_sentences(sample_imdb, column="text")
sample_ag_news_embed = encode_sentences(sample_ag_news, column="text")
sample_twitter_embed = encode_sentences(sample_twitter, column="text")

# Apply word-level embeddings
sample_imdb_word_embed, imdb_word_dict = encode_words(sample_imdb, column="text")
sample_ag_news_word_embed, ag_news_word_dict = encode_words(sample_ag_news, column="text")
sample_twitter_word_embed, twitter_word_dict = encode_words(sample_twitter, column="text")

# Example: Accessing Sentence-Level Embeddings
print("\nIMDb Sentence Embedding:", sample_imdb_embed["text_embeddings"][0])

# Example: Accessing Word-Level Embeddings
print("\nWord Embedding for 'movie' (if present):", imdb_word_dict.get("movie", "Not Found"))


## Embedding Matrix Setup 

In [11]:
import fasttext

# Load the FastText model using the absolute path
fasttext_model = fasttext.load_model("C:/Users/bhall/cc.en.300.bin")

'''# Test: Get word vector for a word
word_vector = fasttext_model.get_word_vector("hello")
print(word_vector[:10])  # Print first 10 values'''


[ 0.15757619  0.04378209 -0.00451272  0.06659314  0.07703468  0.00485855
  0.00819822  0.00652403  0.009259    0.0353899 ]


### Word Embedding Matrix

In [4]:
import numpy as np

def create_embedding_matrix(fasttext_model, word_index, embedding_dim=300):
    """
    Creates an embedding matrix using a pre-trained FastText model.

    Args:
        fasttext_model: Loaded FastText model (`fasttext_model = fasttext.load_model(...)`).
        word_index (dict): Tokenizer's word-to-index mapping.
        embedding_dim (int): Dimensionality of FastText embeddings (default: 300).

    Returns:
        np.ndarray: Embedding matrix of shape (vocab_size + 1, embedding_dim).
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, index in word_index.items():
        if index >= vocab_size:  # Ignore words exceeding vocab limit
            continue

        embedding_vector = fasttext_model.get_word_vector(word)
        embedding_matrix[index] = embedding_vector  # Assign to matrix

    return embedding_matrix
    

### Testing the word embeddings

In [5]:
# Example Usage
word_index = {"hello": 1, "world": 2, "example": 3}  # Replace with actual tokenizer.word_index
embedding_matrix = create_embedding_matrix(fasttext_model, word_index)

print("Shape of embedding matrix:", embedding_matrix.shape)
print("Example vector for 'hello':", embedding_matrix[1][:10])  # First 10 values

Shape of embedding matrix: (4, 300)
Example vector for 'hello': [ 0.15757619  0.04378209 -0.00451272  0.06659314  0.07703468  0.00485855
  0.00819822  0.00652403  0.009259    0.0353899 ]


### Sentence Embedding Matrix

In [17]:
def get_sentence_embeddings(sentences, model=embedding_model, batch_size=32):
    """
    Converts a list of sentences into sentence embeddings.

    Args:
        sentences (list): List of text sentences.
        model: Pre-loaded SentenceTransformer model (default: embedding_model).
        batch_size (int): Number of sentences per batch.

    Returns:
        np.ndarray: Sentence embeddings (num_sentences, embedding_dim).
    """
    return model.encode(sentences, batch_size=batch_size, convert_to_numpy=True)


### Testing the sentence embeddings

In [18]:
# Example Usage - Hugging Face
sentences = ["This is a test.", "Sentence embeddings are useful."]
sentence_embeddings = get_sentence_embeddings(sentences)

print("Sentence embeddings shape:", sentence_embeddings.shape)

# Example Useage - PD Dataset
sample_df = pd.DataFrame({"text": ["I love AI.", "Transformers are amazing.", "Sentence embeddings are powerful."]})
sample_df = encode_sentences(sample_df, column="text")

print("\nExample Sentence Embedding:", sample_df["text_embeddings"][0][:10])  # Print first 10 values of first embedding

Sentence embeddings shape: (2, 384)

Example Sentence Embedding: [-0.02230717 -0.06976169  0.03644417 -0.03894086  0.04212659 -0.03013989
  0.06765563  0.0234887   0.06278943  0.02705796]
