## Loading the Datasets for testing

In [1]:
from datasets import load_dataset

# Load IMDb (Sentiment Analysis)
imdb_dataset = load_dataset("imdb")

# Load AG News (News Classification)
ag_news_dataset = load_dataset("ag_news")

# Load Sentiment140 (Twitter Sentiment)
twitter_dataset = load_dataset("sentiment140", trust_remote_code=True)

# Print Dataset Samples
print("\nIMDb Example:", imdb_dataset["train"][0])
print("\nAG News Example:", ag_news_dataset["train"][0])
print("\nTwitter Example:", twitter_dataset["train"][0])



IMDb Example: {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are

## Cleaning the Data

In [2]:
import re
import spacy
import contractions
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from joblib import Parallel, delayed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# --- Cleaning Setup ---
nlp_clean = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp_clean.add_pipe("sentencizer")
STOPWORDS = nlp_clean.Defaults.stop_words

# Define common informal words & slang
INFORMAL_WORDS = {
    "gonna": "going to", "wanna": "want to", "gotta": "got to",
    "shoulda": "should have", "coulda": "could have", "woulda": "would have",
    "lemme": "let me", "gimme": "give me", "outta": "out of",
    "dunno": "do not know", "kinda": "kind of", "sorta": "sort of",
    "ain't": "is not", "ya": "you", "tho": "though", "til": "until",
    "cuz": "because", "coz": "because", "idk": "I do not know",
    "tbh": "to be honest", "btw": "by the way", "u": "you", "ur": "your",
    "r": "are"
}

def expand_informal(text):
    """Replaces informal words using our predefined dictionary."""
    words = text.split()
    words = [INFORMAL_WORDS[word] if word in INFORMAL_WORDS else word for word in words]
    return " ".join(words)

# Stopword list
STOPWORDS = nlp.Defaults.stop_words

def clean_text(text, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False, display=False):
    """
    Cleans raw text by applying several preprocessing steps:
    - Lowercasing and trimming
    - Removing URLs, emails, and HTML tags
    - Expanding contractions and informal words
    - Normalizing whitespace and removing non-ASCII characters
    - Optionally removing punctuation, stopwords, and lemmatizing
    - Optionally performing sentence-level cleaning via spaCy's sentencizer

    Parameters:
        text (str): Raw text input.
        remove_punctuation (bool): Whether to remove punctuation.
        remove_stopwords (bool): Whether to remove stopwords.
        lemmatize (bool): Whether to lemmatize tokens.
        sentence_level (bool): If True, clean text at sentence level.
        display (bool): If True, prints brief debug info.

    Returns:
        str: Cleaned text.
    """
    if text is None or text.strip() == "":
        if display:
            print("[DEBUG] Empty text provided; returning an empty string.")
        return ""
    
    original_text = text  # Store original for comparison

    # Lowercase and strip text
    text = text.lower().strip()

    # Remove URLs, emails, and HTML tags
    text = re.sub(r"http[s]?://\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"<.*?>", "", text)

    # Expand contractions
    text = contractions.fix(text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Expand informal words
    text = expand_informal(text)

    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # Remove punctuation if enabled
    if remove_punctuation:
        text = re.sub(r"[^\w\s]", "", text)

    # Tokenize using spaCy
    doc = nlp_clean(text)

    if sentence_level:
        # Process at sentence-level
        sentences = []
        for sent in doc.sents:
            tokens = [
                token.lemma_ if lemmatize else token.text
                for token in sent
                if not (remove_stopwords and token.text in STOPWORDS)
            ]
            sentences.append(" ".join(tokens))
        processed_text = " ".join(sentences)
    else:
        # Process whole text as a single unit
        tokens = [
            token.lemma_ if lemmatize else token.text
            for token in doc
            if not (remove_stopwords and token.text in STOPWORDS)
        ]
        processed_text = " ".join(tokens)
    
    # Debug output
    if display:
        print(f"[INFO] Original text (first 50 chars): {original_text[:50]}")
        print(f"[INFO] Cleaned text (first 50 chars): {processed_text[:50]}")
        print(f"[INFO] Original length: {len(original_text)}; Cleaned length: {len(processed_text)}")
        if not processed_text:
            print("[WARNING] Cleaning resulted in an empty string.")

    return processed_text

def process_text_batch(texts, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False, max_jobs=-1, display=False):
    """
    Applies text cleaning in parallel to a batch of texts.
    
    Parameters:
        texts (list): List of raw text strings.
        remove_punctuation, remove_stopwords, lemmatize, sentence_level: Passed to clean_text.
        max_jobs (int): Number of parallel jobs (-1 for all cores).
        display (bool): If True, passes display flag to clean_text for debug info.

    Returns:
        list: List of cleaned texts.
    """
    return Parallel(n_jobs=max_jobs)(
        delayed(clean_text)(text, remove_punctuation, remove_stopwords, lemmatize, sentence_level, display=display)
        for text in texts
    )

def apply_cleaning(dataset, columns, batch_size=1000, sentence_level=False, max_jobs=1, display=False):
    """
    Applies text cleaning to either a Hugging Face dataset or a pandas DataFrame.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        columns (str or list): Column(s) to clean.
        batch_size (int): Batch size for processing.
        sentence_level (bool or dict): If True, applies sentence-level cleaning for all columns.
                                       If a dict, specify per-column settings (e.g., {"text": True, "summary": False}).
        max_jobs (int): Number of parallel jobs (-1 for max cores).
        display (bool): If True, prints out info during processing.

    Returns:
        Dataset, DatasetDict, or DataFrame with cleaned text.
    """
    if isinstance(columns, str):
        columns = [columns]  # Convert to list if a single column is given

    if isinstance(dataset, pd.DataFrame):
        for column in columns:
            if column not in dataset.columns:
                raise ValueError(f"Column '{column}' not found in DataFrame.")

        total_rows = len(dataset)
        if display:
            print(f"[INFO] Starting cleaning on DataFrame with {total_rows} rows and columns: {columns}")

        # Only show per-batch info if the dataset is small (e.g., fewer than 1000 rows)
        show_batch_info = display and total_rows < 1000

        cleaned_texts = {col: [] for col in columns}
        for i in tqdm(range(0, total_rows, batch_size), desc="Cleaning DataFrame"):
            batch = dataset.iloc[i:i+batch_size]
            for column in columns:
                # Determine the sentence level setting for the current column
                col_sentence_level = sentence_level[column] if isinstance(sentence_level, dict) else sentence_level
                if show_batch_info:
                    print(f"[INFO] Cleaning batch rows {i} to {i+batch_size} in column '{column}' (sentence_level={col_sentence_level})")
                cleaned_batch = process_text_batch(
                    batch[column].tolist(),
                    sentence_level=col_sentence_level,
                    max_jobs=max_jobs,
                    display=display  # passes through to clean_text for internal debug info
                )
                cleaned_texts[column].extend(cleaned_batch)

        for column in columns:
            dataset[column] = cleaned_texts[column]
        if display:
            print(f"[INFO] DataFrame cleaning complete. Processed {total_rows} rows.")
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        if display:
            print(f"[INFO] Starting cleaning on Hugging Face dataset for columns: {columns}")
        cleaned_dataset = dataset.map(
            lambda batch: {
                col: [
                    clean_text(
                        text,
                        sentence_level=(sentence_level[col] if isinstance(sentence_level, dict) else sentence_level),
                        display=display
                    )
                    for text in batch[col]
                ]
                for col in columns
            },
            batched=True
        )
        if display:
            print("[INFO] Hugging Face dataset cleaning complete.")
        return cleaned_dataset

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Testing the cleaning function

In [3]:
# Load Sample Data
sample_imdb = imdb_dataset["train"].select(range(5))
sample_ag_news = ag_news_dataset["train"].select(range(5))
sample_twitter = twitter_dataset["train"].select(range(5))

# Apply Text Cleaning (Word-Level)
sample_imdb = apply_cleaning(sample_imdb, columns="text")
sample_ag_news = apply_cleaning(sample_ag_news, columns="text")
sample_twitter = apply_cleaning(sample_twitter, columns="text")

# Apply Text Cleaning to Multiple Columns (Sentence-Level Example)
# sample_imdb_multi = apply_cleaning(sample_imdb, columns=["text"], sentence_level={"text": True})

# Print Cleaned Examples
print("\nIMDb Cleaned Example:", sample_imdb[0]["text"])
print("\nAG News Cleaned Example:", sample_ag_news[0]["text"])
print("\nTwitter Cleaned Example:", sample_twitter[0]["text"])
# print("\nIMDb (Sentence-Level) Cleaned Example:", sample_imdb_multi[0]["text"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


IMDb Cleaned Example: rent curiousyellow video store controversy surround release 1967 hear seize yous custom try enter country fan film consider controversial myselfthe plot center young swedish drama student name lena want learn life particular want focus attention make sort documentary average swede think certain political issue vietnam war race issue united states ask politician ordinary denizen stockholm opinion politic sex drama teacher classmate marry menwhat kill curiousyellow 40 year ago consider pornographic sex nudity scene far shoot like cheaply porno countryman mind find shock reality sex nudity major staple swedish cinema ingmar bergman arguably answer good old boy john ford sex scene filmsi commend filmmaker fact sex show film show artistic purpose shock people money show pornographic theater america curiousyellow good film want study meat potato pun intend swedish cinema film plot

AG News Cleaned Example: wall st bears claw black reuters reuters   shortsellers wall st

## Tokenization

In [15]:
import re
import spacy
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SpaCy model
nlp_token = spacy.load("en_core_web_sm")

def basic_tokenize(text):
    """Basic whitespace-based tokenizer as a fallback."""
    return re.findall(r"\b\w+\b", text.lower())  # Simple word tokenization

def tokenize_text(dataset, column="text", model=None, max_length=128, sentence_level=False, display=False):
    """
    Tokenizes text using either a Hugging Face pre-trained model or a simple tokenizer.
    Now includes padding & truncation.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): Dataset to tokenize.
        column (str): The text column to process.
        model (str or None): Pre-trained tokenizer model (e.g., 'bert-base-uncased'). 
                             If None, uses a basic tokenizer.
        max_length (int): Maximum sequence length for padding/truncation (default: 128).
        sentence_level (bool): If True, returns sentence segmentation using spaCy.
        display (bool): If True, prints a final status message after tokenization.

    Returns:
        Dataset, DatasetDict, or DataFrame with tokenized output.
    """
    if model:
        tokenizer = AutoTokenizer.from_pretrained(model)  # Load Hugging Face tokenizer
    else:
        tokenizer = None  # Use basic tokenization instead

    if isinstance(dataset, pd.DataFrame):
        if column not in dataset.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame.")

        def process_text(text):
            if tokenizer:
                # Tokenize using HF tokenizer and extract input_ids
                tokenized_output = tokenizer(
                    text,
                    padding="max_length",  # Pads to max_length
                    truncation=True,       # Truncates long sequences
                    max_length=max_length, 
                )
                # Basic check: ensure token IDs are returned
                token_ids = tokenized_output.get("input_ids", [])
                if not token_ids:
                    print(f"[WARNING] Tokenization produced empty token ids for text: {text[:30]}...")
                return token_ids
            else:
                return basic_tokenize(text)  # Fallback basic tokenization

        dataset[column + "_token_ids"] = dataset[column].apply(process_text)
        if display:
            print(f"[INFO] Tokenization completed on DataFrame: {len(dataset)} rows processed.")
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        def process_batch(batch):
            """Handles batch processing to avoid type errors."""
            texts = batch[column]
            if tokenizer:
                tokenized_outputs = tokenizer(
                    texts,
                    padding="max_length",
                    truncation=True,
                    max_length=max_length,
                    return_tensors=None  # Return Python lists
                )
                token_ids = tokenized_outputs.get("input_ids", [])
                tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in token_ids]
            else:
                tokens = [basic_tokenize(text) for text in texts]
                token_ids = None  # Not used for basic tokenization

            # Process sentence segmentation if requested
            sentences = []
            if sentence_level:
                for text in texts:
                    sents = [sent.text for sent in nlp(text).sents]
                    sentences.append(sents)
            else:
                sentences = [None] * len(texts)

            return {
                column + "_tokens": tokens,
                column + "_token_ids": token_ids if token_ids else [None] * len(tokens),
                column + "_sentences": sentences
            }

        tokenized_dataset = dataset.map(process_batch, batched=True)
        if display:
            print(f"[INFO] Tokenization completed on Hugging Face dataset for column '{column}'.")
        return tokenized_dataset

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Testing the Tokenizer

In [16]:
# Apply tokenization with the updated function
sample_imdb_tokenized_model = tokenize_text(sample_imdb, column="text", model="bert-base-uncased")
sample_imdb_tokenized_simple = tokenize_text(sample_imdb, column="text", model=None)
sample_ag_news_tokenized = tokenize_text(sample_ag_news, column="text")
sample_twitter_tokenized = tokenize_text(sample_twitter, column="text", sentence_level=True)

# Print results
print("\nIMDb Tokenized Example BERT:", sample_imdb_tokenized_model[0]["text_tokens"])
print("\nIMDb Tokenized Example Simple:", sample_imdb_tokenized_simple[0]["text_tokens"])
print("\nAG News Tokenized Example:", sample_ag_news_tokenized[0]["text_tokens"])
print("\nTwitter (Sentence-Level) Example:", sample_twitter_tokenized[0]["text_sentences"])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


IMDb Tokenized Example BERT: ['[CLS]', 'rent', 'curious', '##ye', '##llo', '##w', 'video', 'store', 'controversy', 'surround', 'release', '1967', 'hear', 'seize', 'you', '##s', 'custom', 'try', 'enter', 'country', 'fan', 'film', 'consider', 'controversial', 'myself', '##the', 'plot', 'center', 'young', 'swedish', 'drama', 'student', 'name', 'lena', 'want', 'learn', 'life', 'particular', 'want', 'focus', 'attention', 'make', 'sort', 'documentary', 'average', 'sw', '##ede', 'think', 'certain', 'political', 'issue', 'vietnam', 'war', 'race', 'issue', 'united', 'states', 'ask', 'politician', 'ordinary', 'den', '##ize', '##n', 'stockholm', 'opinion', 'pol', '##itic', 'sex', 'drama', 'teacher', 'classmate', 'marry', 'men', '##w', '##hat', 'kill', 'curious', '##ye', '##llo', '##w', '40', 'year', 'ago', 'consider', 'pornographic', 'sex', 'nu', '##dity', 'scene', 'far', 'shoot', 'like', 'cheap', '##ly', 'porn', '##o', 'country', '##man', 'mind', 'find', 'shock', 'reality', 'sex', 'nu', '##dity

## Embeddings

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SentenceTransformer Model (small but powerful)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load spaCy model with vectors (make sure you have en_core_web_md installed)
nlp_vec = spacy.load("en_core_web_md")


### Word Embeddings

In [9]:
def encode_words(dataset, column="text", batch_size=32, display=False):
    """
    Encodes words individually into embeddings using spaCy's word vectors.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        column (str): The column containing text.
        batch_size (int): Placeholder for compatibility (unused here).
        display (bool): If True, prints a final summary message after encoding.

    Returns:
        (processed_dataset, word_embeddings_dict)
            processed_dataset: The dataset with an added column (e.g. "text_word_embeddings") containing lists of embeddings.
            word_embeddings_dict: A dictionary mapping each encountered word to its embedding.
    """
    word_embeddings_dict = {}

    def get_word_embeddings(text):
        doc = nlp_vec(text)
        embeddings = []
        for token in doc:
            word = token.text
            emb = token.vector  # Get spaCy's vector for this token
            embeddings.append(emb)
            word_embeddings_dict[word] = emb  # Update dictionary (duplicates get overwritten with identical vectors)
        return embeddings

    if isinstance(dataset, pd.DataFrame):
        # Apply function to every row in the DataFrame column
        dataset[column + "_word_embeddings"] = dataset[column].apply(get_word_embeddings)
        if display:
            print(f"[INFO] DataFrame processing complete. Processed {len(dataset)} rows.")
            print(f"[INFO] Vocabulary size (unique words embedded): {len(word_embeddings_dict)}")
        return dataset, word_embeddings_dict

    elif isinstance(dataset, (Dataset, DatasetDict)):
        def process_batch(batch):
            embeddings_batch = []
            for text in batch[column]:
                word_embeds = get_word_embeddings(text)
                embeddings_batch.append(word_embeds)
            return {column + "_word_embeddings": embeddings_batch}

        processed_dataset = dataset.map(process_batch, batched=True)
        if display:
            total = processed_dataset.num_rows if hasattr(processed_dataset, "num_rows") else "unknown"
            print(f"[INFO] Hugging Face dataset processing complete. Processed {total} rows.")
            print(f"[INFO] Vocabulary size (unique words embedded): {len(word_embeddings_dict)}")
        return processed_dataset, word_embeddings_dict

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")
        

### Sentence Embeddings

In [19]:
def encode_sentences(dataset, column="text", model=embedding_model, batch_size=32, num_proc=None):
    """
    Encodes sentences into embeddings using SentenceTransformers.
    Now supports parallel processing for Hugging Face datasets.

    Parameters:
        dataset (Dataset, DatasetDict, or DataFrame): The dataset to process.
        column (str): The column containing text.
        model: Pre-loaded SentenceTransformer model (default: embedding_model).
        batch_size (int): Number of sentences per batch.
        num_proc (int, optional): Number of processes to use for multiprocessing (Only for Hugging Face datasets).
                                  Set `num_proc=-1` to use all available CPU cores.

    Returns:
        Dataset, DatasetDict, or DataFrame with embeddings stored.
    """
    if isinstance(dataset, pd.DataFrame):
        # Convert column to list
        sentences = dataset[column].tolist()
        # Generate embeddings in batches using the provided model
        embeddings = model.encode(sentences, batch_size=batch_size, convert_to_numpy=True)
        # Store embeddings
        dataset[column + "_embeddings"] = list(embeddings)
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        # Parallel Processing enabled for Hugging Face Datasets
        num_proc = num_proc if num_proc else 1  # Default to 1 process if not set

        return dataset.map(
            lambda x: {column + "_embeddings": model.encode(x[column], batch_size=batch_size, convert_to_numpy=True)},
            batched=True,
            num_proc=num_proc  # This enables multiprocessing
        )

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")


### Testing the Embeddings

In [None]:
# Apply sentence-level embeddings
sample_imdb_embed = encode_sentences(sample_imdb, column="text")
sample_ag_news_embed = encode_sentences(sample_ag_news, column="text")
sample_twitter_embed = encode_sentences(sample_twitter, column="text")

# Apply word-level embeddings
sample_imdb_word_embed, imdb_word_dict = encode_words(sample_imdb, column="text")
sample_ag_news_word_embed, ag_news_word_dict = encode_words(sample_ag_news, column="text")
sample_twitter_word_embed, twitter_word_dict = encode_words(sample_twitter, column="text")

# Example: Accessing Sentence-Level Embeddings
print("\nIMDb Sentence Embedding:", sample_imdb_embed["text_embeddings"][0])

# Example: Accessing Word-Level Embeddings
print("\nWord Embedding for 'movie' (if present):", imdb_word_dict.get("movie", "Not Found"))


## Embedding Matrix Setup 

In [11]:
import fasttext

# Load the FastText model using the absolute path
fasttext_model = fasttext.load_model("C:/Users/bhall/cc.en.300.bin")

'''# Test: Get word vector for a word
word_vector = fasttext_model.get_word_vector("hello")
print(word_vector[:10])  # Print first 10 values'''


[ 0.15757619  0.04378209 -0.00451272  0.06659314  0.07703468  0.00485855
  0.00819822  0.00652403  0.009259    0.0353899 ]


### Word Embedding Matrix

In [4]:
import numpy as np

def create_embedding_matrix(fasttext_model, word_index, embedding_dim=300, display=False):
    """
    Creates an embedding matrix using a pre-trained FastText model.

    Args:
        fasttext_model: Loaded FastText model.
        word_index (dict): Tokenizer's word-to-index mapping.
        embedding_dim (int): Dimensionality of FastText embeddings (default: 300).
        display (bool): If True, prints a final summary message.

    Returns:
        np.ndarray: Embedding matrix of shape (vocab_size, embedding_dim).
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, index in word_index.items():
        if index >= vocab_size:  # Ignore words exceeding vocab limit
            continue
        embedding_vector = fasttext_model.get_word_vector(word)
        embedding_matrix[index] = embedding_vector  # Assign to matrix

    if display:
        print(f"[INFO] Created embedding matrix with shape: {embedding_matrix.shape}")

    return embedding_matrix
    

### Testing the word embeddings

In [5]:
# Example Usage
word_index = {"hello": 1, "world": 2, "example": 3}  # Replace with actual tokenizer.word_index
embedding_matrix = create_embedding_matrix(fasttext_model, word_index)

print("Shape of embedding matrix:", embedding_matrix.shape)
print("Example vector for 'hello':", embedding_matrix[1][:10])  # First 10 values

Shape of embedding matrix: (4, 300)
Example vector for 'hello': [ 0.15757619  0.04378209 -0.00451272  0.06659314  0.07703468  0.00485855
  0.00819822  0.00652403  0.009259    0.0353899 ]


### Sentence Embedding Matrix

In [17]:
def get_sentence_embeddings(sentences, model=embedding_model, batch_size=32):
    """
    Converts a list of sentences into sentence embeddings.

    Args:
        sentences (list): List of text sentences.
        model: Pre-loaded SentenceTransformer model (default: embedding_model).
        batch_size (int): Number of sentences per batch.

    Returns:
        np.ndarray: Sentence embeddings (num_sentences, embedding_dim).
    """
    return model.encode(sentences, batch_size=batch_size, convert_to_numpy=True)


### Testing the sentence embeddings

In [18]:
# Example Usage - Hugging Face
sentences = ["This is a test.", "Sentence embeddings are useful."]
sentence_embeddings = get_sentence_embeddings(sentences)

print("Sentence embeddings shape:", sentence_embeddings.shape)

# Example Useage - PD Dataset
sample_df = pd.DataFrame({"text": ["I love AI.", "Transformers are amazing.", "Sentence embeddings are powerful."]})
sample_df = encode_sentences(sample_df, column="text")

print("\nExample Sentence Embedding:", sample_df["text_embeddings"][0][:10])  # Print first 10 values of first embedding

Sentence embeddings shape: (2, 384)

Example Sentence Embedding: [-0.02230717 -0.06976169  0.03644417 -0.03894086  0.04212659 -0.03013989
  0.06765563  0.0234887   0.06278943  0.02705796]


## Converting data to tensors and prepping for model

In [None]:
import torch
import numpy as np
import pandas as pd

def convert_to_tensors(dataset, columns, labels_column=None, display=False):
    """
    Converts specified dataset columns to PyTorch tensors.

    Args:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        columns (list): List of columns to convert to tensors.
        labels_column (str, optional): Column containing labels (for supervised tasks).
        display (bool, optional): If True, prints out summary info after conversion.

    Returns:
        dict: Dictionary containing tensors for each specified column.
    """
    tensor_dict = {}

    for col in columns:
        data = dataset[col]

        # If data is a pandas Series, force each element to be a list
        if isinstance(data, pd.Series):
            data = data.apply(lambda x: list(x) if not isinstance(x, list) else x)
            data = list(data)  # Convert Series into a list

        if isinstance(data, (list, np.ndarray)):
            if col.endswith("_token_ids"):
                data = torch.tensor(data, dtype=torch.long)
            else:
                data = torch.tensor(data, dtype=torch.float32)
        else:
            raise ValueError(f"Unsupported data type for column: {col}")

        tensor_dict[col] = data

    if labels_column:
        labels = torch.tensor(dataset[labels_column].values, dtype=torch.long)
        tensor_dict["labels"] = labels

    if display:
        print("[INFO] Conversion to tensors complete. Summary:")
        for key, tensor in tensor_dict.items():
            print(f"  {key}: shape = {tensor.shape}")

    return tensor_dict


### Dataset/DataLoader Creation

In [1]:
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, tensor_dict):
        """
        Custom PyTorch Dataset for text-based data.

        Args:
            tensor_dict (dict): Dictionary containing tensors (including optional labels).
        """
        self.data = tensor_dict
        self.keys = list(tensor_dict.keys())

    def __len__(self):
        # Assumes all tensor entries have the same length
        return len(self.data[self.keys[0]])

    def __getitem__(self, idx):
        sample = {key: self.data[key][idx] for key in self.keys}
        return sample


In [2]:
from torch.utils.data import DataLoader

def create_dataloader(tensor_dict, batch_size=32, shuffle=True, display=False):
    """
    Creates a DataLoader for batch processing.

    Args:
        tensor_dict (dict): Dictionary of tensors (from convert_to_tensors).
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to shuffle data.
        display (bool, optional): If True, prints out summary info after creation.

    Returns:
        DataLoader: PyTorch DataLoader object.
    """
    dataset = CustomTextDataset(tensor_dict)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    
    if display:
        num_samples = len(dataset)
        num_batches = len(dataloader)
        print(f"[INFO] DataLoader created: {num_samples} samples, batch size = {batch_size}, total batches = {num_batches}")

    return dataloader
    

In [None]:
# Example usage: Assume `processed_dataset` is our cleaned + tokenized dataset
columns_to_convert = ["text_token_ids", "text_embeddings"]  # Example columns
tensor_dict = convert_to_tensors(processed_dataset, columns_to_convert, labels_column="label")

# Create DataLoader
dataloader = create_dataloader(tensor_dict, batch_size=32)

# Example: Iterate through DataLoader
for batch in dataloader:
    print(batch["text_token_ids"].shape, batch["text_embeddings"].shape, batch["labels"].shape)
    break  # Print one batch for checking


## Copy Paste Template - Contains all the templates (word embedding version)

In [1]:
import re
import spacy
import contractions
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from joblib import Parallel, delayed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# --- Cleaning Setup ---
nlp_clean = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp_clean.add_pipe("sentencizer")
STOPWORDS = nlp_clean.Defaults.stop_words

# Define common informal words & slang
INFORMAL_WORDS = {
    "gonna": "going to", "wanna": "want to", "gotta": "got to",
    "shoulda": "should have", "coulda": "could have", "woulda": "would have",
    "lemme": "let me", "gimme": "give me", "outta": "out of",
    "dunno": "do not know", "kinda": "kind of", "sorta": "sort of",
    "ain't": "is not", "ya": "you", "tho": "though", "til": "until",
    "cuz": "because", "coz": "because", "idk": "I do not know",
    "tbh": "to be honest", "btw": "by the way", "u": "you", "ur": "your",
    "r": "are"
}

def expand_informal(text):
    """Replaces informal words using our predefined dictionary."""
    words = text.split()
    words = [INFORMAL_WORDS[word] if word in INFORMAL_WORDS else word for word in words]
    return " ".join(words)
    

def clean_text(text, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False, display=False):
    """
    Cleans raw text by applying several preprocessing steps:
    - Lowercasing and trimming
    - Removing URLs, emails, and HTML tags
    - Expanding contractions and informal words
    - Normalizing whitespace and removing non-ASCII characters
    - Optionally removing punctuation, stopwords, and lemmatizing
    - Optionally performing sentence-level cleaning via spaCy's sentencizer

    Parameters:
        text (str): Raw text input.
        remove_punctuation (bool): Whether to remove punctuation.
        remove_stopwords (bool): Whether to remove stopwords.
        lemmatize (bool): Whether to lemmatize tokens.
        sentence_level (bool): If True, clean text at sentence level.
        display (bool): If True, prints brief debug info.

    Returns:
        str: Cleaned text.
    """
    if text is None or text.strip() == "":
        if display:
            print("[DEBUG] Empty text provided; returning an empty string.")
        return ""
    
    original_text = text  # Store original for comparison

    # Lowercase and strip text
    text = text.lower().strip()

    # Remove URLs, emails, and HTML tags
    text = re.sub(r"http[s]?://\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"<.*?>", "", text)

    # Expand contractions
    text = contractions.fix(text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Expand informal words
    text = expand_informal(text)

    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # Remove punctuation if enabled
    if remove_punctuation:
        text = re.sub(r"[^\w\s]", "", text)

    # Tokenize using spaCy
    doc = nlp_clean(text)

    if sentence_level:
        # Process at sentence-level
        sentences = []
        for sent in doc.sents:
            tokens = [
                token.lemma_ if lemmatize else token.text
                for token in sent
                if not (remove_stopwords and token.text in STOPWORDS)
            ]
            sentences.append(" ".join(tokens))
        processed_text = " ".join(sentences)
    else:
        # Process whole text as a single unit
        tokens = [
            token.lemma_ if lemmatize else token.text
            for token in doc
            if not (remove_stopwords and token.text in STOPWORDS)
        ]
        processed_text = " ".join(tokens)
    
    # Debug output
    if display:
        print(f"[INFO] Original text (first 50 chars): {original_text[:50]}")
        print(f"[INFO] Cleaned text (first 50 chars): {processed_text[:50]}")
        print(f"[INFO] Original length: {len(original_text)}; Cleaned length: {len(processed_text)}")
        if not processed_text:
            print("[WARNING] Cleaning resulted in an empty string.")

    return processed_text

def process_text_batch(texts, remove_punctuation=True, remove_stopwords=True, lemmatize=True, sentence_level=False, max_jobs=-1, display=False):
    """
    Applies text cleaning in parallel to a batch of texts.
    
    Parameters:
        texts (list): List of raw text strings.
        remove_punctuation, remove_stopwords, lemmatize, sentence_level: Passed to clean_text.
        max_jobs (int): Number of parallel jobs (-1 for all cores).
        display (bool): If True, passes display flag to clean_text for debug info.

    Returns:
        list: List of cleaned texts.
    """
    return Parallel(n_jobs=max_jobs)(
        delayed(clean_text)(text, remove_punctuation, remove_stopwords, lemmatize, sentence_level, display=display)
        for text in texts
    )

def apply_cleaning(dataset, columns, batch_size=1000, sentence_level=False, max_jobs=1, display=False):
    """
    Applies text cleaning to either a Hugging Face dataset or a pandas DataFrame.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        columns (str or list): Column(s) to clean.
        batch_size (int): Batch size for processing.
        sentence_level (bool or dict): If True, applies sentence-level cleaning for all columns.
                                       If a dict, specify per-column settings (e.g., {"text": True, "summary": False}).
        max_jobs (int): Number of parallel jobs (-1 for max cores).
        display (bool): If True, prints out info during processing.

    Returns:
        Dataset, DatasetDict, or DataFrame with cleaned text.
    """
    if isinstance(columns, str):
        columns = [columns]  # Convert to list if a single column is given

    if isinstance(dataset, pd.DataFrame):
        for column in columns:
            if column not in dataset.columns:
                raise ValueError(f"Column '{column}' not found in DataFrame.")

        total_rows = len(dataset)
        if display:
            print(f"[INFO] Starting cleaning on DataFrame with {total_rows} rows and columns: {columns}")

        # Only show per-batch info if the dataset is small (e.g., fewer than 1000 rows)
        show_batch_info = display and total_rows < 1000

        cleaned_texts = {col: [] for col in columns}
        for i in tqdm(range(0, total_rows, batch_size), desc="Cleaning DataFrame"):
            batch = dataset.iloc[i:i+batch_size]
            for column in columns:
                # Determine the sentence level setting for the current column
                col_sentence_level = sentence_level[column] if isinstance(sentence_level, dict) else sentence_level
                if show_batch_info:
                    print(f"[INFO] Cleaning batch rows {i} to {i+batch_size} in column '{column}' (sentence_level={col_sentence_level})")
                cleaned_batch = process_text_batch(
                    batch[column].tolist(),
                    sentence_level=col_sentence_level,
                    max_jobs=max_jobs,
                    display=display  # passes through to clean_text for internal debug info
                )
                cleaned_texts[column].extend(cleaned_batch)

        for column in columns:
            dataset[column] = cleaned_texts[column]
        if display:
            print(f"[INFO] DataFrame cleaning complete. Processed {total_rows} rows.")
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        if display:
            print(f"[INFO] Starting cleaning on Hugging Face dataset for columns: {columns}")
        cleaned_dataset = dataset.map(
            lambda batch: {
                col: [
                    clean_text(
                        text,
                        sentence_level=(sentence_level[col] if isinstance(sentence_level, dict) else sentence_level),
                        display=display
                    )
                    for text in batch[col]
                ]
                for col in columns
            },
            batched=True
        )
        if display:
            print("[INFO] Hugging Face dataset cleaning complete.")
        return cleaned_dataset

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")



import re
import spacy
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SpaCy model
nlp_token = spacy.load("en_core_web_sm")

def basic_tokenize(text):
    """Basic whitespace-based tokenizer as a fallback."""
    return re.findall(r"\b\w+\b", text.lower())  # Simple word tokenization

def tokenize_text(dataset, column="text", model=None, max_length=128, sentence_level=False, display=False):
    """
    Tokenizes text using either a Hugging Face pre-trained model or a simple tokenizer.
    Now includes padding & truncation.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): Dataset to tokenize.
        column (str): The text column to process.
        model (str or None): Pre-trained tokenizer model (e.g., 'bert-base-uncased'). 
                             If None, uses a basic tokenizer.
        max_length (int): Maximum sequence length for padding/truncation (default: 128).
        sentence_level (bool): If True, returns sentence segmentation using spaCy.
        display (bool): If True, prints a final status message after tokenization.

    Returns:
        Dataset, DatasetDict, or DataFrame with tokenized output.
    """
    if model:
        tokenizer = AutoTokenizer.from_pretrained(model)  # Load Hugging Face tokenizer
    else:
        tokenizer = None  # Use basic tokenization instead

    if isinstance(dataset, pd.DataFrame):
        if column not in dataset.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame.")

        def process_text(text):
            if tokenizer:
                # Tokenize using HF tokenizer and extract input_ids
                tokenized_output = tokenizer(
                    text,
                    padding="max_length",  # Pads to max_length
                    truncation=True,       # Truncates long sequences
                    max_length=max_length, 
                )
                # Basic check: ensure token IDs are returned
                token_ids = tokenized_output.get("input_ids", [])
                if not token_ids:
                    print(f"[WARNING] Tokenization produced empty token ids for text: {text[:30]}...")
                return token_ids
            else:
                return basic_tokenize(text)  # Fallback basic tokenization

        dataset[column + "_token_ids"] = dataset[column].apply(process_text)
        if display:
            print(f"[INFO] Tokenization completed on DataFrame: {len(dataset)} rows processed.")
        return dataset

    elif isinstance(dataset, (Dataset, DatasetDict)):
        def process_batch(batch):
            """Handles batch processing to avoid type errors."""
            texts = batch[column]
            if tokenizer:
                tokenized_outputs = tokenizer(
                    texts,
                    padding="max_length",
                    truncation=True,
                    max_length=max_length,
                    return_tensors=None  # Return Python lists
                )
                token_ids = tokenized_outputs.get("input_ids", [])
                tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in token_ids]
            else:
                tokens = [basic_tokenize(text) for text in texts]
                token_ids = None  # Not used for basic tokenization

            # Process sentence segmentation if requested
            sentences = []
            if sentence_level:
                for text in texts:
                    sents = [sent.text for sent in nlp_token(text).sents]
                    sentences.append(sents)
            else:
                sentences = [None] * len(texts)

            return {
                column + "_tokens": tokens,
                column + "_token_ids": token_ids if token_ids else [None] * len(tokens),
                column + "_sentences": sentences
            }

        tokenized_dataset = dataset.map(process_batch, batched=True)
        if display:
            print(f"[INFO] Tokenization completed on Hugging Face dataset for column '{column}'.")
        return tokenized_dataset

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")



from sentence_transformers import SentenceTransformer
import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import pandas as pd

# Load SentenceTransformer Model (small but powerful)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load spaCy model with vectors (make sure you have en_core_web_md installed)
nlp_vec = spacy.load("en_core_web_md")

def encode_words(dataset, column="text", batch_size=32, display=False):
    """
    Encodes words individually into embeddings using spaCy's word vectors.

    Parameters:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        column (str): The column containing text.
        batch_size (int): Placeholder for compatibility (unused here).
        display (bool): If True, prints a final summary message after encoding.

    Returns:
        (processed_dataset, word_embeddings_dict)
            processed_dataset: The dataset with an added column (e.g. "text_word_embeddings") containing lists of embeddings.
            word_embeddings_dict: A dictionary mapping each encountered word to its embedding.
    """
    word_embeddings_dict = {}

    def get_word_embeddings(text):
        doc = nlp_vec(text)
        embeddings = []
        for token in doc:
            word = token.text
            emb = token.vector  # Get spaCy's vector for this token
            embeddings.append(emb)
            word_embeddings_dict[word] = emb  # Update dictionary (duplicates get overwritten with identical vectors)
        return embeddings

    if isinstance(dataset, pd.DataFrame):
        # Apply function to every row in the DataFrame column
        dataset[column + "_word_embeddings"] = dataset[column].apply(get_word_embeddings)
        if display:
            print(f"[INFO] DataFrame processing complete. Processed {len(dataset)} rows.")
            print(f"[INFO] Vocabulary size (unique words embedded): {len(word_embeddings_dict)}")
        return dataset, word_embeddings_dict

    elif isinstance(dataset, (Dataset, DatasetDict)):
        def process_batch(batch):
            embeddings_batch = []
            for text in batch[column]:
                word_embeds = get_word_embeddings(text)
                embeddings_batch.append(word_embeds)
            return {column + "_word_embeddings": embeddings_batch}

        processed_dataset = dataset.map(process_batch, batched=True)
        if display:
            total = processed_dataset.num_rows if hasattr(processed_dataset, "num_rows") else "unknown"
            print(f"[INFO] Hugging Face dataset processing complete. Processed {total} rows.")
            print(f"[INFO] Vocabulary size (unique words embedded): {len(word_embeddings_dict)}")
        return processed_dataset, word_embeddings_dict

    else:
        raise ValueError("Unsupported dataset type. Use a Hugging Face dataset or pandas DataFrame.")



import fasttext

# Load the FastText model using the absolute path
fasttext_model = fasttext.load_model("C:/Users/bhall/cc.en.300.bin")

'''# Test: Get word vector for a word
word_vector = fasttext_model.get_word_vector("hello")
print(word_vector[:10])  # Print first 10 values'''

import numpy as np

def create_embedding_matrix(fasttext_model, word_index, embedding_dim=300, display=False):
    """
    Creates an embedding matrix using a pre-trained FastText model.

    Args:
        fasttext_model: Loaded FastText model.
        word_index (dict): Tokenizer's word-to-index mapping.
        embedding_dim (int): Dimensionality of FastText embeddings (default: 300).
        display (bool): If True, prints a final summary message.

    Returns:
        np.ndarray: Embedding matrix of shape (vocab_size, embedding_dim).
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, index in word_index.items():
        if index >= vocab_size:  # Ignore words exceeding vocab limit
            continue
        embedding_vector = fasttext_model.get_word_vector(word)
        embedding_matrix[index] = embedding_vector  # Assign to matrix

    if display:
        print(f"[INFO] Created embedding matrix with shape: {embedding_matrix.shape}")

    return embedding_matrix



import torch
import numpy as np
import pandas as pd

def convert_to_tensors(dataset, columns, labels_column=None, display=False):
    """
    Converts specified dataset columns to PyTorch tensors.

    Args:
        dataset (Dataset, DatasetDict, or pd.DataFrame): The dataset to process.
        columns (list): List of columns to convert to tensors.
        labels_column (str, optional): Column containing labels (for supervised tasks).
        display (bool, optional): If True, prints out summary info after conversion.

    Returns:
        dict: Dictionary containing tensors for each specified column.
    """
    tensor_dict = {}

    for col in columns:
        data = dataset[col]

        # If data is a pandas Series, force each element to be a list
        if isinstance(data, pd.Series):
            data = data.apply(lambda x: list(x) if not isinstance(x, list) else x)
            data = list(data)  # Convert Series into a list

        if isinstance(data, (list, np.ndarray)):
            if col.endswith("_token_ids"):
                data = torch.tensor(data, dtype=torch.long)
            else:
                data = torch.tensor(data, dtype=torch.float32)
        else:
            raise ValueError(f"Unsupported data type for column: {col}")

        tensor_dict[col] = data

    if labels_column:
        if hasattr(dataset[labels_column], "values"):
            labels = torch.tensor(dataset[labels_column].values, dtype=torch.long)
        else:
            labels = torch.tensor(dataset[labels_column], dtype=torch.long)
        tensor_dict["labels"] = labels

    if display:
        print("[INFO] Conversion to tensors complete. Summary:")
        for key, tensor in tensor_dict.items():
            print(f"  {key}: shape = {tensor.shape}")

    return tensor_dict



from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, tensor_dict):
        """
        Custom PyTorch Dataset for text-based data.

        Args:
            tensor_dict (dict): Dictionary containing tensors (including optional labels).
        """
        self.data = tensor_dict
        self.keys = list(tensor_dict.keys())

    def __len__(self):
        # Assumes all tensor entries have the same length
        return len(self.data[self.keys[0]])

    def __getitem__(self, idx):
        sample = {key: self.data[key][idx] for key in self.keys}
        return sample



from torch.utils.data import DataLoader

def create_dataloader(tensor_dict, batch_size=32, shuffle=True, display=False):
    """
    Creates a DataLoader for batch processing.

    Args:
        tensor_dict (dict): Dictionary of tensors (from convert_to_tensors).
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to shuffle data.
        display (bool, optional): If True, prints out summary info after creation.

    Returns:
        DataLoader: PyTorch DataLoader object.
    """
    dataset = CustomTextDataset(tensor_dict)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    
    if display:
        num_samples = len(dataset)
        num_batches = len(dataloader)
        print(f"[INFO] DataLoader created: {num_samples} samples, batch size = {batch_size}, total batches = {num_batches}")

    return dataloader
    

KeyboardInterrupt: 