<a href="https://colab.research.google.com/github/R-802/LING-226-Assignments/blob/main/Assignment_One.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#LING226 2023 T3 Assignment One
- Shemaiah Rangitaawa `300601546`
- Attempting Challenge

**Note:** Please ensure you are in a GPU runtime environment.

In [1]:
import string
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def clean_text(text, stop_words=None, remove_punctuation=True, to_lowercase=True):
    if to_lowercase:
        text = text.lower()

    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Ensure stop_words is a set for efficiency
    stop_words_set = set(stop_words) if stop_words else set()

    # Filter out stop words
    filtered_words = [word for word in words if word not in stop_words_set]

    return ' '.join(filtered_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## **Text Preprocessing With a Context-based Filtering Approach**

In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Setting up the device for GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Checking if CUDA is available and getting the GPU device name
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No CUDA Device Available"

cuda_available, gpu_name

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(True, 'Tesla T4')

In [3]:
# Function to create BERT embeddings
def create_embeddings(text, model=model, tokenizer=tokenizer, device='cuda'):
    """
    Generate BERT embeddings for a given text.

    :param text: The input text to generate embeddings for.
    :param model: The BERT model (e.g., a pre-trained BERT model).
    :param tokenizer: The BERT tokenizer.
    :param device: The device (e.g., 'cuda' for GPU or 'cpu') to run the model on.

    :return: A PyTorch Tensor containing the BERT embeddings for the input text.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512, add_special_tokens=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    text_embedding = hidden_states.mean(dim=1).squeeze(0)

    return text_embedding

In [4]:
import nltk
from nltk.tokenize import sent_tokenize
import torch.nn.functional as F

# Main function to preprocess text
def preprocess(text, keyword_embeddings, stop_words, threshold=0.5):
    sentences = sent_tokenize(text)
    filtered_sentences = []

    for sentence in sentences:
        sentence_embedding = create_embeddings(sentence)
        similarities = [F.cosine_similarity(
                                sentence_embedding.unsqueeze(0),
                                keyword_emb.unsqueeze(0), dim=1)
                                for keyword_emb in keyword_embeddings]

        if not any(similarity > threshold for similarity in similarities):
            filtered_sentences.append(sentence)

    processed_text = ' '.join(filtered_sentences)
    return clean_text(processed_text, stop_words)

### **Keyword Extraction**

In [5]:
!pip install keybert
!pip install sentence_transformers

Collecting keybert
  Downloading keybert-0.8.3.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: keybert, sentence-transformers
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.8.3-py3-none-any.whl size=39126 sha256=9e4ba2c49acd51e2d34f47da42bd18a0315d07082f30eed9a74f16342d1b884b
  Stored in direct

In [6]:
from keybert import KeyBERT
import torch

# Initialize KeyBERT model
kw_model = KeyBERT()

def extract_keywords(text, num_keywords=5):
    """
    Extract keywords from a text using KeyBERT and compute their embeddings.

    :param text: The text to extract keywords from.
    :param num_keywords: Number of keywords to extract.
    :return: A dictionary with keyword embeddings and the list of keywords.
    """

    # Extract keywords from the text
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_keywords, use_mmr=True, diversity=0.7)

    # Extract just the keywords (first element of each tuple)
    extracted_keywords = [keyword[0] for keyword in keywords]

    # Tokenize and encode extracted keywords in a batch
    key_word_tokens = tokenizer(extracted_keywords, padding=True, return_tensors="pt").to(device)

    with torch.no_grad():
        key_word_outputs = model(**key_word_tokens)
    key_word_embeddings = key_word_outputs.last_hidden_state.mean(dim=1)

    return {
        'key_word_embeddings': key_word_embeddings,
        'keywords': extracted_keywords
    }

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## **Text Preprocessing Using Term Frequency - Inverse Document Frequency (TF-IDF) Filtering**

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def preprocess_tf_idf(texts, stop_words, idf_percentile_lower=10, idf_percentile_upper=90):
    """
    Preprocess texts by calculating TF-IDF and filtering words based on TF-IDF percentile thresholds.

    :param texts: List of input texts to be preprocessed.
    :param stop_words: Words to be removed from the text.
    :param idf_percentile_lower: Lower percentile threshold for TF-IDF scores.
    :param idf_percentile_upper: Upper percentile threshold for TF-IDF scores.
    :return: List of preprocessed texts.
    """
    # Tokenize and clean each document
    clean_docs = [clean_text(text, stop_words) for text in texts]  # clean_text should return a string

    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(clean_docs)

    # Get feature names and TF-IDF score
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()

    # Calculate percentile thresholds
    lower_threshold = np.percentile(tfidf_scores, idf_percentile_lower)
    upper_threshold = np.percentile(tfidf_scores, idf_percentile_upper)

    # Filter words based on TF-IDF scores
    filtered_words = [feature_names[i] for i in range(len(feature_names))
                      if lower_threshold <= tfidf_scores[i] <= upper_threshold]

    # Reconstruct the documents using filtered words
    preprocessed_texts = []
    for doc in clean_docs:
        words = doc.split()
        preprocessed_texts.append(" ".join([word for word in words if word in filtered_words]))

    return preprocessed_texts

## **Text Metrics**

In [8]:
import re
import nltk
from collections import Counter
from nltk.tokenize import sent_tokenize

def text_metrics(text):

    # Overall lexical diversity
    words = word_tokenize(text.lower())
    overall_lexical_diversity = lexical_diversity(words)
    num_words = len(words)

    # Sentence tokenization
    sentences = sent_tokenize(text)

    # Lexical diversity per sentence
    sentence_diversities = []
    for sentence in sentences:
        words_in_sentence = word_tokenize(sentence)
        unique_words = len(set(words_in_sentence))
        total_words = len(words_in_sentence)
        if total_words > 0:
            sentence_diversity = unique_words / total_words
        else:
            sentence_diversity = 0
        sentence_diversities.append(sentence_diversity)

    # Average lexical diversity of sentences
    avg_sentence_lexical_diversity = sum(sentence_diversities) / len(sentence_diversities) if sentence_diversities else 0

    # Top ten most frequent words
    top_ten_words = Counter(words).most_common(10)
    num_sentences = len(sentences)

    return {
        'num_words': num_words,
        'lexical_diversity': overall_lexical_diversity,
        'avg_sentence_lexical_diversity': avg_sentence_lexical_diversity,
        'top_ten_words': top_ten_words,
        'num_sentences': num_sentences
    }

In [9]:
def lexical_diversity(words):
    """
    Calculate the overall lexical diversity of the text.

    :param words: List of all words in the text.
    :return: Lexical diversity, a ratio of unique words to total words.
    """
    return len(set(words)) / len(words) if words else 0

###**Formatting Function for Text Metrics**

In [10]:
def format_metrics(titles, metrics_list):
    for title, metrics in zip(titles, metrics_list):
        if metrics['top_ten_words']:
            formatted_top_words = ', '.join([word for word, _ in metrics['top_ten_words']])
            highest_word, highest_freq = metrics['top_ten_words'][0]  # Extracting the highest frequency word and its frequency
        else:
            formatted_top_words = "None"
            highest_word, highest_freq = ("N/A", 0)

        # Formatting the diversities as percentages
        overall_diversity_percentage = metrics['lexical_diversity'] * 100
        avg_sentence_diversity_percentage = metrics['avg_sentence_lexical_diversity'] * 100

        print(f"--------- Text Metrics for {title} ---------")
        print(f"Total Words: {metrics['num_words']}")
        print(f"Total Sentences: {metrics['num_sentences']}")
        print(f"Overall Lexical Diversity: {overall_diversity_percentage:.2f}%")
        print(f"Average Lexical Diversity of Sentences: {avg_sentence_diversity_percentage:.2f}%")
        print(f"Top Ten Most Frequent Words: {formatted_top_words}")
        print(f"Highest Frequency Word: '{highest_word}' (Frequency: {highest_freq})")
        print()

## **Importing and Reading `TP001.txt`, `TP002.txt` from URL and `austen-emma.txt` from NLTK corpora**

In [11]:
## **Importing and Reading `TP001.txt`, `TP002.txt` from URL and `austen-emma.txt` from NLTK corpora** {display-mode: "form"}

In [12]:
!wget 'https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt'
!wget 'https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp002.txt'

--2023-11-20 03:56:01--  https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220746 (216K) [text/plain]
Saving to: ‘tp001.txt’


2023-11-20 03:56:02 (1.94 MB/s) - ‘tp001.txt’ saved [220746/220746]

--2023-11-20 03:56:02--  https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp002.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812713 (794K) [text/plain]
Saving to: ‘tp002.txt’


2023-11-20 03:56:03 (4.24 MB/s) - ‘tp002.txt’ s

In [13]:
# Open the file and read its lines
with open('tp001.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

tp001_text = ""
for line in lines:
    if '\t' in line:
        comment = line.split('\t')[1].strip()  # Extract and strip the comment
        tp001_text += comment + " "  # Add the comment to the text string

print(tp001_text[:200])

... we need to work hard to make it happen 3d is better than other bands in the whole country a ban on sales of new petrol vehicles would be more sensible than an outright ban .  an outright ban is it


In [14]:
# Open the file and read its lines
with open('tp002.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

tp002_text = ""
for line in lines:
    if '\t' in line:
        comment = line.split('\t')[1].strip()  # Extract and strip the comment
        tp002_text += comment + " "  # Add the comment to the text string

print(tp002_text[:200])

I agree with this idea but there also needs to be some system of dealing with bi-catch. I don't want fish caught to be wasted just because those fishing commercially have accidentally caught the wrong


In [15]:
from nltk.corpus import gutenberg

# Downloading gutenberg corpus
nltk.download('gutenberg')

# Using Emma by Jane Austen 1816
emma_text = gutenberg.raw('austen-emma.txt')
print(emma_text[:290])

[nltk_data] Downloading package gutenberg to /root/nltk_data...


[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.



[nltk_data]   Unzipping corpora/gutenberg.zip.


# **Experimentation**
The following experimentation section includes:
- An analysis and overview of metrics from both sample texts.  
- Visualization of the top ten words before and after processing.
- Analysis of Emma's overall lexical diversity before and after processing.

**Notes:** I have chosen to use the NLTK's stopword list for preprocessing.

## **Importing libraries and initializing stopwords set**
Required for preprocessing and visualization.

In [16]:
import nltk
from nltk.corpus import stopwords

# Using stopwords from NLTK
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Additional words to the stopwords set
additional_stopwords = {'n', 'mr', '1816', 'mrs', 'miss',  'ä', 'å', 'pä', 'n', 's', 'r', 'tä', 'wä', 'thä', 'bä', 'ahhhhhhhhhhhh', 'hxbdjs', 'log'}
stop_words.update(additional_stopwords)
print(stop_words)

{'the', 'against', 'o', 'when', 'mrs', 'y', 'was', 'she', 'during', 'wouldn', 'couldn', 'yourself', 'its', 'they', 'shan', 'wasn', 'i', 'that', 'and', "couldn't", 'themselves', 'himself', 'doing', 'very', "hasn't", 'below', "don't", 'all', "needn't", 'n', 'weren', 'he', 'above', 'her', 'bä', 'being', 'then', "you'd", 'll', "should've", 'for', 'yourselves', 'can', 'some', "weren't", "shan't", 'here', 'not', 'with', '1816', 'shouldn', 'theirs', 'ours', "it's", 'by', 'miss', 'we', 'mustn', "isn't", 's', "you're", 'ä', 'has', 'ahhhhhhhhhhhh', 'you', 'are', 'didn', 'under', 'hadn', 'isn', 'will', 'log', 'a', 'up', 'them', 'me', 'but', "hadn't", "mightn't", 'him', 'who', 'while', 'does', 'or', 'wä', 'his', 'what', 'after', 'about', 'both', 'to', "won't", 'these', 'å', 'our', 'which', "shouldn't", 'it', 'through', 're', 'hers', 'just', 'do', 'too', 'now', 'until', 'pä', 'been', 'd', 'than', 'each', 'over', 'why', 'this', 'if', "you'll", 'aren', 'ourselves', 'more', 'into', 'did', 'down', 'tho

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Analysis and Extraction of Key Metrics from Sample Texts**


###**Analyzing Raw Texts**

In [17]:
# Get text metrics for raw unprocessed text
emma_metrics = text_metrics(emma_text)
tp001_metrics = text_metrics(tp001_text)
tp002_metrics = text_metrics(tp002_text)

# Prepare titles and metrics for the function
titles = ["Emma - Raw Text", "TP001 - Raw Text", "TP002 - Raw Text"]
metrics_list = [emma_metrics, tp001_metrics, tp002_metrics]

# Using the format_and_graph_multiple_metrics function to display metrics and graphs
format_metrics(titles, metrics_list)

--------- Text Metrics for Emma - Raw Text ---------
Total Words: 191781
Total Sentences: 7493
Overall Lexical Diversity: 4.14%
Average Lexical Diversity of Sentences: 90.89%
Top Ten Most Frequent Words: ,, ., the, to, and, of, i, a, --, it
Highest Frequency Word: ',' (Frequency: 12016)

--------- Text Metrics for TP001 - Raw Text ---------
Total Words: 41094
Total Sentences: 1431
Overall Lexical Diversity: 11.75%
Average Lexical Diversity of Sentences: 88.95%
Top Ten Most Frequent Words: the, to, ., we, and, i, it, a, is, be
Highest Frequency Word: 'the' (Frequency: 1507)

--------- Text Metrics for TP002 - Raw Text ---------
Total Words: 153902
Total Sentences: 4903
Overall Lexical Diversity: 7.63%
Average Lexical Diversity of Sentences: 89.43%
Top Ten Most Frequent Words: the, to, ., fish, we, and, it, i, #, is
Highest Frequency Word: 'the' (Frequency: 5702)



### **Analyzing Texts Processed with Context-Based Filtering**
This section is computationally and resource intensive, it can also take a while to run, typically under 4 minutes on a Tesla T4 in a colab runtime environment. It can be skipped.

In [None]:
# Extracting keywords for preprocessing
num_tp001_keywords = 15 # @param {type:"slider", min:1, max:50, step:1}
tp001_keywords = extract_keywords(tp001_text, num_tp001_keywords)
print(tp001_keywords['keywords'])

num_tp002_keywords = 15 # @param {type:"slider", min:1, max:50, step:1}
tp002_keywords = extract_keywords(tp002_text, num_tp002_keywords)
print(tp002_keywords['keywords'])

num_emma_keywords = 20 # @param {type:"slider", min:1, max:100, step:1}
emma_keywords = extract_keywords(emma_text, num_emma_keywords)
print(emma_keywords['keywords'])

emma_keyword_embeddings = emma_keywords['key_word_embeddings']
tp001_keyword_embeddings = tp001_keywords['key_word_embeddings']
tp002_keyword_embeddings = tp002_keywords['key_word_embeddings']

['renewable alternatives', '2050 maybe', 'technology fix', 'ban today', 'transport freedom', 'change frightening', 'earth begin', 'damaging petrolheads', 'scooter bike', 'maximising profits', 'vote campaign', 'totally agree', 'meterals isnt', 'goelectric allforms', 'carscontribute create', 'cell functional', 'conditions saver', 'children swim', 'dairy intake', 'xtraction processing']
['sustainable fishing', 'vegan supermarkets', 'unsustainability valid', 'term benefits', 'force prices', 'ethical abd', 'everythibg moderation', 'chainge environment', 'sthe means', 'eatting microscopic', 'types ideas', 'rduce recycle', 'especially salmn', 'gore wont', 'mercury levels', 'balance integral', 'setters rest', 'tepapa amazing', 'using requirements', 'shift population', 'fvbvcvvcch vbvg', 'protects eosystems', 'importend save', '6374 676746', 'yee yee']


In [None]:
# Preprocess TP001 text
preprocessed_tp001 = preprocess(tp001_text, tp001_keyword_embeddings, stop_words)
preprocessed_tp001_metrics = text_metrics(preprocessed_tp001)
format_metrics(["TP001 - Preprocessed Text"], [preprocessed_tp001_metrics])

# Preprocess TP002 text
preprocessed_tp002 = preprocess(tp002_text, tp002_keyword_embeddings, stop_words)
preprocessed_tp002_metrics = text_metrics(preprocessed_tp002)
format_metrics(["TP002 - Preprocessed Text"], [preprocessed_tp002_metrics])

# Preprocess Emma text
preprocessed_emma = preprocess(emma_text, emma_keyword_embeddings, stop_words)
preprocessed_emma_metrics = text_metrics(preprocessed_emma)
format_metrics(["Emma - Preprocessed Text"], [preprocessed_emma_metrics])

--------- Text Metrics for TP001 - Preprocessed Text ---------
Total Words: 4379
Total Sentences: 1
Overall Lexical Diversity: 33.09%
Average Lexical Diversity of Sentences: 33.09%
Top Ten Most Frequent Words: need, think, cars, planet, change, petrol, good, better, world, would
Highest Frequency Word: 'need' (Frequency: 87)

--------- Text Metrics for TP002 - Preprocessed Text ---------
Total Words: 11117
Total Sentences: 1
Overall Lexical Diversity: 26.42%
Average Lexical Diversity of Sentences: 26.42%
Top Ten Most Frequent Words: fish, think, dont, need, good, would, people, future, better, eat
Highest Frequency Word: 'fish' (Frequency: 596)



###**Analyzing Preprocessed Texts Using TF-IDF Based Filtering**

In [None]:
# Preprocess the texts using TF-IDF
texts = [tp001_text, tp002_text, emma_text]
preprocessed_texts = preprocess_tf_idf(texts, stop_words)

# Assuming preprocessed_texts is a list with three elements
tf_idf_tp001 = preprocessed_texts[0]
tf_idf_tp002 = preprocessed_texts[1]
tf_idf_emma = preprocessed_texts[2]

# Calculate metrics for each preprocessed text
tf_idf_tp001_metrics = text_metrics(tf_idf_tp001)
tf_idf_tp002_metrics = text_metrics(tf_idf_tp002)
tf_idf_emma_metrics = text_metrics(tf_idf_emma)

# Display formatted metrics
titles_tf_idf = ["TP001 - TF-IDF Preprocessed Text", "TP002 - TF-IDF Preprocessed Text", "Emma - TF-IDF Preprocessed Text"]
metrics_tf_idf = [tf_idf_tp001_metrics, tf_idf_tp002_metrics, tf_idf_emma_metrics]
format_metrics(titles_tf_idf, metrics_tf_idf)

## **Top Ten Words Before and After Processing**

The following visual comparison of word frequencies before and after text processing illustrates the shift from generic to specific language elements, informing the thematic interpretation of the text.


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Extracting top ten words and their frequencies for plotting
tp001_top_ten_words, tp001_frequencies = zip(*tp001_metrics['top_ten_words'])
tp002_top_ten_words, tp002_frequencies = zip(*tp002_metrics['top_ten_words'])
emma_top_ten_words, emma_frequencies = zip(*emma_metrics['top_ten_words'])

preprocessed_tp001_top_ten, preprocessed_tp001_freq = zip(*preprocessed_tp001_metrics['top_ten_words'])
preprocessed_tp002_top_ten, preprocessed_tp002_freq = zip(*preprocessed_tp002_metrics['top_ten_words'])
preprocessed_emma_top_ten, preprocessed_emma_freq = zip(*preprocessed_emma_metrics['top_ten_words'])

tf_idf_tp001_top_ten, tf_idf_tp001_freq = zip(*tf_idf_tp001_metrics['top_ten_words'])
tf_idf_tp002_top_ten, tf_idf_tp002_freq = zip(*tf_idf_tp002_metrics['top_ten_words'])
tf_idf_emma_top_ten, tf_idf_emma_freq = zip(*tf_idf_emma_metrics['top_ten_words'])

# Create a subplot figure with 3 rows and 3 columns (transposed)
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=(
        'TP001: Ban Petrol Cars by 2030',
        'TP002: Sustainable Fish Sales',
        'Emma by Jane Austen',
        'Context Based Preprocessed TP001',
        'Context Based Preprocessed TP002',
        'Context Based Preprocessed Emma',
        'TF-IDF Preprocessed TP001',
        'TF-IDF Preprocessed TP002',
        'TF-IDF Preprocessed Emma')
)

# Original TP001
fig.add_trace(
    go.Bar(x=tp001_top_ten_words, y=tp001_frequencies),
    row=1, col=1
)

# Original TP002
fig.add_trace(
    go.Bar(x=tp002_top_ten_words, y=tp002_frequencies),
    row=1, col=2
)

# Original Emma
fig.add_trace(
    go.Bar(x=emma_top_ten_words, y=emma_frequencies),
    row=1, col=3
)

# Preprocessed TP001
fig.add_trace(
    go.Bar(x=preprocessed_tp001_top_ten, y=preprocessed_tp001_freq),
    row=2, col=1
)

# Preprocessed TP002
fig.add_trace(
    go.Bar(x=preprocessed_tp002_top_ten, y=preprocessed_tp002_freq),
    row=2, col=2
)

# Preprocessed Emma
fig.add_trace(
    go.Bar(x=preprocessed_emma_top_ten, y=preprocessed_emma_freq),
    row=2, col=3
)

# TF-IDF TP001
fig.add_trace(
    go.Bar(x=tf_idf_tp001_top_ten, y=tf_idf_tp001_freq),
    row=3, col=1
)

# TF-IDF TP002
fig.add_trace(
    go.Bar(x=tf_idf_tp002_top_ten, y=tf_idf_tp002_freq),
    row=3, col=2
)

# TF-IDF Emma
fig.add_trace(
    go.Bar(x=tf_idf_emma_top_ten, y=tf_idf_emma_freq),
    row=3, col=3
)

# Update layout
fig.update_layout(
    title_text='Top Ten Words and Their Frequencies',
    showlegend=False,
    height=970
)

# Customize axis labels
for row in range(1, 4):
    for col in range(1, 4):
        fig.update_xaxes(title_text='Words', row=row, col=col)

fig.update_yaxes(title_text='Frequency', row=1, col=1)
fig.update_yaxes(title_text='Frequency', row=2, col=1)
fig.update_yaxes(title_text='Frequency', row=3, col=1)

# Show the figure
fig.show()

## **Comparative Analysis of Overall Lexical Diversity in Processed and Unprocessed Versions of Emma**

The results of the analysis below show the overall lexical diversity of Jane Austen's "Emma" in both its processed and unprocessed forms as the batch size (number of sentences per batch) increases. It demonstrates how lexical diversity decreases as the size of the text increases.

**Processed Overall Lexical Diversity (Blue):** As the batch size increases, we observe a gradual decrease in lexical diversity for the processed version. This trend indicates that when analyzing larger portions of the text together, the processed version becomes less lexically diverse. This is attributed to the removal of high-frequency and stop words during processing, which results in a more focused vocabulary.

**Unprocessed Overall Lexical Diversity (Red):** The red line represents the lexical diversity of the original, unprocessed text. In this case, as the batch size increases, we see a sharp decrease in lexical diversity of the unprocessed text. This decrease suggests that even in the unprocessed text, certain words become more prominent and repetitive when analyzing larger sections of the text.

In summary, overall lexical diversity highlights how text processing affects the diversity of words in a text, particularly as the of a text scale increases. While both processed and unprocessed texts exhibit a decrease in lexical diversity with larger batch sizes, the unprocessed version tends to show a more pronounced decrease due to the inherent diversity in this large text.

In [None]:
increment = 75  # n sentences per increment
emma_num_sentences = emma_metrics['num_sentences']
batch_sizes = list(range(1, emma_num_sentences, increment))  # Incrementally increase batch size

ld_unprocessed = []
ld_processed = []

# Split the text into sentences
sentences = sent_tokenize(emma_text)

# Calculate lexical diversities
for batch_size in batch_sizes:
    unprocessed = ' '.join(sentences[:batch_size])
    processed = clean_text(unprocessed, stop_words)

    # Extract words and calculate lexical diversity
    unprocessed_words = re.findall(r'\b\w+\b', unprocessed.lower())
    processed_words = re.findall(r'\b\w+\b', processed.lower())

    unprocessed_diversity = lexical_diversity(unprocessed_words)
    processed_diversity = lexical_diversity(processed_words)

    ld_unprocessed.append(unprocessed_diversity)
    ld_processed.append(processed_diversity)

In [None]:
# Prepare batch size labels with sentence count
batch_size_labels = [batch_size for batch_size in batch_sizes]

# Convert lexical diversity to percentages
processed_lex_div = [ld * 100 for ld in ld_processed]
unprocessed_lex_div = [ld * 100 for ld in ld_unprocessed]

# Create traces
trace1 = go.Scatter(
    x=batch_size_labels,
    y=processed_lex_div,
    mode='lines',
    name='Overall Lexical Diversity (Processed)',
)
trace2 = go.Scatter(
    x=batch_size_labels,
    y=unprocessed_lex_div,
    mode='lines',
    name='Overall Lexical Diversity (Unprocessed)',
)

# Layout
layout = go.Layout(
    title='Overall Lexical Diversity over Increments of ' + str(increment) + ' Sentences',
    xaxis=dict(title='Number of Sentences'),
    yaxis=dict(title='Lexical Diversity (%)'),
)

# Figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show plot
fig.show()