<a href="https://colab.research.google.com/github/R-802/LING-226-Assignments/blob/main/Assignment_One.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#LING226 2023 T3 Assignment One
- Shemaiah Rangitaawa `300601546`
- Attempting Challenge

## **Text Preprocessing**
Rather than removing terms by frequency, I have decided to remove text based on it's embedding similarity.


In [1]:
import torch
from transformers import BertTokenizer, BertModel

# Setting up the device for GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Checking if CUDA is available and getting the GPU device name
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No CUDA Device Available"

cuda_available, gpu_name

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(True, 'Tesla T4')

In [2]:
# Function to create BERT embeddings
def create_embeddings(text):
    # Add special tokens ([CLS] and [SEP]) and tokenize the text
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Convert tokens to their corresponding IDs
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create segment IDs (0 for the first sentence, 1 for the second sentence)
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors and move them to the correct device
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # Put the model in "evaluation" mode
    model.eval()

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Get the hidden states from the outputs
    hidden_states = outputs.last_hidden_state

    # Use the [CLS] token representation as the sentence embedding
    sentence_embedding = hidden_states[0].mean(dim=0)

    # Convert the sentence embedding to a NumPy array while still on the GPU
    sentence_embedding = sentence_embedding.cpu().numpy()

    return sentence_embedding

In [48]:
import string

# Function to preprocess text: lowercase, remove punctuation, and stop words
def preprocess_text(text, stop_words):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    return ' '.join([word for word in words if word not in stop_words])

In [42]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# Main function to refine text
def refine_text(text, keywords, stop_words, threshold=0.5):
    sentences = sent_tokenize(text)
    keyword_embeddings = [create_embeddings(keyword) for keyword in keywords]
    filtered_sentences = []

    for sentence in sentences:
        sentence_embedding = create_embeddings(sentence)
        if not any(cosine_similarity(sentence_embedding.reshape(1, -1),
                                     keyword_emb.reshape(1, -1))[0][0] > threshold
                   for keyword_emb in keyword_embeddings):
            filtered_sentences.append(sentence)

    processed_text = ' '.join(filtered_sentences)
    return preprocess_text(processed_text, stop_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Example usage of refine_text
text = """
Technology is rapidly advancing, bringing new innovations every day. The quick brown fox jumps over the lazy dog.
Artificial intelligence is transforming industries. Yesterday, I had a delicious apple pie.
Renewable energy sources, like solar and wind power, are essential for sustainable growth. Cats are wonderful pets.
"""

# Keywords related to 'technology'
keywords = ["technology", "artificial intelligence", "renewable energy"]

# Print the refined text
model.to(device)
print(refine_text(text, keywords, set([]), 0.6))

technology is rapidly advancing bringing new innovations every day the quick brown fox jumps over the lazy dog yesterday i had a delicious apple pie cats are wonderful pets


In [None]:
!pip install keybert
!pip install sentence_transformers

In [49]:
from keybert import KeyBERT
import torch

# Initialize KeyBERT model
kw_model = KeyBERT()

def extract_keywords(text, num_keywords=5):
    """
    Extract keywords from a text using KeyBERT and compute their embeddings.

    :param text: The text to extract keywords from.
    :param num_keywords: Number of keywords to extract.
    :return: A dictionary with keyword embeddings and the list of keywords.
    """

    # Extract keywords from the text
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=num_keywords)

    # Extract just the keywords (first element of each tuple)
    extracted_keywords = [keyword[0] for keyword in keywords]

    # Tokenize and encode extracted keywords in a batch
    key_word_tokens = tokenizer(extracted_keywords, padding=True, return_tensors="pt").to(device)

    with torch.no_grad():
        key_word_outputs = model(**key_word_tokens)
    key_word_embeddings = key_word_outputs.last_hidden_state.mean(dim=1)

    return {
        'key_word_embeddings': key_word_embeddings,
        'keywords': extracted_keywords
    }

## **Text Metric Function `text_metrics`**
**Total Words:** The total count of words in the text.

**Overall Lexical Diversity:** The ratio of unique words to the total number of words, providing a measure of the text's vocabulary variety.

**Average Sentence Lexical Diversity:** The average diversity of vocabulary used across all sentences in the text.

**Top Ten Most Frequent Word:** A list of the ten most commonly used words in the text, along with their frequencies.

**Total Number of Sentences:** The total sentence count of the text. When analysing processed text, this metric becomes redundant as there is no punctuation to split the text on.

In [18]:
import re
from collections import Counter

def text_metrics(text):
    """
    Calculate various metrics from the given text.

    :param text: The input text to analyze
    :return: A dictionary with metrics including total words, overall lexical diversity,
             average sentence lexical diversity, top ten words, and number of sentences
    """
    # Tokenizing the text into words
    words = re.findall(r'\b\w+\b', text.lower())
    total_words = len(words)

    # Overall lexical diversity
    unique_words = len(set(words))
    overall_lexical_diversity = unique_words / total_words if total_words > 0 else 0

    # Sentence tokenization
    sentences = re.split(r'[.!?]+\s*', text.strip())
    sentences = [s for s in sentences if s]  # Remove empty sentences

    # Lexical diversity per sentence
    sentence_diversities = [
        len(set(re.findall(r'\b\w+\b', sentence.lower()))) / len(sentence.split())
        for sentence in sentences
    ]

    # Average lexical diversity of sentences
    avg_sentence_lexical_diversity = sum(sentence_diversities) / len(sentence_diversities) if sentence_diversities else 0

    # Top ten most frequent words
    top_ten_words = Counter(words).most_common(10)

    # Number of sentences
    num_sentences = len(sentences)

    return {
        'total_words': total_words,
        'overall_lexical_diversity': overall_lexical_diversity,
        'avg_sentence_lexical_diversity': avg_sentence_lexical_diversity,
        'top_ten_words': top_ten_words,
        'num_sentences': num_sentences
    }

###**Formatting for Text Metrics**

In [19]:
def format_metrics(title, metrics):
    # Handling edge case where there might not be any frequent words
    if metrics['top_ten_words']:
        formatted_top_words = ', '.join([word for word, _ in metrics['top_ten_words']])
        highest_word, highest_freq = metrics['top_ten_words'][0]  # Extracting the highest frequency word and its frequency
    else:
        formatted_top_words = "None"
        highest_word, highest_freq = ("N/A", 0)

    # Formatting the diversities as percentages
    overall_diversity_percentage = metrics['overall_lexical_diversity'] * 100
    avg_sentence_diversity_percentage = metrics['avg_sentence_lexical_diversity'] * 100

    return (f"--------- Text Metrics for {title} ---------\n"
            f"Total Words: {metrics['total_words']}\n"
            f"Total Sentences: {metrics['num_sentences']}\n"
            f"Overall Lexical Diversity: {overall_diversity_percentage:.2f}%\n"
            f"Average Lexical Diversity of Sentences: {avg_sentence_diversity_percentage:.2f}%\n"
            f"Top Ten Most Frequent Words: {formatted_top_words}\n"
            f"Highest Frequency Word: '{highest_word}' (Frequency: {highest_freq})\n")


## **Importing and Reading `TP001.txt` from URL and `austen-emma.txt` from NLTK corpora**

In [20]:
!wget 'https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt'

--2023-11-18 09:42:07--  https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220746 (216K) [text/plain]
Saving to: ‘tp001.txt’


2023-11-18 09:42:07 (6.19 MB/s) - ‘tp001.txt’ saved [220746/220746]



In [21]:
# Open the file and read its lines
with open('tp001.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Concatenate all comments into a single text string
tp001_text = ""
for line in lines:
    if '\t' in line:
        comment = line.split('\t')[1].strip()  # Extract and strip the comment
        tp001_text += comment + " "  # Add the comment to the text string

# Optionally, display the first part of the concatenated text
print("First part of tp001_text:", tp001_text[:500])

First part of tp001_text: ... we need to work hard to make it happen 3d is better than other bands in the whole country a ban on sales of new petrol vehicles would be more sensible than an outright ban .  an outright ban is itself wasteful A carless life is much more fun A good idea in theory but would have to change a lot of infrastructure. Not to mention industry and jobs. a good idea to protect our earth ! A good opportunity to reduce harm to the environment A N G E R Y A s part of many other changes A STEP IN THE RIG


In [22]:
import nltk
from nltk.corpus import gutenberg

# Download gutenberg corpus
nltk.download('gutenberg')

# Using Emma by Jane Austen 1816
emma_text = gutenberg.raw('austen-emma.txt')
print(emma_text[:290])

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.



# **Experimentation**
The following experimentation section includes:
- An analysis and overview of metrics from both sample texts.  
- Visualization of the top ten words before and after processing.
- Analysis of Emma's overall lexical diversity before and after processing.

**Notes:** I have chosen to use the NLTK's stopword list for preprocessing. I  have used 'Emma by Jane Austen 1816' from NLTK corpora and 'TP001 (Petrol cars should be banned by 2030)' from The Current.

### **Importing libraries and initializing stopwords set**
Required for preprocessing and visualization.

In [56]:
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# For some reason 'n' was the top word in tp001
stop_words.add('n')
print(stop_words)

{'an', 'did', 'is', 'off', 'my', 'yourselves', 'out', 'hadn', 'can', 'now', 'hers', 'when', 'further', 'i', 'this', 'our', 'has', 'until', 'if', 'won', "mightn't", 'of', 'up', 'isn', 'mustn', 'nor', 'what', 'was', 'his', "you'd", 'm', 'herself', 'shouldn', 'only', 'between', 'yourself', 'that', 'so', 'have', 'which', "she's", 'were', 'being', 'all', 'these', "weren't", "isn't", 'more', 'at', 'from', 'll', "wasn't", 'wouldn', 'its', "shouldn't", 'she', 'ours', 'be', 'didn', 'he', "don't", 'where', "you'll", 'been', "hasn't", 'are', 'theirs', 'a', "aren't", 'needn', 'because', 'same', 'those', 'below', 'on', "needn't", 'themselves', 'again', 'some', "couldn't", 'you', 'does', 'into', 'just', 'will', 'against', 'too', 'during', 'ain', 'than', 'while', "hadn't", 'whom', 'it', 'their', 'under', "it's", 'by', 'then', 'there', 'doesn', 'd', 'don', 'me', 'they', 'doing', 'ourselves', 'for', 'having', 'with', 'each', 'or', 'her', 'over', "you're", "you've", 'as', 'yours', 'itself', 'how', 'migh

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Analysis and overview of metrics from both sample texts**


In [44]:
# Analyzing raw texts
# --------------------

# Get text metrics for raw unprocessed text
emma_metrics = text_metrics(emma_text)
tp001_metrics = text_metrics(tp001_text)

# Extracting top ten words and their frequencies for plotting
emma_top_ten_words, emma_frequencies = zip(*emma_metrics['top_ten_words'])
tp001_top_ten_words, tp001_frequencies = zip(*tp001_metrics['top_ten_words'])

# Extract the number of sentences
emma_num_sentences = emma_metrics['num_sentences']
tp001_num_sentences = tp001_metrics['num_sentences']

# Printing metrics for raw texts
print(format_metrics("Emma - Raw Text", emma_metrics))
print(format_metrics("TP001 - Raw Text", tp001_metrics))

--------- Text Metrics for Emma - Raw Text ---------
Total Words: 161983
Total Sentences: 10535
Overall Lexical Diversity: 4.48%
Average Lexical Diversity of Sentences: 93.33%
Top Ten Most Frequent Words: to, the, and, of, i, a, it, her, was, she
Highest Frequency Word: 'to' (Frequency: 5239)

--------- Text Metrics for TP001 - Raw Text ---------
Total Words: 39065
Total Sentences: 1562
Overall Lexical Diversity: 12.06%
Average Lexical Diversity of Sentences: 90.14%
Top Ten Most Frequent Words: the, to, we, and, i, it, a, is, be, for
Highest Frequency Word: 'the' (Frequency: 1507)



In [51]:
# Extracting keywords for preprocessing
print("\033[93mExtracting Keywords...\033[0m")

emma_keywords = extract_keywords(emma_text, 20)
print(emma_keywords['keywords'])

tp001_keywords = extract_keywords(tp001_text, 20)
print(tp001_keywords['keywords'])

emma_keyword_embeddings = emma_keywords['key_word_embeddings']
tp001_keyword_embeddings = tp001_keywords['key_word_embeddings']

print("\033[92mExtraction Successful\n\033[0m")

[93mExtracting Keywords...[0m
['emma', 'austen', 'maids', '1816', 'mistress', 'niece', 'isabella', '_taylor_', 'catherine', 'scarlet', 'taylor', 'mistresses', 'jane', 'maid', 'daughters', 'elizabeth', 'harriet', 'housemaid', 'novelties', 'anne']
['renewables', 'renewable', 'mitigation', 'fuels', 'sustainability', 'alternatives', 'sustainable', 'emissions', 'fuelsand', 'energytech', 'environmentally', 'pollution', 'enviromentforfuture', 'biofuel', 'banning', 'pollutionand', 'conservation', 'diesel', 'ennvironmental', 'petroleum']
[92mExtraction Successful
[0m


In [55]:
# Analyzing preprocessed texts
# ----------------------------

# def refine_text(text, keywords, stop_words, threshold=0.5):

preprocessed_tp001 = refine_text(tp001_text, keywords, stop_words)
preprocessed_tp001_metrics = text_metrics(preprocessed_tp001)
print(format_metrics("TP001 - Preprocessed Text", preprocessed_tp001_metrics))

preprocessed_emma = refine_text(emma_text, keywords, stop_words)
preprocessed_emma_metrics = text_metrics(preprocessed_emma)
print(format_metrics("Emma - Preprocessed Text", preprocessed_emma_metrics))

# Extracting top ten words and their frequencies for preprocessed texts
preprocessed_emma_top_ten, preprocessed_emma_freq = zip(*preprocessed_emma_metrics['top_ten_words'])
preprocessed_tp001_top_ten, preprocessed_tp001_freq = zip(*preprocessed_tp001_metrics['top_ten_words'])

# Extract the number of sentences for preprocessed texts
preprocessed_emma_num_sentences = preprocessed_emma_metrics['num_sentences']
preprocessed_tp001_num_sentences = preprocessed_tp001_metrics['num_sentences']

--------- Text Metrics for TP001 - Preprocessed Text ---------
Total Words: 19567
Total Sentences: 1
Overall Lexical Diversity: 23.08%
Average Lexical Diversity of Sentences: 23.26%
Top Ten Most Frequent Words: cars, petrol, need, think, change, good, planet, better, electric, environment
Highest Frequency Word: 'cars' (Frequency: 507)

--------- Text Metrics for Emma - Preprocessed Text ---------
Total Words: 72433
Total Sentences: 1
Overall Lexical Diversity: 12.62%
Average Lexical Diversity of Sentences: 12.62%
Top Ten Most Frequent Words: mr, could, would, emma, mrs, miss, must, much, said, one
Highest Frequency Word: 'mr' (Frequency: 1120)



## **Visualization of The Top Ten Words with Their Frequencies Before and After Processing**

The visual comparison of word frequencies before and after text processing illustrates the shift from generic to specific language elements, informing the thematic interpretation of the text.


In [None]:
# Create a subplot figure with 2 rows and 2 columns
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Unprocessed Emma', 'Unprocessed TP001', 'Processed Emma', 'Processed TP001')
)

# Original Emma
fig.add_trace(
    go.Bar(x=emma_top_ten_words, y=emma_frequencies),
    row=1, col=1
)

# Original TP001
fig.add_trace(
    go.Bar(x=tp001_top_ten_words, y=tp001_frequencies),
    row=1, col=2
)

# Preprocessed Emma
fig.add_trace(
    go.Bar(x=preprocessed_emma_top_ten, y=preprocessed_emma_freq),
    row=2, col=1
)

# Preprocessed TP001
fig.add_trace(
    go.Bar(x=preprocessed_tp001_top_ten, y=preprocessed_tp001_freq),
    row=2, col=2
)

# Update layout
fig.update_layout(
    title_text='Top Ten Words and Their Frequencies',
    showlegend=False,
    height=800, width=1200
)

# Customize axis labels
fig.update_xaxes(title_text='Words', row=1, col=1)
fig.update_xaxes(title_text='Words', row=1, col=2)
fig.update_xaxes(title_text='Words', row=2, col=1)
fig.update_xaxes(title_text='Words', row=2, col=2)
fig.update_yaxes(title_text='Occurrence Frequency', col=1)

# Show the figure
fig.show()

## **Comparative Analysis of Overall Lexical Diversity in Processed and Unprocessed Versions of "Emma" by Jane Austen.**

The results of the analysis below show the overall lexical diversity of Jane Austen's "Emma" in both its processed and unprocessed forms as the batch size (number of sentences per batch) increases. It demonstrates how lexical diversity decreases as the size of the text increases.

**Processed Overall Lexical Diversity (Blue):** As the batch size increases, we observe a gradual decrease in lexical diversity for the processed version. This trend indicates that when analyzing larger portions of the text together, the processed version becomes less lexically diverse. This may be attributed to the removal of high-frequency words during processing, which results in a more focused vocabulary.

**Unprocessed Overall Lexical Diversity (Red):** The red line represents the lexical diversity of the original, unprocessed text. Similarly, as the batch size increases, we also see a decrease in lexical diversity for the unprocessed version. This decrease suggests that even in the unprocessed text, certain words become more prominent and repetitive when analyzing larger sections of the text. However, the unprocessed text tends to maintain a higher lexical diversity compared to the processed version, as it retains all words without filtering.

In summary, the overall lexical diversity highlights how text processing affects the diversity of words in a text, particularly as the scale increases. While both processed and unprocessed texts exhibit a decrease in lexical diversity with larger batch sizes, the processed version tends to show a more pronounced decrease due to the removal of high-frequency words. Understanding these trends can aid in choosing the appropriate preprocessing strategy based on the specific goals of text analysis.

**Note:** Due to the nature of the task, this section can take a while to run (typically < 2 minutes).

In [None]:
increment = 75  # n sentences per increment
batch_sizes = list(range(1, emma_num_sentences, increment))  # Incrementally increase batch size

overall_lex_div_unprocessed = []
overall_lex_div_processed = []

# Split the text into sentences
sentences = sent_tokenize(emma_text)

# Calculate lexical diversities
for batch_size in batch_sizes:
    # Concatenate all sentences up to the current batch size
    concatenated_unprocessed = ' '.join(sentences[:batch_size])
    concatenated_processed = preprocess_text(concatenated_unprocessed, stop_words, std_dev_margin)[0]

    # Calculate overall lexical diversity for the concatenated text
    overall_lex_div_unprocessed.append(text_metrics(concatenated_unprocessed)[1])
    overall_lex_div_processed.append(text_metrics(concatenated_processed)[1])

In [None]:
# Prepare batch size labels with sentence count
batch_size_labels = [batch_size for batch_size in batch_sizes]

# Convert lexical diversity to percentages
processed_lex_div = [ld * 100 for ld in overall_lex_div_processed]
unprocessed_lex_div = [ld * 100 for ld in overall_lex_div_unprocessed]

# Create traces
trace1 = go.Scatter(
    x=batch_size_labels,
    y=processed_lex_div,
    mode='lines',
    name='Overall Lexical Diversity (Processed)',
)
trace2 = go.Scatter(
    x=batch_size_labels,
    y=unprocessed_lex_div,
    mode='lines',
    name='Overall Lexical Diversity (Unprocessed)',
)

# Layout
layout = go.Layout(
    title='Overall Lexical Diversity over Size Increments of ' + str(increment),
    xaxis=dict(title='Number of Sentences'),
    yaxis=dict(title='Lexical Diversity (%)'),
)

# Figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show plot
fig.show()

# Print metrics (assuming format_metrics is defined)
print("\n" + format_metrics("Emma (Unprocessed)", emma_metrics) + "\n")
print(format_metrics("Emma (Processed)", preprocessed_emma_metrics) + "\n")



--------- Text Metrics for Emma (Unprocessed) ---------
Total Words: 161983
Total Sentences: 10567
Overall Lexical Diversity: 4.48%
Average Lexical Diversity of Sentences: 94.43%
Top Ten Most Frequent Words: to, the, and, of, i, a, it, her, was, she
Highest Frequency Word: 'to' (Frequency: 5239)

--------- Text Metrics for Emma (Processed) ---------
Total Words: 71031
Total Sentences: 1
Overall Lexical Diversity: 13.14%
Average Lexical Diversity of Sentences: 13.14%
Top Ten Most Frequent Words: would, emma, mrs, miss, must, much, said, every, one, harriet
Highest Frequency Word: 'would' (Frequency: 816)



## Visualization of the Statistical Approach to Word Removal