<a href="https://colab.research.google.com/github/R-802/LING-226-Assignments/blob/main/Assignment_One.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LING226 2023 T3 Assignment One
- Shemaiah Rangitaawa `300601546`
- Attempting Challenge

**Note:** Please ensure you are in a GPU runtime environment.

## **Text Preprocessing**
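Rather than removing terms by frequency, I have decided to remove text based on embedding similarity: a sentence is dropped when its BERT embedding is too similar to the embedding of any extracted keyword.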


In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Probe CUDA once; the device choice and the report below both reuse the result
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")

# Pre-trained checkpoint shared by the tokenizer and the encoder
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model = model.to(device)

# Name of GPU 0, or a placeholder string when no CUDA device is present
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "No CUDA Device Available"

# Last expression of the cell, so the notebook displays this tuple
cuda_available, gpu_name

(True, 'Tesla T4')

In [None]:
# Function to create BERT embeddings
def create_embeddings(text, model=model, tokenizer=tokenizer, device=None):
    """
    Generate a mean-pooled BERT embedding for a given text.

    :param text: The input text to generate embeddings for.
    :param model: The BERT model (e.g., a pre-trained BERT model).
    :param tokenizer: The BERT tokenizer.
    :param device: The device (e.g., 'cuda' for GPU or 'cpu') to run the model on.
                   When None (the default), the device that already holds the
                   model's parameters is used, so the function also works on a
                   CPU-only runtime.

    :return: A PyTorch Tensor containing the BERT embeddings for the input text:
             the average of the last hidden states over the non-padding tokens,
             with the leading batch dimension squeezed away when it has size 1.
    """
    # Follow the model instead of hard-coding 'cuda'
    if device is None:
        device = next(model.parameters()).device

    # Tokenize the text (truncated to at most 512 tokens)
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512, add_special_tokens=True)

    # Move inputs to the specified device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Put the model in "evaluation" mode
    model.eval()

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the hidden states from the outputs
    hidden_states = outputs.last_hidden_state

    # Mean-pool over the token axis (this is not the [CLS] vector). Padding
    # positions are excluded through the attention mask so that batched inputs
    # of different lengths are averaged over their real tokens only.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        pooled = hidden_states.mean(dim=1)
    else:
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = (hidden_states * weights).sum(dim=1) / token_counts

    text_embedding = pooled.squeeze(0)

    return text_embedding

Preprocess text by converting it to lowercase, removing punctuation, and filtering out stop words.

In [None]:
import string

# Function to preprocess text: lowercase, remove punctuation, and stop words
def preprocess_text(text, stop_words):
    """
    Lowercase a text, drop stop words, and strip punctuation.

    Each whitespace-separated token is compared with ``stop_words`` in two
    forms: with only its surrounding punctuation removed (so contractions such
    as "don't" still match their stop-word entry) and with all punctuation
    removed. Tokens that survive are emitted with all punctuation removed;
    tokens made up of punctuation only are dropped.

    :param text: The input text.
    :param stop_words: A collection of lowercase words to filter out.
    :return: The remaining words joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    kept_words = []
    for token in text.lower().split():
        cleaned = token.translate(punctuation_table)
        if not cleaned:
            continue  # the token was pure punctuation
        # Checking the edge-stripped form first keeps inner apostrophes, which
        # is what lets "don't" match before it would be reduced to "dont"
        if token.strip(string.punctuation) in stop_words or cleaned in stop_words:
            continue
        kept_words.append(cleaned)

    return ' '.join(kept_words)

In [None]:
from nltk.tokenize import sent_tokenize
import torch.nn.functional as F

# `from nltk.tokenize import sent_tokenize` does not bind the name `nltk`,
# so the package is imported explicitly before nltk.download() is called.
import nltk

# Fetch the 'punkt' resource before sent_tokenize is used
nltk.download('punkt')

# Main function to refine text
def refine_text(text, keyword_embeddings, stop_words, threshold=0.5):
    """
    Remove sentences that are similar to any keyword, then preprocess the rest.

    The text is split with sent_tokenize, each sentence is embedded with
    create_embeddings, and its cosine similarity to every keyword embedding is
    computed in one broadcast call. The kept sentences are joined with spaces
    and passed through preprocess_text.

    :param text: The raw input text.
    :param keyword_embeddings: One embedding per keyword, either as a 2-D tensor
                               or as a sequence of 1-D tensors.
    :param stop_words: Stop words forwarded to preprocess_text.
    :param threshold: A sentence is removed when its similarity to at least one
                      keyword is strictly greater than this value.
    :return: The preprocessed text built from the kept sentences.
    """
    import torch  # local import: this cell only imports torch.nn.functional

    # Normalise the keywords to a single tensor so one call covers all of them
    if torch.is_tensor(keyword_embeddings):
        keyword_matrix = keyword_embeddings
    else:
        keyword_list = list(keyword_embeddings)
        keyword_matrix = torch.stack(keyword_list) if keyword_list else None

    # With no keywords nothing can exceed the threshold, so every sentence is kept
    has_keywords = keyword_matrix is not None and keyword_matrix.shape[0] > 0

    filtered_sentences = []
    for sentence in sent_tokenize(text):
        if has_keywords:
            sentence_embedding = create_embeddings(sentence)
            # The (1, dim) sentence vector broadcasts against the keyword rows
            similarities = F.cosine_similarity(sentence_embedding.unsqueeze(0), keyword_matrix, dim=1)
            if bool((similarities > threshold).any()):
                continue
        filtered_sentences.append(sentence)

    processed_text = ' '.join(filtered_sentences)
    return preprocess_text(processed_text, stop_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **Keyword Extraction**

In [None]:
!pip install keybert
!pip install sentence_transformers



In [None]:
from keybert import KeyBERT
import torch

# Initialize KeyBERT model (no arguments are passed, so its defaults apply);
# extract_keywords() reads this module-level instance
kw_model = KeyBERT()

def extract_keywords(text, num_keywords=5):
    """
    Extract keywords from a text using KeyBERT and compute their embeddings.

    Keywords are single words (n-gram range (1, 1)) selected with MMR at
    diversity 0.7. Each keyword is embedded with the notebook's BERT model by
    averaging the last hidden states over its non-padding tokens.

    :param text: The text to extract keywords from.
    :param num_keywords: Number of keywords to extract.
    :return: A dictionary with keyword embeddings ('key_word_embeddings', one
             row per keyword) and the list of keywords ('keywords').
    """

    # Extract keywords from the text
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=num_keywords, use_mmr=True, diversity=0.7)

    # Extract just the keywords (first element of each tuple)
    extracted_keywords = [keyword[0] for keyword in keywords]

    # Nothing to embed: return an empty (0, hidden_size) tensor instead of
    # handing an empty batch to the tokenizer
    if not extracted_keywords:
        return {
            'key_word_embeddings': torch.empty((0, model.config.hidden_size), device=device),
            'keywords': extracted_keywords
        }

    # Tokenize and encode extracted keywords in a batch
    key_word_tokens = tokenizer(extracted_keywords, padding=True, return_tensors="pt").to(device)

    model.eval()
    with torch.no_grad():
        key_word_outputs = model(**key_word_tokens)

    # padding=True pads shorter keywords up to the longest one; the attention
    # mask keeps those padding positions out of each keyword's average
    hidden_states = key_word_outputs.last_hidden_state
    weights = key_word_tokens['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    key_word_embeddings = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    return {
        'key_word_embeddings': key_word_embeddings,
        'keywords': extracted_keywords
    }

## **Text Metrics**
**Total Words:** The total count of words in the text.

**Overall Lexical Diversity:** The ratio of unique words to the total number of words, providing a measure of the text's vocabulary variety.

**Average Sentence Lexical Diversity:** The average diversity of vocabulary used across all sentences in the text.

**Top Ten Most Frequent Words:** A list of the ten most commonly used words in the text, along with their frequencies.

**Total Number of Sentences:** The total sentence count of the text. When analysing processed text, this metric becomes redundant as there is no punctuation to split the text on.

In [None]:
import re
from collections import Counter
from nltk.tokenize import word_tokenize

def text_metrics(text):
    """
    Calculate various metrics from the given text.

    Words are lowercase runs of word characters (regex ``\\b\\w+\\b``). The same
    definition is used for the whole text and for each sentence, so punctuation
    is never counted as a word and "The"/"the" count as one word in both
    diversity figures.

    :param text: The input text to analyze
    :return: A dictionary with metrics including total words, overall lexical diversity,
             average sentence lexical diversity, top ten words, and number of sentences
    """
    word_pattern = re.compile(r'\b\w+\b')

    # Overall lexical diversity
    words = word_pattern.findall(text.lower())
    lexical_diversity = overall_lexical_diversity(words)
    num_words = len(words)

    # Sentence tokenization
    sentences = sent_tokenize(text)

    # Lexical diversity per sentence; sentences that contain no words are
    # left out of the average (they still count towards num_sentences)
    sentence_diversities = []
    for sentence in sentences:
        words_in_sentence = word_pattern.findall(sentence.lower())
        if words_in_sentence:
            sentence_diversities.append(overall_lexical_diversity(words_in_sentence))

    # Average lexical diversity of sentences
    avg_sentence_lexical_diversity = sum(sentence_diversities) / len(sentence_diversities) if sentence_diversities else 0

    # Top ten most frequent words
    top_ten_words = Counter(words).most_common(10)

    # Number of sentences
    num_sentences = len(sentences)

    return {
        'num_words': num_words,
        'lexical_diversity': lexical_diversity,
        'avg_sentence_lexical_diversity': avg_sentence_lexical_diversity,
        'top_ten_words': top_ten_words,
        'num_sentences': num_sentences
    }

In [None]:
def overall_lexical_diversity(words):
    """
    Compute the ratio of distinct words to total words.

    :param words: List of all words in the text.
    :return: Number of distinct words divided by the number of words,
             or 0 when the list is empty.
    """
    total = len(words)
    if total <= 0:
        return 0

    distinct = set(words)
    return len(distinct) / total

### **Formatting for Text Metrics**

In [None]:
def format_metrics(title, metrics):
    """
    Render a metrics dictionary as a multi-line report.

    :param title: Label placed in the report's header line.
    :param metrics: Dictionary with the keys 'num_words', 'num_sentences',
                    'lexical_diversity', 'avg_sentence_lexical_diversity' and
                    'top_ten_words' (a list of (word, frequency) pairs).
    :return: The report as a string ending with a newline.
    """
    ranked = metrics['top_ten_words']

    # Guard against texts that produced no words at all
    if not ranked:
        formatted_top_words = "None"
        highest_word, highest_freq = ("N/A", 0)
    else:
        formatted_top_words = ', '.join(word for word, _ in ranked)
        highest_word, highest_freq = ranked[0]  # first pair is shown as the top word

    # Ratios are reported as percentages
    overall_diversity_percentage = metrics['lexical_diversity'] * 100
    avg_sentence_diversity_percentage = metrics['avg_sentence_lexical_diversity'] * 100

    report_lines = [
        f"--------- Text Metrics for {title} ---------\n",
        f"Total Words: {metrics['num_words']}\n",
        f"Total Sentences: {metrics['num_sentences']}\n",
        f"Overall Lexical Diversity: {overall_diversity_percentage:.2f}%\n",
        f"Average Lexical Diversity of Sentences: {avg_sentence_diversity_percentage:.2f}%\n",
        f"Top Ten Most Frequent Words: {formatted_top_words}\n",
        f"Highest Frequency Word: '{highest_word}' (Frequency: {highest_freq})\n",
    ]
    return ''.join(report_lines)

## **Importing and Reading `TP001.txt` from URL and `austen-emma.txt` from NLTK corpora**

In [None]:
# -O pins the output name: re-running this cell overwrites tp001.txt instead of
# saving tp001.txt.1, tp001.txt.2, ... which open('tp001.txt') would never read
!wget -O tp001.txt 'https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt'

--2023-11-18 12:07:05--  https://raw.githubusercontent.com/scskalicky/LING-226-vuw/main/the-current/tp001.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220746 (216K) [text/plain]
Saving to: ‘tp001.txt.2’


2023-11-18 12:07:06 (6.40 MB/s) - ‘tp001.txt.2’ saved [220746/220746]



In [None]:
# Load every line of the downloaded file into memory
with open('tp001.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Keep only lines that contain a tab: the field after the first tab is the
# comment. Each stripped comment is followed by a single space.
tp001_text = "".join(
    line.split('\t')[1].strip() + " "
    for line in lines
    if '\t' in line
)

# Preview of the first 100 characters (displayed as the cell's output)
tp001_text[:100]

'... we need to work hard to make it happen 3d is better than other bands in the whole country a ban '

In [None]:
import nltk
from nltk.corpus import gutenberg

# Downloading gutenberg corpus (must run before gutenberg.raw() is called below)
nltk.download('gutenberg')

# Using Emma by Jane Austen 1816: read the whole file as one raw string
emma_text = gutenberg.raw('austen-emma.txt')
# Print the first 290 characters as a quick check of what was loaded
print(emma_text[:290])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.



[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


# **Experimentation**
The following experimentation section includes:
- An analysis and overview of metrics from both sample texts.  
- Visualization of the top ten words before and after processing.
- Analysis of Emma's overall lexical diversity before and after processing.

**Notes:** I have chosen to use NLTK's stopword list for preprocessing. I have used 'Emma by Jane Austen 1816' from the NLTK corpora and 'TP001 (Petrol cars should be banned by 2030)' from The Current.

### **Importing libraries and initializing stopwords set**
Required for preprocessing and visualization.

In [None]:
import nltk
from nltk.corpus import stopwords

# Download stopwords from NLTK
nltk.download('stopwords')

# Extra tokens removed alongside NLTK's English stop words
additional_stopwords = {'n', 'mr', '1816', 'mrs', 'miss'}

# One set holding the English list plus the extras
stop_words = set(stopwords.words('english')) | additional_stopwords
print(stop_words)

{'an', 'did', 'is', 'off', 'my', 'yourselves', 'out', 'hadn', 'can', 'now', 'hers', 'when', 'further', 'i', 'this', 'our', 'has', 'until', 'if', 'won', "mightn't", 'of', 'up', 'isn', 'mustn', 'nor', 'what', 'was', 'his', "you'd", 'm', 'herself', 'shouldn', 'only', 'between', 'yourself', 'that', 'so', 'have', 'which', "she's", 'were', 'being', 'all', 'these', "weren't", "isn't", 'more', 'at', 'from', 'll', "wasn't", 'wouldn', 'its', "shouldn't", 'she', 'ours', 'be', 'didn', 'he', "don't", 'where', "you'll", 'been', "hasn't", 'are', 'theirs', 'a', "aren't", 'mrs', 'needn', 'because', 'same', 'those', 'below', 'on', "needn't", 'themselves', 'again', 'some', "couldn't", 'you', 'does', 'into', 'just', 'will', 'against', 'too', 'during', 'ain', 'than', 'while', "hadn't", 'whom', 'it', 'their', 'under', "it's", 'by', 'then', 'there', 'doesn', 'd', 'don', 'me', 'they', 'doing', 'ourselves', 'for', 'having', 'with', 'each', 'or', 'her', 'over', "you're", "you've", 'as', 'yours', 'itself', 'how'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Analysis and overview of metrics from both sample texts**


In [None]:
# Analyzing raw texts
# --------------------

# Metrics dictionaries for the two untouched texts
emma_metrics, tp001_metrics = text_metrics(emma_text), text_metrics(tp001_text)

# Split each list of (word, count) pairs into parallel tuples for plotting
emma_top_ten_words, emma_frequencies = zip(*emma_metrics['top_ten_words'])
tp001_top_ten_words, tp001_frequencies = zip(*tp001_metrics['top_ten_words'])

# Sentence counts of the raw texts
emma_num_sentences, tp001_num_sentences = (
    emma_metrics['num_sentences'],
    tp001_metrics['num_sentences'],
)

# Report both texts, Emma first
for heading, raw_metrics in (
    ("Emma - Raw Text", emma_metrics),
    ("TP001 - Raw Text", tp001_metrics),
):
    print(format_metrics(heading, raw_metrics))

--------- Text Metrics for Emma - Raw Text ---------
Total Words: 161983
Total Sentences: 7493
Overall Lexical Diversity: 4.48%
Average Lexical Diversity of Sentences: 90.89%
Top Ten Most Frequent Words: to, the, and, of, i, a, it, her, was, she
Highest Frequency Word: 'to' (Frequency: 5239)

--------- Text Metrics for TP001 - Raw Text ---------
Total Words: 39065
Total Sentences: 1431
Overall Lexical Diversity: 12.06%
Average Lexical Diversity of Sentences: 88.95%
Top Ten Most Frequent Words: the, to, we, and, i, it, a, is, be, for
Highest Frequency Word: 'the' (Frequency: 1507)



In [None]:
# Extracting keywords for preprocessing
# "\033[93m" / "\033[92m" are ANSI colour escapes (yellow / green); "\033[0m" resets
print("\033[93mExtracting Keywords...\033[0m")

# Twenty keywords per text; each result holds the words and their embeddings
emma_keywords = extract_keywords(emma_text, 20)
print(emma_keywords['keywords'])

tp001_keywords = extract_keywords(tp001_text, 20)
print(tp001_keywords['keywords'])

# Keep only the embeddings; they are the keyword input to refine_text()
emma_keyword_embeddings = emma_keywords['key_word_embeddings']
tp001_keyword_embeddings = tp001_keywords['key_word_embeddings']

print("\033[92mExtraction Successful\n\033[0m")

[93mExtracting Keywords...[0m
['emma', 'inheritance', 'extenuations', 'beaufet', 'unison', 'disinterestedness', 'refined', 'bounded', 'sooner', 'slumbering', 'residence', 'confirmed', '_try_', 'fever', 'gallantry', 'thankfully', 'unobjectionable', 'doctrines', 'plotting', 'invitations']
['renewables', 'alternatives', 'tecnology', 'earthfirfuturegeb', '2023', 'limit', 'inalterbative', 'agree', 'scoot', 'subsidising', 'react', 'extraction', 'diseases', 'carscontribute', 'enable', 'havent', 'pedestrians', 'sure', 'ahhhhhhhhhhhh', 'average']
[92mExtraction Successful
[0m


In [None]:
# Analyzing preprocessed texts
# ----------------------------

# Signature reminder: refine_text(text, keyword_embeddings, stop_words, threshold=0.5)

# TP001: refine, measure, report
preprocessed_tp001 = refine_text(tp001_text, tp001_keyword_embeddings, stop_words)
preprocessed_tp001_metrics = text_metrics(preprocessed_tp001)
print(format_metrics("TP001 - Preprocessed Text", preprocessed_tp001_metrics))

# Emma: refine, measure, report
preprocessed_emma = refine_text(emma_text, emma_keyword_embeddings, stop_words)
preprocessed_emma_metrics = text_metrics(preprocessed_emma)
print(format_metrics("Emma - Preprocessed Text", preprocessed_emma_metrics))

# Extracting top ten words and their frequencies for preprocessed texts
# NOTE: zip(*...) raises ValueError on unpacking if a top-ten list is empty
preprocessed_emma_top_ten, preprocessed_emma_freq = zip(*preprocessed_emma_metrics['top_ten_words'])
preprocessed_tp001_top_ten, preprocessed_tp001_freq = zip(*preprocessed_tp001_metrics['top_ten_words'])

# Extract the number of sentences for preprocessed texts
preprocessed_emma_num_sentences = preprocessed_emma_metrics['num_sentences']
preprocessed_tp001_num_sentences = preprocessed_tp001_metrics['num_sentences']

--------- Text Metrics for TP001 - Preprocessed Text ---------
Total Words: 1255
Total Sentences: 1
Overall Lexical Diversity: 52.19%
Average Lexical Diversity of Sentences: 52.47%
Top Ten Most Frequent Words: cars, need, change, petrol, think, would, future, time, electric, good
Highest Frequency Word: 'cars' (Frequency: 22)



## **Visualization of The Top Ten Words with Their Frequencies Before and After Processing**

The visual comparison of word frequencies before and after text processing illustrates the shift from generic to specific language elements, informing the thematic interpretation of the text.


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# 2x2 grid: top row shows the raw texts, bottom row the preprocessed ones
panel_titles = (
    'Emma Before Processing',
    '"Petrol cars should be banned by 2030" Before Processing',
    'Emma After Processing',
    '"Petrol cars should be banned by 2030" After Processing',
)
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

# (row, col, words, frequencies) for each panel, in the order they are drawn
panels = [
    (1, 1, emma_top_ten_words, emma_frequencies),                  # Original Emma
    (1, 2, tp001_top_ten_words, tp001_frequencies),                # Original TP001
    (2, 1, preprocessed_emma_top_ten, preprocessed_emma_freq),     # Preprocessed Emma
    (2, 2, preprocessed_tp001_top_ten, preprocessed_tp001_freq),   # Preprocessed TP001
]
for panel_row, panel_col, panel_words, panel_counts in panels:
    fig.add_trace(
        go.Bar(x=panel_words, y=panel_counts),
        row=panel_row, col=panel_col
    )

# Overall title and size; no legend
fig.update_layout(
    title_text='Top Ten Words and Their Frequencies',
    showlegend=False,
    height=800, width=1200
)

# Axis labels: every panel's x-axis, then the y-axes column by column
for panel_row, panel_col, _, _ in panels:
    fig.update_xaxes(title_text='Words', row=panel_row, col=panel_col)
for panel_col in (1, 2):
    fig.update_yaxes(title_text='Occurrence Frequency', col=panel_col)

# Show the figure
fig.show()

## **Comparative Analysis of Overall Lexical Diversity in Processed and Unprocessed Versions of "Emma" by Jane Austen.**

The results of the analysis below show the overall lexical diversity of Jane Austen's "Emma" in both its processed and unprocessed forms as the batch size (number of sentences per batch) increases. It demonstrates how lexical diversity decreases as the size of the text increases.

**Processed Overall Lexical Diversity (Blue):** As the batch size increases, we observe a gradual decrease in lexical diversity for the processed version. This trend indicates that when analyzing larger portions of the text together, the processed version becomes less lexically diverse. This may be attributed to the removal of high-frequency words during processing, which results in a more focused vocabulary.

**Unprocessed Overall Lexical Diversity (Red):** The red line represents the lexical diversity of the original, unprocessed text. Similarly, as the batch size increases, we also see a decrease in lexical diversity for the unprocessed version. This decrease suggests that even in the unprocessed text, certain words become more prominent and repetitive when analyzing larger sections of the text. However, the unprocessed text tends to maintain a higher lexical diversity compared to the processed version, as it retains all words without filtering.

In summary, the overall lexical diversity highlights how text processing affects the diversity of words in a text, particularly as the scale increases. While both processed and unprocessed texts exhibit a decrease in lexical diversity with larger batch sizes, the processed version tends to show a more pronounced decrease due to the removal of high-frequency words. Understanding these trends can aid in choosing the appropriate preprocessing strategy based on the specific goals of text analysis.

In [None]:
increment = 50  # n sentences per increment
# Prefix lengths 1, 1 + increment, ... below the raw sentence count
batch_sizes = list(range(1, emma_num_sentences, increment))

# Split the text into sentences
sentences = sent_tokenize(emma_text)

overall_ld_unprocessed = []
overall_ld_processed = []

# For every prefix of the text, measure diversity before and after preprocessing
for batch_size in batch_sizes:
    unprocessed = ' '.join(sentences[:batch_size])
    processed = preprocess_text(unprocessed, stop_words)

    overall_ld_unprocessed.append(
        overall_lexical_diversity(re.findall(r'\b\w+\b', unprocessed.lower()))
    )
    overall_ld_processed.append(
        overall_lexical_diversity(re.findall(r'\b\w+\b', processed.lower()))
    )

In [None]:
# X-axis values: the number of sentences in each prefix
batch_size_labels = list(batch_sizes)

# Diversity ratios expressed as percentages
processed_lex_div = [ld * 100 for ld in overall_ld_processed]
unprocessed_lex_div = [ld * 100 for ld in overall_ld_unprocessed]

# One line per series; the processed series is added first
trace1 = go.Scatter(
    name='Overall Lexical Diversity (Processed)',
    mode='lines',
    x=batch_size_labels,
    y=processed_lex_div,
)
trace2 = go.Scatter(
    name='Overall Lexical Diversity (Unprocessed)',
    mode='lines',
    x=batch_size_labels,
    y=unprocessed_lex_div,
)

# Titles for the plot and both axes
layout = go.Layout(
    title='Overall Lexical Diversity over Increments of ' + str(increment) + ' Sentences',
    xaxis={'title': 'Number of Sentences'},
    yaxis={'title': 'Lexical Diversity (%)'},
)

# Assemble and display the figure
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

# Text reports for the raw and the preprocessed Emma metrics
print("\n" + format_metrics("Emma (Unprocessed)", emma_metrics) + "\n")
print(format_metrics("Emma (Processed)", preprocessed_emma_metrics) + "\n")