<a href="https://colab.research.google.com/github/RamonSaturninoM/NLP_TextSegmentation/blob/master/nlpSegmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CSC 446/646: Natural Language Processing, Assignment 1

In [55]:
import zipfile
import os
import spacy
import nltk
import numpy as np
from textblob import TextBlob
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
nlp = spacy.load("en_core_web_sm")

Question 1: Rule-based NLP

In [62]:
# unzip dataset
def extract_dataset(path, extract_to="dataset"):
  with zipfile.ZipFile(path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
  files = os.listdir(extract_to)
  return [os.path.join(extract_to, file) for file in files if file.endswith('.txt')]

  """
  To extract the files, I zipped all .txt files into a single folder for better management in my program.
  comments1k.zip might not work if trying to use this function because it have 2 folders after extracting.
  """

In [None]:
# run the rule based nlp
def process_reviews(files):
    total_sentences = 0
    total_tokens = 0
    total_words_no_stop_punct = 0
    total_comments = 0

    ps = PorterStemmer() # start stemming
    lemmatizer = WordNetLemmatizer()  # start lemmatization
    stop_words = set(stopwords.words('english'))

    stemmed_words_l = []
    lemmatized_words_l = []
    original_words_l = []

    for file_path in files:
        with open(file_path, 'r', encoding='utf-8') as f:
            comments = f.read().strip().split("\n")  # reads entire file and splits by newlines

        comments = [comment.strip() for comment in comments if comment.strip()]  # remove empty lines

        total_comments += len(comments)  # count total comments

        for comment in comments:
            sentences = sent_tokenize(comment)  # sentence splitting
            tokens = word_tokenize(comment)  # tokenization
            words_filtered = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]  # remove the stop words condition

            total_sentences += len(sentences)
            total_tokens += len(tokens)
            total_words_no_stop_punct += len(words_filtered)

            # process for stemming and lemmatization
            stemmed_words = [ps.stem(word) for word in words_filtered]
            lemmatized_words = [lemmatizer.lemmatize(word) for word in words_filtered]

    # store first words for observation and comparison
    original_words_l.extend(words_filtered[:8])
    stemmed_words_l.extend(stemmed_words[:8])
    lemmatized_words_l.extend(lemmatized_words[:8])

    # compute overall averages
    avg_sentences = total_sentences / total_comments if total_comments > 0 else 0
    avg_tokens = total_tokens / total_comments if total_comments > 0 else 0
    avg_words_no_stop_punct = total_words_no_stop_punct / total_comments if total_comments > 0 else 0


    # print results
    print(f"\nTotal reviews: {total_comments:.2f}")
    print(f"Avg. sentences per comment: {avg_sentences:.2f}")
    print(f"Avg. tokens per comment: {avg_tokens:.2f}")
    print(f"Avg. words (no stop word/punctuation): {avg_words_no_stop_punct:.2f}")

    print("\nDifferences between Lemmatization and Stemming: ")
    print(f"Original Words: {original_words_l}")
    print(f"Stemmed Words: {stemmed_words_l}")
    print(f"Lemmatized Words: {lemmatized_words_l}")

    return comments




In [None]:
data_set = extract_dataset('/content/reviews.zip')
print(f"Extracted files: {data_set}")

if data_set:
  process_reviews(data_set)


Extracted files: ['dataset/501_10.txt', 'dataset/638_10.txt', 'dataset/534_10.txt', 'dataset/142_8.txt', 'dataset/477_10.txt', 'dataset/346_10.txt', 'dataset/429_10.txt', 'dataset/926_7.txt', 'dataset/970_10.txt', 'dataset/622_10.txt', 'dataset/101_8.txt', 'dataset/897_10.txt', 'dataset/59_7.txt', 'dataset/676_8.txt', 'dataset/507_10.txt', 'dataset/629_9.txt', 'dataset/928_10.txt', 'dataset/143_7.txt', 'dataset/997_7.txt', 'dataset/6_10.txt', 'dataset/961_9.txt', 'dataset/574_7.txt', 'dataset/90_7.txt', 'dataset/923_9.txt', 'dataset/687_9.txt', 'dataset/313_10.txt', 'dataset/602_10.txt', 'dataset/387_8.txt', 'dataset/338_10.txt', 'dataset/770_10.txt', 'dataset/606_10.txt', 'dataset/92_9.txt', 'dataset/107_10.txt', 'dataset/761_10.txt', 'dataset/731_9.txt', 'dataset/662_8.txt', 'dataset/62_10.txt', 'dataset/120_8.txt', 'dataset/762_9.txt', 'dataset/331_10.txt', 'dataset/439_9.txt', 'dataset/122_9.txt', 'dataset/908_8.txt', 'dataset/390_10.txt', 'dataset/157_9.txt', 'dataset/236_9.txt', 

Question 2: Machine Learning Basics

In [None]:
def train_neural_network(x1, x2, t, w, eta=0.1):
  """
  Parameters:
    - Inputs: x1, x2
    - Target: t
    - Weights: w
    - Learning rate: eta
  """

  w1, w2, w3, w4, w5, w6 = w

  # feed-forward process
  h1 = (w1*x1) + (w2*x2)
  h2 = (w3*x1) + (w4*x2)
  y  = (w5*h1) + (w6*h2)

  # compute error after the first epoch
  E = 0.5 * (y - t) **2

  # gradients
  dy = y - t

  # output layer
  dw5 = dy * h1
  dw6 = dy * h2

  # hidden layers weights
  dh1 = w5 * dy
  dh2 = w6 * dy

  dw1 = dh1 * x1
  dw2 = dh1 * x2
  dw3 = dh2 * x1
  dw4 = dh2 * x2

  # update weights based on gradient descent
  w1 -= eta * dw1
  w2 -= eta * dw2
  w3 -= eta * dw3
  w4 -= eta * dw4
  w5 -= eta * dw5
  w6 -= eta * dw6

  updated_weights = [w1, w2, w3, w4, w5, w6]

  # run new error
  h1_new = (w1*x1) + (w2*x2)
  h2_new = (w3*x1) + (w4*x2)
  y_new  = (w5*h1_new) + (w6*h2_new)
  E_new = 0.5 * (y_new - t) **2

  return updated_weights, E, E_new

In [None]:
x1, x2 = 1, 0.5
t = 4
weights = [0.5, 1.5, 2.3, 3, 1, 1]

updated_weights, error, new_error = train_neural_network(x1, x2, t, weights)
print(f"Initial Error: {error}")
print(f"Updated Error After Weight Update: {new_error}")
print(f"\nInitial Weights: {weights}")
print(f"Updated Weights: {updated_weights}")

if new_error < error:
  print("\nThe error decreased after the weight update.")
else:
  print("\nThe error did not decrease after the weight update.")

Initial Error: 0.5512499999999998
Updated Error After Weight Update: 0.3388021092883306

Initial Weights: [0.5, 1.5, 2.3, 3, 1, 1]
Updated Weights: [0.395, 1.4475, 2.195, 2.9475, 0.86875, 0.601]

The error decreased after the weight update.


Question 3: Text Annotation
Part 1: Entity and Sentiment Annotation

In [None]:
# entity annotation
def named_entity_annotation(text):
  doc = nlp(text)
  entities = [(ent.text, ent.label_) for ent in doc.ents]

  with open('ner_annotations.txt', 'w') as f:
    for entity, label in entities:
      f.write(f"{entity} - {label}\n")

  return entities

In [None]:
# sentiment annotation
def sentiment_annotation(text):
  blob = TextBlob(text)
  sentiment_score = blob.sentiment.polarity
  sentiment_label = "Positive" if sentiment_score > 0 else "Negative" if sentiment_score < 0 else "Neutral"

  with open('sentiment_annotations.txt', 'w') as f:
    f.write(f"Sentiment: {sentiment_label} Score: {sentiment_score:.3f}\n")

  return sentiment_label, sentiment_score

In [54]:
# provided texts
ner_text = "Barack Obama was the 44th President of the United States. He was born in Hawaii and studied law at Harvard University."
sentiment_text = """De Niro has the ability to make every role he portrays into acting gold.
    He gives a great performance in this film and there is a great scene where he has to take his
    father to a home for elderly people because he can't care for him anymore that will break your heart.
    I will say you won't see much bette acting anywhere."""

ner_results = named_entity_annotation(ner_text)
print("\nNamed Entity Recognition Results:")
for entity, label in ner_results:
  print(f"{entity} - {label}")

sentiment_label, sentiment_score = sentiment_annotation(sentiment_text)
print("\nSentiment Analysis Results:")
print(f"Sentiment: {sentiment_label}, Score: {sentiment_score:.3f}")


Named Entity Recognition Results:
Barack Obama - PERSON
44th - ORDINAL
the United States - GPE
Hawaii - GPE
Harvard University - ORG

Sentiment Analysis Results:
Sentiment: Positive, Score: 0.360


Part 2: Active Learning

In [56]:
# Generate a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
# Split the dataset into initial training set and pool set
X_train, X_pool, y_train, y_pool = train_test_split(X, y, test_size=0.9, random_state=42)
# Initialize the active learning loop
iterations = 10
batch_size = 10
model = LogisticRegression(random_state=42)
for i in range(iterations):
 print("Iteration {}:".format(i+1))

 # Train the model on the current training set
 model.fit(X_train, y_train)

 # Predict the labels of the unlabeled instances in the pool set
 y_pool_pred = model.predict(X_pool)

 ### below
 y_pool_prob = model.predict_proba(X_pool)
 entropy = -np.sum(y_pool_prob * np.log(y_pool_prob), axis=1)
 query_idx = np.argsort(entropy)[-batch_size:]
 ### above

 X_query = X_pool[query_idx]
 y_query = y_pool[query_idx]
 # Add the labeled instances to the training set and remove them from the pool set
 X_train = np.concatenate([X_train, X_query])
 y_train = np.concatenate([y_train, y_query])
 X_pool = np.delete(X_pool, query_idx, axis=0)
 y_pool = np.delete(y_pool, query_idx)
 # Compute and print the accuracy of the model on the test set
 y_test_pred = model.predict(X_pool)
 accuracy = accuracy_score(y_pool, y_test_pred)
 print("Accuracy: {:.3f}\n".format(accuracy))

Iteration 1:
Accuracy: 0.828

Iteration 2:
Accuracy: 0.834

Iteration 3:
Accuracy: 0.851

Iteration 4:
Accuracy: 0.864

Iteration 5:
Accuracy: 0.874

Iteration 6:
Accuracy: 0.879

Iteration 7:
Accuracy: 0.881

Iteration 8:
Accuracy: 0.883

Iteration 9:
Accuracy: 0.886

Iteration 10:
Accuracy: 0.894



In [60]:
 """
 2a)
 y_pool_prob = model.predict_proba(X_pool) # ----> This line of code gets probability scores for each instance in pool set.
 entropy = -np.sum(y_pool_prob * np.log(y_pool_prob), axis=1) # ----> Takes measurement of uncertainty of each prediction.
 query_idx = np.argsort(entropy)[-batch_size:] # ----> Selects the batch_size samples with highest uncertainty and let the model learn from them.

  ** This approach focuses on samples where the model is most uncertain. With this approach it helps improve the model's performance.
  Instead of labeling easy samples, it takes the most uncertain ones.
 """

"\ny_pool_prob = model.predict_proba(X_pool) # ----> This line of code gets probability scores for each instance in pool set.\nentropy = -np.sum(y_pool_prob * np.log(y_pool_prob), axis=1) # ----> Takes measurement of uncertainty of each prediction.\nquery_idx = np.argsort(entropy)[-batch_size:] # ----> Selects the batch_size samples with highest uncertainty and let the model learn from them.\n\n ** This approach focuses on samples where the model is most uncertain. With this approach it helps improve the model's performance.\n Instead of labeling easy samples, it takes the most uncertain ones.\n"

In [59]:
### Alternative Strategy: Least Confidence Sampling

y_pool_prob = model.predict_proba(X_pool)
max_confidence = np.max(y_pool_prob, axis=1)  # get max probability for each instance
query_idx = np.argsort(max_confidence)[:batch_size]  # select lowest confidence samples
"""
2a) ** The original approach (Entropy Sampling) is better when the dataset is imbalanced because it ensures the model focuses on
different hard cases.
The new strategy proposed (Least Confidence Sampling) is faster and works better when computational efficiency is a concern.

2b) ** It depends on the batch_size, in this case would be 10. The pros of having smaller labelled reviews is that there is a
more focused learning, it's also less work for the annotator. In terms of cons, it would require more iterations to achieve
good performance, which will make the process a bit more slower.

The pros of having a larger labelled reviews is that there is a faster convergence because the model can learn from a larger set
of labelled data, hence fewer iterations will be needed. In the cons we might have more manual work, which involves more
annotations per iteration.

"""

'\n** The original approach (Entropy Sampling) is better when the dataset is imbalanced because it ensures the model focuses on \ndifferent hard cases.\nThe new strategy proposed (Least Confidence Sampling) is faster and works better when computational efficiency is a concern.\n\n'