In [1]:
!pip install transformers datasets evaluate scikit-learn


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.2/491.2 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[

In [2]:
# Mount Google Drive at /content/drive (Colab only); DATA_PATH below points
# into this mount, so the dataset files are read from Drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**We copy the helper functions we need from our repository.**

In [1]:
import glob
import html
import os

# The repository version reads this value with `from config import DATA_PATH`;
# here it is hard-coded to the Google Drive folder so that the notebook is
# self-contained.
DATA_PATH = "/content/drive/MyDrive/NLP_PROJECT2"




# Column delimiter used by parse_file: the data files are tab-separated.
SEPARATOR = "\t"


def clean_text(text):
    """
    Normalise one raw piece of text: collapse doubled quotes, decode HTML
    entities and squeeze runs of whitespace into single spaces.

    Arguments:
        text (str): a string of text

    Returns: (str): the "cleaned" text

    """
    # Trailing spaces, tabs and newlines are dropped first.
    cleaned = text.rstrip()

    # Doubled quotes (""something"") are an escaping artefact of the data files.
    if '""' in cleaned:
        # A pair of wrapping quotes around the whole text is removed ...
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]
        # ... then backslash-escaped and plain doubled quotes become a single ".
        cleaned = cleaned.replace('\\""', '"').replace('""', '"')

    # A second pass, because the replacements above can leave a \"" behind.
    cleaned = cleaned.replace('\\""', '"')

    # Decode HTML entities into characters, e.g. &lt; -> <
    unescaped = html.unescape(cleaned)

    # split() drops every run of whitespace; join puts back exactly one space.
    return ' '.join(unescaped.split())


def parse_file(file):
    """
    Read a tab-separated file and return a dictionary of the data, in the format:
    tweet_id: (sentiment, text)

    Arguments:
        file (str): path of the file to read (decoded as utf-8)

    Returns: (dict): maps the first column (tweet id) to a tuple of the second
        column (sentiment) and the cleaned text built from all remaining
        columns. If an id occurs more than once, the last occurrence wins.

    Raises:
        IndexError: if a non-blank line has fewer than two columns
    """

    data = {}
    # The context manager closes the file handle even if a line fails to parse.
    with open(file, "r", encoding="utf-8") as handle:
        for line in handle:
            stripped = line.rstrip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record; without
                # this guard they would raise IndexError on columns[1].
                continue
            columns = stripped.split(SEPARATOR)  # one value per tab-separated column
            tweet_id = columns[0]
            sentiment = columns[1]
            # Everything from the 3rd column on is text; join it back into one
            # string (it may itself have contained tabs) and clean it.
            text = clean_text(" ".join(columns[2:]))
            data[tweet_id] = (sentiment, text)
    return data


def load_from_dir(path):
    """
    Parse every .tsv and .txt file found in a folder and its subfolders.

    Arguments:
        path (str): the folder to search recursively

    Returns: (list): the (sentiment, text) tuples of all parsed files, with
        duplicate tweet ids collapsed to a single entry

    """

    # Search inside folders and subfolders. glob returns matches in an
    # arbitrary, filesystem-dependent order, so each list is sorted: this makes
    # both the order of the result and the duplicate that wins reproducible.
    files = []
    for pattern in ("/**/*.tsv", "/**/*.txt"):
        files.extend(sorted(glob.glob(path + pattern, recursive=True)))

    # A dict keyed by tweet id removes duplicates: a later file overwrites an
    # earlier entry that has the same id.
    data = {}
    for file in files:
        data.update(parse_file(file))

    # The ids are no longer needed, only the (sentiment, text) tuples.
    return list(data.values())





def load_Semeval2017A():
    """
    Loads data from dataset Semeval2017A

    The training examples are read from DATA_PATH/Semeval2017A/train_dev and
    the test examples from DATA_PATH/Semeval2017A/gold.

    Returns: (tuple): X_train, y_train, X_test, y_test, where the X lists hold
        the texts and the y lists hold the matching sentiment labels
    """
    train_pairs = load_from_dir(os.path.join(DATA_PATH, "Semeval2017A/train_dev"))
    test_pairs = load_from_dir(os.path.join(DATA_PATH, "Semeval2017A/gold"))

    # Each pair is (sentiment, text): the text is the input, the sentiment the label.
    X_train = [text for _sentiment, text in train_pairs]
    y_train = [sentiment for sentiment, _text in train_pairs]
    X_test = [text for _sentiment, text in test_pairs]
    y_test = [sentiment for sentiment, _text in test_pairs]

    return X_train, y_train, X_test, y_test





def load_MR(split=5000):
    """
    Loads data from dataset MR (DATA_PATH/MR/rt-polarity.pos and .neg)

    Arguments:
        split (int): number of leading lines of EACH polarity file that go to
            the training set; the remaining lines form the test set.
            Defaults to 5000, the value that was previously hard-coded.

    Returns: (tuple): X_train, y_train, X_test, y_test, where the X lists hold
        the stripped lines and the y lists hold "positive" / "negative".
        In both sets all positive examples come before the negative ones.
    """

    def _read_lines(relative_path):
        # Read one polarity file; undecodable bytes are replaced, not raised.
        with open(os.path.join(DATA_PATH, relative_path), encoding="utf-8", errors="replace") as f:
            return [line.strip() for line in f]

    pos = _read_lines("MR/rt-polarity.pos")
    neg = _read_lines("MR/rt-polarity.neg")

    pos_labels = ["positive"] * len(pos)
    neg_labels = ["negative"] * len(neg)

    X_train = pos[:split] + neg[:split]
    y_train = pos_labels[:split] + neg_labels[:split]

    X_test = pos[split:] + neg[split:]
    y_test = pos_labels[split:] + neg_labels[split:]

    return X_train, y_train, X_test, y_test

# MAIN CODE


In [3]:
import numpy as np
import evaluate
import torch
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

In [12]:
# ==== METRICS ====
# The accuracy metric is loaded once and reused by every evaluation call.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass (this function is passed to Trainer as
    `compute_metrics`).

    Arguments:
        eval_pred: a pair that unpacks into (logits, labels)

    Returns: the result of the accuracy metric for the predicted classes
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_ids, references=labels)


# ==== TOKENIZATION ====
def tokenize_function(examples):
    """
    Tokenize the "text" column of a batch with the module-level `tokenizer`
    (the one most recently loaded by a model cell).

    Every example is truncated and padded to exactly 64 tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=64,
    )

def prepare_dataset(X, y):
    """
    Wrap texts and labels into a `Dataset` with the columns 'text' and 'label'.

    Arguments:
        X: the texts
        y: the labels, in the same order as X
    """
    columns = {'text': X, 'label': y}
    return Dataset.from_dict(columns)

# **We will start with the MR dataset.**



In [5]:
# ==== LOAD DATA ====
DATASET = "MR"
X_train, y_train, X_test, y_test = load_MR()

# Label encoding: fit on the distinct training labels, then replace the string
# labels of both splits by their integer codes.
le = LabelEncoder().fit(list(set(y_train)))
y_train, y_test = le.transform(y_train), le.transform(y_test)
n_classes = len(le.classes_)  # size of the classification head loaded below

# Prepare datasets ('text' / 'label' columns) for tokenization
train_set, test_set = prepare_dataset(X_train, y_train), prepare_dataset(X_test, y_test)




**First model: siebert/sentiment-roberta-large-english**




In [6]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'siebert/sentiment-roberta-large-english'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6898,0.676394,0.7
2,0.6866,0.64782,0.81
3,0.3455,0.666962,0.82
4,0.2937,0.324012,0.91
5,0.2481,0.403266,0.89


Model: siebert/sentiment-roberta-large-english

Final Test Accuracy: 0.8625


**Second model: distilbert-base-uncased-finetuned-sst-2-english**

In [7]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5326,0.397851,0.87
2,0.2819,0.451669,0.87
3,0.0969,0.547714,0.87
4,0.0012,0.595341,0.87
5,0.0008,0.60652,0.87


Model: distilbert-base-uncased-finetuned-sst-2-english

Final Test Accuracy: 0.8958


**Third model: textattack/bert-base-uncased-SST-2**

In [8]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'textattack/bert-base-uncased-SST-2'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4849,0.48236,0.84
2,0.2777,0.408721,0.87
3,0.0589,0.478676,0.91
4,0.0008,0.576896,0.88
5,0.0005,0.592327,0.87


Model: textattack/bert-base-uncased-SST-2

Final Test Accuracy: 0.8837


# **We will continue with the SemEval 2017 Task 4A dataset.**

In [9]:
# ==== LOAD DATA ====
DATASET = "Semeval2017A"
X_train, y_train, X_test, y_test = load_Semeval2017A()

# Label encoding: fit on the distinct training labels, then replace the string
# labels of both splits by their integer codes.
le = LabelEncoder().fit(list(set(y_train)))
y_train, y_test = le.transform(y_train), le.transform(y_test)
n_classes = len(le.classes_)  # size of the classification head loaded below

# Prepare datasets ('text' / 'label' columns) for tokenization
train_set, test_set = prepare_dataset(X_train, y_train), prepare_dataset(X_test, y_test)

**First model: cardiffnlp/twitter-roberta-base-sentiment**



In [13]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.646,0.721568,0.67
2,0.5291,1.054714,0.62
3,0.1427,1.75302,0.58
4,0.0102,2.084121,0.58
5,0.0341,2.04836,0.58


Model: cardiffnlp/twitter-roberta-base-sentiment

Final Test Accuracy: 0.6903


**Second model: finiteautomata/bertweet-base-sentiment-analysis**

In [14]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'finiteautomata/bertweet-base-sentiment-analysis'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")

tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5085,0.943131,0.65
2,0.3087,0.682151,0.76
3,0.2628,1.058799,0.72
4,0.0257,1.020295,0.74
5,0.01,1.042668,0.74


Model: finiteautomata/bertweet-base-sentiment-analysis

Final Test Accuracy: 0.7077


**Third model: yiyanghkust/finbert-tone**

In [15]:
import torch
torch.cuda.empty_cache()



PRETRAINED_MODEL = 'yiyanghkust/finbert-tone'


# Load tokenizer & model
# (tokenize_function reads the module-level `tokenizer`, so it must be set
# before the datasets are mapped)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# NOTE(review): the label ids used for training come from LabelEncoder, which
# orders the class names alphabetically. This checkpoint's own label order may
# be different -- compare model.config.id2label with le.classes_ before
# interpreting the low accuracy of this model.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=n_classes)

# Tokenize data
tokenized_train_set = train_set.map(tokenize_function, batched=True)
tokenized_test_set = test_set.map(tokenize_function, batched=True)

# Subsample for debugging. NOTE: these small subsets are what the Trainer below
# actually uses: the model is fine-tuned on n_samples training examples and the
# per-epoch accuracy is measured on n_samples test examples.
n_samples = 100
small_train_dataset = tokenized_train_set.shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(range(n_samples))


# ==== TRAINING SETUP ====
args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="epoch",
    # Disable wandb and every other logging integration (wandb would ask for an
    # API key). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== FINAL EVALUATION ON FULL TEST SET ====
predictions = trainer.predict(tokenized_test_set)
pred_labels = np.argmax(predictions.predictions, axis=-1)  # class with the largest logit
acc = accuracy_score(y_test, pred_labels)
print(f"Model: {PRETRAINED_MODEL}")
print(f"\nFinal Test Accuracy: {acc:.4f}")

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,2.8248,1.337254,0.43
2,1.0735,1.67164,0.28
3,0.6868,1.825149,0.44
4,0.1335,2.011916,0.5
5,0.0875,2.12194,0.42


Model: yiyanghkust/finbert-tone

Final Test Accuracy: 0.4141
