In [3]:
#https://huggingface.co/docs/transformers/model_doc/roberta

## 1. Import libraries 

In [4]:
#!pip install transformers==3.0.2

In [5]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

  from pandas.core import (


In [6]:
# Choose the compute device once: the GPU when CUDA is usable, the CPU otherwise.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## 2. Import dataset

In [7]:
import re

# Parse the fastText-formatted file into two parallel lists: one sentiment
# code and one review text per matching line.
sentiments = []
phrases = []

# Each line looks like "__label__<digit> <review text>".
# __label__1 is a negative review (stored as '0'); __label__2 is positive ('1').
with open('train.ft.txt', 'r', encoding='utf-8') as file:
    for line in file:
        match = re.match(r'(__label__\d) (.*)', line)
        if match is None:
            # Lines that do not follow the expected layout are ignored.
            continue
        raw_label, review = match.groups()
        sentiments.append(raw_label.replace('__label__1', '0').replace('__label__2', '1'))
        phrases.append(review)

# Assemble the DataFrame from the two lists.
train = pd.DataFrame({'sentiment': sentiments, 'phrase': phrases})

# Quick look at the first rows to confirm the parse worked.
print(train.head())

  sentiment                                             phrase
0         1  Stuning even for the non-gamer: This sound tra...
1         1  The best soundtrack ever to anything.: I'm rea...
2         1  Amazing!: This soundtrack is my favorite music...
3         1  Excellent Soundtrack: I truly like this soundt...
4         1  Remember, Pull Your Jaw Off The Floor After He...


In [8]:
train.shape

(3600000, 2)

In [9]:
train.head()

Unnamed: 0,sentiment,phrase
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [10]:
# Again 0 stands for negative, 1 means positive
train['sentiment'].unique()

array(['1', '0'], dtype=object)

In [11]:
train.describe()

Unnamed: 0,sentiment,phrase
count,3600000,3600000
unique,2,3600000
top,1,Stuning even for the non-gamer: This sound tra...
freq,1800000,1


In [12]:
# Import test

import re

# Parse the fastText-formatted test file into two parallel lists: one
# sentiment code and one review text per matching line.
sentiments = []
phrases = []

# Each line looks like "__label__<digit> <review text>".
# __label__1 is a negative review (stored as '0'); __label__2 is positive ('1').
with open('test.ft.txt', 'r', encoding='utf-8') as file:
    for line in file:
        match = re.match(r'(__label__\d) (.*)', line)
        if match is None:
            # Lines that do not follow the expected layout are ignored.
            continue
        raw_label, review = match.groups()
        sentiments.append(raw_label.replace('__label__1', '0').replace('__label__2', '1'))
        phrases.append(review)

# Assemble the DataFrame from the two lists.
test = pd.DataFrame({'sentiment': sentiments, 'phrase': phrases})

# Quick look at the first rows to confirm the parse worked.
print(test.head())

  sentiment                                             phrase
0         1  Great CD: My lovely Pat has one of the GREAT v...
1         1  One of the best game music soundtracks - for a...
2         0  Batteries died within a year ...: I bought thi...
3         1  works fine, but Maha Energy is better: Check o...
4         1  Great for the non-audiophile: Reviewed quite a...


In [13]:
test.shape

(400000, 2)

In [14]:
test.head()

Unnamed: 0,sentiment,phrase
0,1,Great CD: My lovely Pat has one of the GREAT v...
1,1,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...: I bought thi...
3,1,"works fine, but Maha Energy is better: Check o..."
4,1,Great for the non-audiophile: Reviewed quite a...


In [15]:
test['sentiment'].unique()

array(['1', '0'], dtype=object)

## 3. Preparing the Dataset and Dataloader

I will start by defining a few key variables that will be used later during the training/fine-tuning stage, followed by the creation of the Dataset class, which defines how the text is pre-processed before it is sent to the neural network. I will also define the Dataloader that will feed the data in batches to the neural network for training and processing. Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to the neural network. For further reading on Dataset and Dataloader, see the PyTorch documentation.

SentimentData Dataset Class

This class is defined to accept the Dataframe as input and generate tokenized output that is used by the Roberta model for training.
I am using the Roberta tokenizer to tokenize the data in the `phrase` column of the dataframe.
The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids, attention_mask

To read further into the tokenizer, refer to this document
target is the encoded sentiment label of the review (0 = negative, 1 = positive).
The SentimentData class is used to create 2 datasets, for training and for validation.
Training Dataset is used to fine tune the model: the 3,600,000 reviews loaded from train.ft.txt (about 90% of all reviews; the remaining 400,000 come from test.ft.txt)
Validation Dataset is used to evaluate the performance of the model. The model has not seen this data during training.

Dataloader
The Dataloader is used to create the training and validation dataloaders that load data into the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded into memory at once, hence the amount of data loaded into memory and then passed to the neural network needs to be controlled.
This control is achieved using the parameters such as batch_size and max_len.
Training and Validation dataloaders are used in the training and validation part of the flow respectively



In [16]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256  # max_length handed to the tokenizer inside SentimentData
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the testing DataLoader
# EPOCHS = 1
LEARNING_RATE = 1e-05  # NOTE(review): not referenced by the visible cells; the optimizers below hard-code lr=5e-5
# NOTE(review): roberta-base is presumably a cased checkpoint, so `do_lower_case=True`
# looks unintended, and `truncation` is normally an encode-time argument rather than
# a constructor one -- confirm both keyword arguments are actually needed here.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [17]:
class SentimentData(Dataset):
    """Map-style dataset that tokenizes review text for the RoBERTa classifier.

    Args:
        dataframe: frame with a ``phrase`` column (review text) and a
            ``sentiment`` column (label; numeric or a numeric string such as
            ``'0'`` / ``'1'``).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded review is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.phrase
        self.targets = self.data.sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded review and its label at position ``index``."""
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels when the frame has a default index.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding=` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Without truncation, reviews longer than max_len yield longer
            # sequences that cannot be stacked into a batch.
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # The sentiment column holds the strings '0'/'1'; torch.tensor cannot
        # build a tensor from a str, so convert to a number first.
        target = float(self.targets.iloc[index])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float)
        }

In [18]:
train_data = train
test_data=test

print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

TRAIN Dataset: (3600000, 2)
TEST Dataset: (400000, 2)


In [19]:
# Keyword arguments for the two DataLoaders: both shuffle and load in the
# main process (num_workers=0); they differ only in batch size.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## 4 Creating the Neural Network for Fine Tuning


Neural Network
We will be creating a neural network with the RobertaClass.
This network will have the Roberta Language model followed by a dropout and finally a Linear layer to obtain the final outputs.
The data will be fed to the Roberta Language model as defined in the dataset.
The final layer's outputs are what will be compared to the Sentiment category to determine the accuracy of the model's predictions.
We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.
Loss Function and Optimizer
Loss Function and Optimizer are defined in the next cell.
The Loss Function is used to calculate the difference between the output created by the model and the actual output.
Optimizer is used to update the weights of the neural network to improve its performance.

In [20]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder followed by a small classification head.

    The hidden state at the first sequence position is passed through
    Linear -> ReLU -> Dropout -> Linear to produce one logit per class.

    Args:
        num_labels: width of the output layer. Defaults to 5, the value that
            used to be hard-coded here.
            NOTE(review): the sentiment column in this notebook only takes the
            values 0 and 1, so ``RobertaClass(num_labels=2)`` is probably what
            is wanted -- confirm before training.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, num_labels=5, dropout=0.3):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the raw class logits, shape ``(batch, num_labels)``.

        ``token_type_ids`` is optional so the model can also be fed batches
        that only carry ``input_ids`` and ``attention_mask``.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Use the first position of every sequence as the pooled representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no need to build a new module on every call.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = RobertaClass()
model.to(device)

In [None]:
#!pip install zstandard
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
# `data_files` used to be commented out, so this cell raised NameError on a
# fresh kernel. Use the same relative path the pandas loader above reads from.
data_files = "train.ft.txt"
# NOTE: this rebinds `train` from the pandas DataFrame built earlier to a
# Hugging Face dataset of the raw "__label__N ..." lines.
train = load_dataset("text", data_files=data_files, split="train")
train

In [None]:
# `python_version_` is not defined anywhere in this notebook and raised NameError.
# NOTE(review): presumably the intent was to display the interpreter version -- confirm.
from platform import python_version
python_version()

In [None]:
# sample
train[0]

In [None]:
#!pip install psutil
import psutil

In [None]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

In [None]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "/Users/szuyingpan/Desktop/NLP/CW1/test.ft.txt"
test = load_dataset("text", data_files=data_files, split="train")
test

In [None]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

In [None]:
# the rss attribute refers to the resident set size, which is the fraction of memory that a process occupies 
#in RAM. This measurement also includes the memory used by the Python interpreter and the libraries we’ve 
#loaded, so the actual amount of memory used to load the dataset is a bit smaller. 

In [None]:
# The size of train dataset
print(f"Number of files in dataset : {train.dataset_size}")
size_gb = train.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [None]:
# The size of test dataset
print(f"Number of files in dataset : {test.dataset_size}")
size_gb = test.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [None]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(train), batch_size):
    _ = train[idx:idx + batch_size]
"""

# Compute the train set's own size here: the shared `size_gb` variable was last
# assigned from `test.dataset_size`, which made the reported throughput wrong.
train_size_gb = train.dataset_size / (1024**3)

# Time one full pass over the dataset in slices of 1000 examples.
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(train)} examples (about {train_size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {train_size_gb/time:.3f} GB/s"
)

In [None]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(test), batch_size):
    _ = test[idx:idx + batch_size]
"""

# Compute the test set's size locally instead of relying on whichever cell
# assigned `size_gb` last.
test_size_gb = test.dataset_size / (1024**3)

# Time one full pass over the dataset in slices of 1000 examples.
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(test)} examples (about {test_size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {test_size_gb/time:.3f} GB/s"
)

In [None]:
# Iterate over it one example at a time without loading the entire dataset into memory.
# The train file is named explicitly: by this point `data_files` has been rebound
# to the test file path, so reusing it streamed the test set as `train_streamed`.
train_streamed = load_dataset(
    "text", data_files="train.ft.txt", split="train", streaming=True
)

In [None]:
next(iter(train_streamed))

In [None]:
NUMBER_OF_LINES = 10000

data = {}

# Walk the streamed dataset and add the first NUMBER_OF_LINES rows to a map.
# Each example is a dict whose "text" field holds one raw "__label__N <review>"
# line. (The previous loop header was a syntax error and enumerated the
# characters of a string literal instead of the dataset.)
for i, example in enumerate(train_streamed):

    if i == NUMBER_OF_LINES:
        break

    line = example["text"]

    # label 1 is negative and label 2 is positive
    label = 1 if line[:10] == "__label__1" else 2
    # Everything after the 10-character "__label__N" prefix is the review.
    text = line[10:]

    localResult = {
        "label": label,
        "text": text
    }

    data[i] = localResult


# One row per collected line, with the map key exposed as an "Id" column.
df = pd.DataFrame(data).T
df = df.reset_index().rename(columns= {"index": "Id"})

To work as sentiment analysis, we need to preprocess the text to extract the label and review text separately and then apply further preprocessing like tokenization or stop words removal as needed.

In [None]:
def preprocess_streamed_data(example):
    """Split one raw streamed record into its numeric label and review text.

    ``example['text']`` is expected to look like ``"__label__N <review>"``.
    The label is 1 when the text starts with ``__label__1`` and 2 for anything
    else; the text is everything after the first 10 characters, stripped of
    surrounding whitespace.
    """
    raw = example['text']
    if raw.startswith("__label__1"):
        label = 1
    else:
        label = 2
    return {"label": label, "text": raw[10:].strip()}

# Use a generator expression to apply preprocessing
preprocessed_stream = (preprocess_streamed_data(example) for example in train_streamed)

# Example: Access the first preprocessed example
print(next(preprocessed_stream))

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Assuming NLTK data has been downloaded
stop_words = set(stopwords.words('english'))
stop_words.remove('not')  # Keep "not" for sentiment analysis

def clean_text(text):
    """Strip URLs and HTML tags out of ``text``.

    Punctuation is not touched by this function.
    """
    patterns = (
        r'https?://\S+|www\.\S+',  # URLs
        r'<.*?>',  # HTML tags
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

def remove_stopwords_and_punct(text):
    """Tokenize ``text``, drop stopword and punctuation tokens, and re-join.

    A token is dropped when its lower-cased form is in ``stop_words`` or when
    the token itself occurs inside ``string.punctuation``. Surviving tokens are
    lower-cased and joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if token in string.punctuation:
            continue
        kept.append(lowered)
    return " ".join(kept)

def preprocess_streamed_data(example):
    """Turn one raw streamed record into a label plus cleaned review text.

    The label is 1 when ``example['text']`` starts with ``__label__1`` and 2
    otherwise. The text after the 10-character label prefix is stripped, run
    through ``clean_text`` and then ``remove_stopwords_and_punct``.
    """
    raw = example['text']
    label = 2
    if raw.startswith("__label__1"):
        label = 1
    body = raw[10:].strip()
    return {"label": label, "text": remove_stopwords_and_punct(clean_text(body))}


In [None]:
# Apply RoBERTa model
from transformers import RobertaTokenizer
from torch.utils.data import Dataset

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class SentimentDataset(Dataset):
    """Map-style dataset over pre-processed ``{'text', 'label'}`` records.

    ``examples`` may be any iterable (including a generator); it is
    materialised into a list up front so the dataset has a length and supports
    indexing.
    """

    def __init__(self, examples, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = [record for record in examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return its tensors together with the label."""
        record = self.examples[idx]
        encoding = self.tokenizer.encode_plus(
            record['text'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer returns tensors with a leading batch axis; flatten each to 1-D.
        item = {key: encoding[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = record['label']
        return item

In [None]:
# DataLoader
from torch.utils.data import DataLoader

# Assuming preprocessed_stream is your preprocessed data
preprocessed_list = [preprocess_streamed_data(example) for example in train_streamed]  # Convert generator to list
dataset = SentimentDataset(preprocessed_list, tokenizer)

# Create the DataLoader for our training set
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
# Define Training Function

from transformers import AdamW
from tqdm import tqdm  # for displaying progress bar

def train(model, train_loader, optimizer, device):
    """Run one training epoch and report the mean batch loss.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the dataset.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose return value exposes a ``.loss`` tensor.
            NOTE(review): ``RobertaClass`` defined earlier in this notebook
            does not accept ``labels``; a sequence-classification model with a
            built-in loss is presumably intended -- confirm.
        train_loader: iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batch tensors are moved to.

    Returns:
        The average loss over all batches, or ``float('nan')`` when the loader
        yields no batches. (Previously nothing was returned.)
    """
    model.train()  # set model to training mode
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(train_loader, desc="Training"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Count batches directly so loaders without __len__ also work.
        num_batches += 1

    # Calculate the average loss over all of the batches.
    average_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Training loss: {average_loss}")
    return average_loss


In [None]:
# Preparing for training
# Initialize the AdamW optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move model to the right device


In [None]:
# abstract negative sentiment is encoded as 0 and positive sentiment as 1

def preprocess_streamed_data(example):
    """Split one raw streamed record into a 0/1 label and its review text.

    The label is 0 when ``example['text']`` starts with ``__label__1`` and 1
    otherwise; the text is everything after the first 10 characters, stripped
    of surrounding whitespace.
    """
    raw = example['text']
    # int(False) == 0 for the "__label__1" prefix, int(True) == 1 for the rest.
    label = int(not raw.startswith("__label__1"))
    return {"label": label, "text": raw[10:].strip()}

# Then, you create the preprocessed list again with the adjusted labels
preprocessed_list = [preprocess_streamed_data(example) for example in train_streamed]


dataset = SentimentDataset(preprocessed_list, tokenizer)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
# Example code to set device in PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from torch.utils.data import Dataset

if __name__ == "__main__":
    epochs = 3  # example for 3 epochs, adjust as needed
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train(model, train_loader, optimizer, device)

In [None]:
train_loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)


In [None]:
epochs = 3  # example for 3 epochs, adjust as needed

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer, device)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.nn.functional import softmax


def evaluate(model, val_loader, device):
    """Score ``model`` on ``val_loader`` and return classification metrics.

    Each batch must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` tensors, and the model's output must expose ``.logits``.
    Predictions are the argmax over the logits.

    Returns:
        dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``
        (the last three computed with ``average='binary'``).
    """
    model.eval()  # evaluation mode

    logit_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Collect per-batch logits and labels as NumPy arrays on the CPU.
            logit_chunks.append(outputs.logits.detach().cpu().numpy())
            label_chunks.append(labels.to('cpu').numpy())

    # Stack the batches, then take the highest-scoring class per example.
    predictions = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    true_labels = np.concatenate(label_chunks, axis=0)

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)


In [None]:
# Assume val_loader is already created and similar to train_loader but for validation data
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer, device)  # Training step
    metrics = evaluate(model, val_loader, device)  # Evaluation step
    print(f"Validation Accuracy: {metrics['accuracy']}")
    print(f"Validation F1 Score: {metrics['f1']}")
    print(f"Validation Precision: {metrics['precision']}")
    print(f"Validation Recall: {metrics['recall']}")


This section covers loading a pre-trained RoBERTa model and its tokenizer, preparing your data in the format expected by the model, and then either fine-tuning the model on your dataset or using the model to make predictions directly.

In [None]:
#from transformers import RobertaTokenizer, RobertaForSequenceClassification
#from torch.utils.data import DataLoader, Dataset
#import torch
# Load the RoBERTa tokenizer and model
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2) 

Since RoBERTa expects raw text as input for its tokenizer to work correctly (because it handles special tokens and segmentation itself), you'll need to convert your tokens back into text strings before using the RoBERTa tokenizer.

In [None]:
# Preparing for dataset

In [None]:
#
# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class TokenizedReviewsDataset(Dataset):
    """Map-style dataset that encodes reviews for a RoBERTa classifier.

    Args:
        tokenized_reviews: sequence of reviews; each review is either a list
            of word tokens or an already-joined string.
        labels: sequence of integer labels, aligned with ``tokenized_reviews``.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every encoded review is padded or truncated to.
    """

    def __init__(self, tokenized_reviews, labels, tokenizer, max_length=512):
        self.tokenized_reviews = tokenized_reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_reviews)

    def __getitem__(self, idx):
        """Return the encoded review at ``idx`` together with its label tensor."""
        review = self.tokenized_reviews[idx]
        # Convert a list of tokens back to a string. A review that is already
        # a string is used as-is: joining a str would insert a space between
        # every single character.
        review_text = review if isinstance(review, str) else " ".join(review)
        label = self.labels[idx]

        # Encode the review text
        encoding = self.tokenizer.encode_plus(
            review_text,
            add_special_tokens=True,  # Add the tokenizer's start/end special tokens
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Convert the preprocessed examples to lists for training
tokenized_reviews = []
labels = []

# `preprocessed_text_stream` was never defined, and the preprocessing helpers in
# this notebook emit a 'text' field rather than 'tokens', so the old loop failed.
# NOTE(review): reading from `preprocessed_list` (built in an earlier cell) is the
# presumed intent -- confirm.
for example in preprocessed_list:
    # Keep each review as a list of whitespace tokens; TokenizedReviewsDataset
    # joins the tokens back into a single string before encoding.
    tokenized_reviews.append(example['text'].split())
    labels.append(example['label'])

# Now tokenized_reviews and labels are populated and can be used to create the dataset


# Create DataLoader for the Dataset
from torch.utils.data import DataLoader, random_split

# 80/20 train/validation split
dataset_size = len(tokenized_reviews)
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size

full_dataset = TokenizedReviewsDataset(tokenized_reviews, labels, tokenizer)
# Seed the split so the same examples land in train/validation on every run.
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42)
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [None]:
# Training loop
from transformers import AdamW
from torch.nn.functional import cross_entropy
import torch

# Pick the GPU when available and move the model's parameters there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# One pass over train_loader followed by one pass over val_loader per epoch.
# `epochs` is expected to have been set by an earlier cell.
# NOTE(review): the loop reads `outputs.loss` and passes `labels=` to the model,
# which `RobertaClass` defined earlier does not support -- confirm which model
# is meant to be bound to `model` here.
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # NOTE: this rebinds the notebook-level name `labels` to a tensor.
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    # Validation step
    model.eval()
    total_val_loss = 0
    # No gradients are needed while measuring the validation loss.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
    
    # Report the mean per-batch loss for both phases.
    print(f"Epoch {epoch+1}, Train Loss: {total_train_loss / len(train_loader)}, Val Loss: {total_val_loss / len(val_loader)}")


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = train_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [None]:
# `pubmed_dataset_streamed` is not defined in this notebook and raised NameError.
# NOTE(review): `train_streamed`, the streamed dataset created above, is presumably
# what was meant -- confirm.
shuffled_dataset = train_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
# `pubmed_dataset_streamed` is not defined in this notebook and raised NameError.
# NOTE(review): `train_streamed`, the streamed dataset created above, is presumably
# what was meant -- confirm.
dataset_head = train_streamed.take(5)
list(dataset_head)

In [None]:
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [None]:
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))