## Important library 

In [6]:
# Basic Libraries 
import pandas as pd
import numpy as np
from tqdm import tqdm
import bz2
import zipfile
import os

# NLTK libraries
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from sklearn.feature_extraction.text import TfidfVectorizer
#import spacy
#from tqdm.auto import tqdm

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
#from wordcloud import WordCloud, STOPWORDS

# Metric Libraries 
from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

stop_words = stopwords.words('english')
#from gensim import corpora as corpora
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense,LSTM,SpatialDropout1D,Embedding
#from keras.callbacks import ModelCheckpoint

#!pip install transformers
#!pip install torch
#!pip install torch torchvision torchaudio
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch


  from pandas.core import (


## 2. Important dataset

In [7]:
#!pip install zstandard
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "/Users/szuyingpan/Desktop/NLP/CW1/train.ft.txt"
train = load_dataset("text", data_files=data_files, split="train")
train

Dataset({
    features: ['text'],
    num_rows: 3600000
})

In [8]:
# sample
train[0]

{'text': '__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}

In [9]:
#!pip install psutil
import psutil

In [10]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 729.86 MB


In [11]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "/Users/szuyingpan/Desktop/NLP/CW1/test.ft.txt"
test = load_dataset("text", data_files=data_files, split="train")
test

Dataset({
    features: ['text'],
    num_rows: 400000
})

In [12]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 735.31 MB


In [13]:
# the rss attribute refers to the resident set size, which is the fraction of memory that a process occupies 
#in RAM. This measurement also includes the memory used by the Python interpreter and the libraries we’ve 
#loaded, so the actual amount of memory used to load the dataset is a bit smaller. 

In [14]:
# The size of train dataset
print(f"Number of files in dataset : {train.dataset_size}")
size_gb = train.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

Number of files in dataset : 1607964432
Dataset size (cache file) : 1.50 GB


In [15]:
# The size of test dataset
print(f"Number of files in dataset : {test.dataset_size}")
size_gb = test.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

Number of files in dataset : 178576193
Dataset size (cache file) : 0.17 GB


In [16]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(train), batch_size):
    _ = train[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(train)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 3600000 examples (about 0.2 GB) in 5.3s, i.e. 0.031 GB/s


In [17]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(test), batch_size):
    _ = test[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(test)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 400000 examples (about 0.2 GB) in 0.5s, i.e. 0.313 GB/s


In [18]:
# Iterate over it one example at a time without loading the entire dataset into memory.
train_streamed = load_dataset(
    "text", data_files=data_files, split="train", streaming=True
)

In [19]:
next(iter(train_streamed))

{'text': '__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"'}

In [48]:
NUMBER_OF_LINES = 2000

data = {}

# read the archived file line by line, add add it to a map
for i, line in enumerate("train_streamed", "rt", encoding="utf8")):

    if i == NUMBER_OF_LINES:
        break

    # label 1 is negativ and label 2 is positive
    label = 1 if line[:10] == "__label__1" else 2
    text = line[10:]

    localResult = {
        "label": label,
        "text": text
    }

    data[i] = localResult


df = pd.DataFrame(data).T
df = df.reset_index().rename(columns= {"index": "Id"})

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 18)

To work as sentiment analysis, we need to preprocess the text to extract the label and review text separately and then apply further preprocessing like tokenization or stop words removal as needed.

In [20]:
def preprocess_streamed_data(example):
    # Extract label (assuming label 1 is negative and label 2 is positive)
    label = 1 if example['text'].startswith("__label__1") else 2
    # Remove the label from the text and any leading/trailing whitespace
    text = example['text'][10:].strip()
    return {"label": label, "text": text}

# Use a generator expression to apply preprocessing
preprocessed_stream = (preprocess_streamed_data(example) for example in train_streamed)

# Example: Access the first preprocessed example
print(next(preprocessed_stream))

{'label': 2, 'text': 'Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"'}


In [21]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Assuming NLTK data has been downloaded
stop_words = set(stopwords.words('english'))
stop_words.remove('not')  # Keep "not" for sentiment analysis

def clean_text(text):
    """Remove URLs, HTML tags, and punctuation from text."""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    return text

def remove_stopwords_and_punct(text):
    """Tokenize text and remove stopwords and punctuation, return as a single string."""
    tokens = word_tokenize(text)
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return " ".join(filtered_tokens)

def preprocess_streamed_data(example):
    label = 1 if example['text'].startswith("__label__1") else 2
    text = example['text'][10:].strip()
    cleaned_text = clean_text(text)
    preprocessed_text = remove_stopwords_and_punct(cleaned_text)
    return {"label": label, "text": preprocessed_text}


In [22]:
# Apply RoBERTa model
from transformers import RobertaTokenizer
from torch.utils.data import Dataset

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class SentimentDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = list(examples)  # Convert generator to list to access its length
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        text = self.examples[idx]['text']
        label = self.examples[idx]['label']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [23]:
# DataLoader
from torch.utils.data import DataLoader

# Assuming preprocessed_stream is your preprocessed data
preprocessed_list = [preprocess_streamed_data(example) for example in train_streamed]  # Convert generator to list
dataset = SentimentDataset(preprocessed_list, tokenizer)

# Create the DataLoader for our training set
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [29]:
# Define Training Function

from transformers import AdamW
from tqdm import tqdm  # for displaying progress bar

def train(model, train_loader, optimizer, device):
    model.train()  # set model to training mode
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Calculate the average loss over all of the batches.
    average_loss = total_loss / len(train_loader)
    print(f"Training loss: {average_loss}")


In [30]:
# Preparing for training
# Initialize the AdamW optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move model to the right device




RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [44]:
# abstract negative sentiment is encoded as 0 and positive sentiment as 1

def preprocess_streamed_data(example):
    # Adjust labels to be in the range [0, 1]
    label = 0 if example['text'].startswith("__label__1") else 1  # Adjusted labels here
    text = example['text'][10:].strip()
    return {"label": label, "text": text}

# Then, you create the preprocessed list again with the adjusted labels
preprocessed_list = [preprocess_streamed_data(example) for example in train_streamed]


dataset = SentimentDataset(preprocessed_list, tokenizer)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [45]:
# Example code to set device in PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [46]:
from torch.utils.data import Dataset

if __name__ == "__main__":
    epochs = 3  # example for 3 epochs, adjust as needed
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train(model, train_loader, optimizer, device)

Epoch 1/3


Training:   0%|          | 49/50000 [06:10<105:02:03,  7.57s/it]


KeyboardInterrupt: 

In [None]:
train_loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)


In [41]:
epochs = 3  # example for 3 epochs, adjust as needed

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer, device)


Epoch 1/3


Training:   0%|          | 0/50000 [00:00<?, ?it/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/szuyingpan/anaconda3/envs/PythonTest/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/szuyingpan/anaconda3/envs/PythonTest/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'SentimentDataset' on <module '__main__' (built-in)>
Training:   0%|          | 0/50000 [02:38<?, ?it/s]


KeyboardInterrupt: 

In [32]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.nn.functional import softmax


def evaluate(model, val_loader, device):
    model.eval()  # set model to evaluation mode
    
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Move logits and labels to CPU
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            # Store predictions and true labels
            predictions.append(logits)
            true_labels.append(label_ids)
    
    # Convert outputs to predictions
    predictions = np.concatenate(predictions, axis=0)
    predictions = np.argmax(predictions, axis=1)
    true_labels = np.concatenate(true_labels, axis=0)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
# Assume val_loader is already created and similar to train_loader but for validation data
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer, device)  # Training step
    metrics = evaluate(model, val_loader, device)  # Evaluation step
    print(f"Validation Accuracy: {metrics['accuracy']}")
    print(f"Validation F1 Score: {metrics['f1']}")
    print(f"Validation Precision: {metrics['precision']}")
    print(f"Validation Recall: {metrics['recall']}")


Loading a pre-trained RoBERTa model and its tokenizer, preparing your data in the format expected by the model, and then either fine-tuning the model on your dataset or using the model to make predictions directly.

In [24]:
#from transformers import RobertaTokenizer, RobertaForSequenceClassification
#from torch.utils.data import DataLoader, Dataset
#import torch
# Load the RoBERTa tokenizer and model
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2) 

Since RoBERTa expects raw text as input for its tokenizer to work correctly (because it handles special tokens and segmentation itself), you'll need to convert your tokens back into text strings before using the RoBERTa tokenizer.

In [25]:
# Preparing for dataset

In [27]:
#
# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

#class TokenizedReviewsDataset(Dataset):
 #   def __init__(self, tokenized_reviews, labels, tokenizer, max_length=512):
        self.tokenized_reviews = tokenized_reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_reviews)
    
    def __getitem__(self, idx):
        # Convert list of tokens back to string
        review_text = " ".join(self.tokenized_reviews[idx])
        label = self.labels[idx]

        # Encode the review text
        encoding = self.tokenizer.encode_plus(
            review_text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 12)

In [None]:
# Convert the stream to lists for training
tokenized_reviews = []
labels = []

for example in preprocessed_text_stream:
    # Join tokens to form a single string (if your dataset preparation requires strings)
    review_text = " ".join(example['tokens'])
    tokenized_reviews.append(review_text)
    labels.append(example['label'])

# Now tokenized_reviews and labels are populated and can be used to create the dataset


# Create DataLoader for the Dataset
from torch.utils.data import DataLoader, random_split

# Assuming tokenized_reviews and labels are available
dataset_size = len(tokenized_reviews)
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size

full_dataset = TokenizedReviewsDataset(tokenized_reviews, labels, tokenizer)
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [None]:
# Training loop
from transformers import AdamW
from torch.nn.functional import cross_entropy
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    # Validation step
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Train Loss: {total_train_loss / len(train_loader)}, Val Loss: {total_val_loss / len(val_loader)}")


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = train_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [None]:
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

In [None]:
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [None]:
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))