## Important library 

In [2]:
# Basic Libraries 
import pandas as pd
import numpy as np
from tqdm import tqdm
import bz2
import zipfile
import os

# NLTK libraries
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from sklearn.feature_extraction.text import TfidfVectorizer
#import spacy
#from tqdm.auto import tqdm

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Metric Libraries 
from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

stop_words = stopwords.words('english')
#from gensim import corpora as corpora
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

#from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense,LSTM,SpatialDropout1D,Embedding
#from keras.callbacks import ModelCheckpoint

#!pip install transformers
#!pip install torch
#!pip install torch torchvision torchaudio
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch


  from pandas.core import (


## 2. Important dataset

In [42]:
#!pip install zstandard
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "/Users/szuyingpan/Desktop/NLP/CW1/train.ft.txt"
train = load_dataset("text", data_files=data_files, split="train")
train

Dataset({
    features: ['text'],
    num_rows: 3600000
})

In [43]:
# sample
train[0]

{'text': '__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}

In [46]:
#!pip install psutil
import psutil

In [44]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 790.50 MB


In [45]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "/Users/szuyingpan/Desktop/NLP/CW1/test.ft.txt"
test = load_dataset("text", data_files=data_files, split="train")
test

Dataset({
    features: ['text'],
    num_rows: 400000
})

In [47]:
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 797.16 MB


In [None]:
# the rss attribute refers to the resident set size, which is the fraction of memory that a process occupies 
#in RAM. This measurement also includes the memory used by the Python interpreter and the libraries weâ€™ve 
#loaded, so the actual amount of memory used to load the dataset is a bit smaller. 

In [49]:
# The size of train dataset
print(f"Number of files in dataset : {train.dataset_size}")
size_gb = train.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

Number of files in dataset : 1607964432
Dataset size (cache file) : 1.50 GB


In [50]:
# The size of test dataset
print(f"Number of files in dataset : {test.dataset_size}")
size_gb = test.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

Number of files in dataset : 178576193
Dataset size (cache file) : 0.17 GB


In [51]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(train), batch_size):
    _ = train[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(train)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 3600000 examples (about 0.2 GB) in 5.7s, i.e. 0.029 GB/s


In [52]:
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(test), batch_size):
    _ = test[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(test)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 400000 examples (about 0.2 GB) in 0.7s, i.e. 0.249 GB/s


In [53]:
# Iterate over it one example at a time without loading the entire dataset into memory.
train_streamed = load_dataset(
    "text", data_files=data_files, split="train", streaming=True
)

In [55]:
next(iter(train_streamed))

{'text': '__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"'}

To work as sentiment analysis, we need to preprocess the text to extract the label and review text separately and then apply further preprocessing like tokenization or stop words removal as needed.

In [61]:
def preprocess_streamed_data(example):
    # Extract label (assuming label 1 is negative and label 2 is positive)
    label = 1 if example['text'].startswith("__label__1") else 2
    # Remove the label from the text and any leading/trailing whitespace
    text = example['text'][10:].strip()
    return {"label": label, "text": text}

# Use a generator expression to apply preprocessing
preprocessed_stream = (preprocess_streamed_data(example) for example in train_streamed)

# Example: Access the first preprocessed example
print(next(preprocessed_stream))

{'label': 2, 'text': 'Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"'}


In [65]:
#nltk.download('punkt')
#nltk.download('stopwords')

# Customize the stop words list: keep "not" is essential for sentiment analysis
stop_words = set(stopwords.words('english'))
stop_words.remove('not')

def clean_text(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    return text

def tokenize_and_remove_stopwords_and_punctuation(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Lowercase, remove stop words and punctuation
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return filtered_tokens

def preprocess_text(example):
    # Clean text
    cleaned_text = clean_text(example['text'])
    # Tokenize and remove stop words
    tokens = tokenize_and_remove_stopwords(cleaned_text)
    return {"label": example['label'], "tokens": tokens}

# Apply preprocessing to the streamed data
preprocessed_text_stream = (preprocess_text(example) for example in preprocessed_stream)

# Example: Access the first preprocessed text example
print(next(preprocessed_text_stream))


{'label': 1, 'tokens': ['batteries', 'died', 'within', 'year', '...', 'bought', 'charger', 'jul', '2003', 'worked', 'ok', 'design', 'nice', 'convenient', 'however', 'year', 'batteries', 'would', 'not', 'hold', 'charge', 'might', 'well', 'get', 'alkaline', 'disposables', 'look', 'elsewhere', 'charger', 'comes', 'batteries', 'better', 'staying', 'power']}


Loading a pre-trained RoBERTa model and its tokenizer, preparing your data in the format expected by the model, and then either fine-tuning the model on your dataset or using the model to make predictions directly.

In [None]:
# Load the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2) 

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Since RoBERTa expects raw text as input for its tokenizer to work correctly (because it handles special tokens and segmentation itself), you'll need to convert your tokens back into text strings before using the RoBERTa tokenizer.

In [None]:
# Preparing for dataset

In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from torch.utils.data import DataLoader, Dataset
import torch

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class TokenizedReviewsDataset(Dataset):
    def __init__(self, tokenized_reviews, labels, tokenizer, max_length=512):
        self.tokenized_reviews = tokenized_reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_reviews)
    
    def __getitem__(self, idx):
        # Convert list of tokens back to string
        review_text = " ".join(self.tokenized_reviews[idx])
        label = self.labels[idx]

        # Encode the review text
        encoding = self.tokenizer.encode_plus(
            review_text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [62]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = train_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

{'text': '__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"',
 'input_ids': [101,
  1035,
  1035,
  3830,
  1035,
  1035,
  1016,
  2307,
  3729,
  1024,
  2026,
  8403,
  6986,
  2038,
  2028,
  1997,
  1996,
  2307,
  5755,
  1997,
  2014,
  4245,
  1012,
  1045,
  2031,
  7791,
  2000,
  2023,
  3729,
  2005,
  2086,
  1998,
  1045,
  2145,
  2293,
  2009,
  1012,
  2043,
  1045,
  1005,
  1049,
  1999,
  1037,
  2204,
  6888,
  2009,
  3084,
  2033,
  2514,
  2488,
  1012,
  1037,
  2919,
  688

In [63]:
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

{'text': "__label__1 How Can Sex and Sin Be So Boring?: Throughout this entire book, one part of my brain was in a state of wonder, trying to imagine how anyone could make a book about scandal, sin, adultery, public shame and cowardice so dull. To finish the book was a test of will, and was accomplished because I hate to leave things undone.There are many fine, engaging, interesting novels, both of our age and of Hawthorne's. I can't imagine why one would want to slog through this one."}

In [34]:
dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

[{'text': '__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'},
 {'text': "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."},
 {'text': '__

In [35]:
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [36]:
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)
next(iter(law_dataset_streamed))

FileNotFoundError: Unable to find 'https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst'

In [None]:
# Constants
BATCH_SIZE = 256
MAX_FEATURES = 1000
EMBEDDING_DIM = 120
CHUNK_SIZE = 50000  # Adjust based on memory availability