In [1]:
!pip install -q datasets wandb

In [2]:
! pip install -q transformers sentencepiece accelerate -U

In [3]:
# ! wandb login

In [4]:
from datasets import load_dataset

sentiment_data = load_dataset("carblacac/twitter-sentiment-analysis")

Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading and preparing dataset new_dataset/default (download: 15.77 MiB, generated: 16.65 MiB, post-processed: Unknown size, total: 32.42 MiB) to /root/.cache/huggingface/datasets/carblacac___new_dataset/default/1.0.0/cd65e23e456de6a4f7264e305380b0ffe804d6f5bfd361c0ec0f68d8d1fab95b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-6351f243c2eb2aaa/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-6351f243c2eb2aaa/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/149985 [00:00<?, ?ex/s]

  0%|          | 0/61998 [00:00<?, ?ex/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Generating train split:   0%|          | 0/119988 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29997 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/61998 [00:00<?, ? examples/s]

Dataset new_dataset downloaded and prepared to /root/.cache/huggingface/datasets/carblacac___new_dataset/default/1.0.0/cd65e23e456de6a4f7264e305380b0ffe804d6f5bfd361c0ec0f68d8d1fab95b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import os
import wandb

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
sentiment_data

DatasetDict({
    train: Dataset({
        features: ['text', 'feeling'],
        num_rows: 119988
    })
    validation: Dataset({
        features: ['text', 'feeling'],
        num_rows: 29997
    })
    test: Dataset({
        features: ['text', 'feeling'],
        num_rows: 61998
    })
})

In [8]:
train_data = pd.DataFrame(sentiment_data['train'],columns=['text','feeling'])
test_data = pd.DataFrame(sentiment_data['test'],columns=['text','feeling'])
val_data = pd.DataFrame(sentiment_data['validation'],columns=['text','feeling'])

In [9]:
train_data.to_csv("train.csv",index=False)
test_data.to_csv("test.csv",index=False)
val_data.to_csv("val.csv",index=False)

#### Remove HTML Tags

In [10]:
import re
def remove_html_tags(text):
  pattern = re.compile("<.*?>")
  return pattern.sub(r'',text)

train_data['text'] = train_data['text'].apply(remove_html_tags)
test_data['text'] = test_data['text'].apply(remove_html_tags)
val_data['text'] = val_data['text'].apply(remove_html_tags)

#### Remove punctuations

In [11]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punctuations(text):
  for char in string.punctuation:
    text = text.replace(char,'')
  return text

train_data['text'] = train_data['text'].apply(remove_punctuations)
test_data['text'] = test_data['text'].apply(remove_punctuations)
val_data['text'] = val_data['text'].apply(remove_punctuations)

#### Remove URLs

In [13]:
def remove_urls(text):
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  urls = url_pattern.findall(text)

  for url in urls:
      text = text.replace(url, "")

  return text

train_data['text'] = train_data['text'].apply(remove_urls)
test_data['text'] = test_data['text'].apply(remove_urls)
val_data['text'] = val_data['text'].apply(remove_urls)

#### Dataset prepration and tokenization

In [14]:
class SentimentDataset(Dataset):
  def __init__(self,texts,labels,tokenizer,max_length):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, index):
    text = self.texts[index]
    label = self.labels[index]
    encoding = self.tokenizer(text,max_length=self.max_length,padding="max_length",truncation=True)
    return {'input_ids':torch.tensor(encoding['input_ids']).flatten(),'attention_maxk':torch.tensor(encoding['attention_mask']).flatten(),'labels':torch.tensor(label,dtype=torch.long)}

#### Model and Tokenizer import

In [15]:
from transformers import BertForSequenceClassification,BertTokenizer
bertModel = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
train_dataset = SentimentDataset(train_data['text'],train_data['feeling'],tokenizer,max_length=128)
test_dataset = SentimentDataset(test_data['text'],test_data['feeling'],tokenizer,max_length=128)
val_dataset = SentimentDataset(val_data['text'],val_data['feeling'],tokenizer,max_length=128)

#### Not needed to convert into DataLoader format

In [17]:
# train_dataloader = DataLoader(train_dataset,shuffle=True,batch_size=32,num_workers=2,pin_memory=True)
# test_dataloader = DataLoader(test_dataset,shuffle=True,batch_size=32,num_workers=2,pin_memory=True)
# val_dataloader = DataLoader(val_dataset,shuffle=True,batch_size=32,num_workers=2,pin_memory=True)

#### Another process for converting the data

In [18]:
def process_function(sample,padding="max_length"):

  #tokeniztion
  model_inputs = tokenizer(sample['text'],max_length=128,padding=padding,truncation=True)

  # tokenize the target
  labels = sample['feeling']

  # if padding=="max_length":
  #   labels["input_ids"] = [
  #       [(l if l !=tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
  #   ]

  model_inputs["labels"] = labels

  # return model_inputs

In [19]:
# train_dataset = sentiment_data['train'].map(process_function,batched=True,remove_columns=['text','feeling'])
# test_dataset = sentiment_data['test'].map(process_function,batched=True,remove_columns=['text','feeling'])
# val_dataset = sentiment_data['validation'].map(process_function,batched=True,remove_columns=['text','feeling'])

In [20]:
# train_dataset['labels'][1]

In [21]:
from transformers import Trainer,TrainingArguments

2024-04-02 05:22:11.524384: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 05:22:11.524495: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 05:22:11.683836: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [22]:
# data_eg = next(iter(test_dataloader))
# # print(data_eg['input_ids'])
# print(bertModel(data_eg['input_ids'].to(device))[1])

In [23]:
bertModel

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
# demo_model = bertModel
# demo_model.pooler = nn.Sequential(*list(demo_model.pooler.children())[:-1])
# demo_model

In [25]:
class BertModelModified(nn.Module):
  def __init__(self,baseModel):
    super().__init__()
    self.baseModel = baseModel
    self.baseModel.pooler = nn.Sequential(nn.Linear(in_features=768,out_features=1),nn.Tanh())

  def forward(self,x):
    x = self.baseModel(x)
    # x = self.baseModel(x)

    return x

newBERTModel = BertModelModified(bertModel)
newBERTModel = newBERTModel.to(device)
print(newBERTModel.parameters)

<bound method Module.parameters of BertModelModified(
  (baseModel): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense):

In [26]:
# print(len(data_eg['input_ids']))
# print(len(data_eg['labels']))

In [27]:
# print(newBERTModel(data_eg['input_ids']).max(1)[1])

In [28]:
m = nn.Sigmoid()
input = torch.randn(2)
output = m(input)
print(input)

tensor([ 0.8164, -0.2300])


In [29]:
def Training(model,train_dataloader,test_dataloader,epochs=10):
  optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
  loss_fn = nn.NLLLoss()

  for epoch in range(epochs):
    for data in train_dataloader:
      context = data['input_ids']
      target = data['labels']
      output = model(context)
      optimizer.zero_grad()
      loss = loss_fn(output.mean(1),target)
      loss.backward()
      optimizer.step()

    print(loss)


# Training(newBERTModel,train_dataloader,test_dataloader,epochs=5)

In [30]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100
dataCollator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = bertModel,
    label_pad_token_id = label_pad_token_id,
    pad_to_multiple_of = 8,
)

In [31]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="my-awesome-project"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [32]:
training_args = TrainingArguments(
    output_dir = '/bertModel',
    report_to="wandb",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    fp16 = False,
    learning_rate = 1e-4,
    num_train_epochs = 3,
    logging_strategy = "steps",
    logging_steps = 100,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit = 2,
    load_best_model_at_end= True,
    push_to_hub = False
)

trainer = Trainer(
    model = bertModel,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    # data_collator = dataCollator,
    args = training_args
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [33]:
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,0.6982,0.701754
2,0.6938,0.693572
3,0.693,0.693151


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


VBox(children=(Label(value='417.680 MB of 417.680 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▁▁
eval/runtime,▁█▄
eval/samples_per_second,█▁▅
eval/steps_per_second,█▁▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▄▇▆▇▂▃█▃▇▄▅▃▃▄▂▂▄▂▂▂▃▂▂▂▂▂▁▃▂▁▂▂▂▃▂▃▂▂▁▃
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▄▆▄▃▄▃▄▄▂▃▂▃▄▂▁▂▂▁▂▁▂▂▂▂▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁

0,1
eval/loss,0.69315
eval/runtime,397.8503
eval/samples_per_second,155.832
eval/steps_per_second,9.74
total_flos,2.367783957476045e+16
train/epoch,3.0
train/global_step,22500.0
train/grad_norm,2.36453
train/learning_rate,0.0
train/loss,0.693


#### CSV to HuggingFace dataset again

In [34]:
from datasets import Dataset

train_ds = Dataset.from_pandas(pd.concat([train_data,test_data],ignore_index=True)).train_test_split(0.2)

In [35]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'feeling'],
        num_rows: 145588
    })
    test: Dataset({
        features: ['text', 'feeling'],
        num_rows: 36398
    })
})

In [36]:
trainer.save_model("bertModel")

In [40]:
from IPython.display import FileLink
FileLink(r"bertModel/model.safetensors")