In [1]:
!pip install transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 28.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstall

In [2]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import transformers
import torch

In [8]:
class CommentDataset(Dataset):
  def __init__(self, dataset_path, dataset_name):
    self.dataset_path = dataset_path
    self.dataset_name = dataset_name
    self.read_dataset()
    self.set_tokenizer()
    self.set_max_length()
    self.tokenize_dataset()

  def set_max_length(self):
    self.max_length = 512

  def set_tokenizer(self):
    self.tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

  def read_dataset(self):
      if '.csv' in self.dataset_path:
        self.original_df = pd.read_csv(self.dataset_path, encoding = 'utf-8')

      self.original_df = self.original_df.dropna()
      #self.original_df = self.original_df[:10] # ino bardarin

      one_label = self.original_df[self.original_df.label == 1]
      zero_label = self.original_df[self.original_df.label == 0]

      one_label_length = len(one_label)
      zero_label_length = len(zero_label)

      if self.dataset_name == 'train':
        cut_one = one_label[:int(80/100*one_label_length)]
        cut_zero = zero_label[:int(80/100*zero_label_length)]

      else: # val
        cut_one = one_label[int(80/100*one_label_length):]
        cut_zero = zero_label[int(80/100*zero_label_length):]

      data = [cut_one, cut_zero]
      self.original_df = pd.concat(data)

      self.comments = self.original_df['commenttext'].tolist()
      self.labels = self.original_df['label'].tolist()

      assert (len(self.comments) == len(self.labels)), 'dataset rows are not equal.'

      self.dataset_size = len(self.comments)

  def tokenize_dataset(self):
    tokenized = self.tokenizer(
      self.comments, 
      return_tensors = 'pt',
      max_length = self.max_length,
      truncation = True,
      padding = 'max_length')
    
    self.input_ids = tokenized['input_ids']
    self.attention_mask = tokenized['attention_mask']

  def __len__(self):
    return self.dataset_size

  def __getitem__(self, idx):
    inp = {
        'input_ids': self.input_ids[idx],
        'attention_mask': self.input_ids[idx]
    }
    return inp, self.labels[idx]



In [9]:
class CommentModel(torch.nn.Module):

  def __init__(self):
    super().__init__()

    self.bert = transformers.AutoModel.from_pretrained('bert-base-uncased')
    self.linear = torch.nn.Linear(768, 1)
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, x):
    enc = self.bert(**x)
    logits = self.linear(enc['pooler_output'])
    probs = self.sigmoid(logits)

    return probs


In [10]:
def train():
  model.train()

  for index, batch in enumerate(train_dataloader):
    Xs, y = batch
    probs = model(Xs)
    loss = loss_fn(probs.squeeze().type(torch.FloatTensor), y.type(torch.FloatTensor))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (index + 1) % 1 == 0:
      print(f'done {index + 1} from {len(train_dataloader)}')
    


In [11]:
def evaluate():
  model.eval()
  all_preds, all_labels = [], []

  for index, batch in enumerate(val_dataloader):
    Xs, y = batch
    probs = model(Xs)

    preds = (probs > 0.5) * 1
    # print(preds.squeeze().tolist(), y.tolist())
    all_preds += preds.squeeze().tolist()
    all_labels += y.tolist()

  acc = len([0 for i in range(len(all_preds)) if all_preds[i] == all_labels[i]]) * 100. / len(all_preds)

  print(acc)

In [13]:
train_dataset = CommentDataset('cleaned.csv', 'train')
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataset = CommentDataset('cleaned.csv', 'val')
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)
model = CommentModel()

epochs = 5
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(epochs):
  train()
  evaluate()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


done 1 from 100
done 2 from 100
done 3 from 100
done 4 from 100
done 5 from 100
done 6 from 100
done 7 from 100
done 8 from 100
done 9 from 100
done 10 from 100
done 11 from 100
done 12 from 100
done 13 from 100
done 14 from 100
done 15 from 100
done 16 from 100
done 17 from 100
done 18 from 100
done 19 from 100
done 20 from 100
done 21 from 100
done 22 from 100
done 23 from 100
done 24 from 100
done 25 from 100
done 26 from 100
done 27 from 100
done 28 from 100
done 29 from 100
done 30 from 100
done 31 from 100
done 32 from 100
done 33 from 100
done 34 from 100
done 35 from 100
done 36 from 100
done 37 from 100
done 38 from 100
done 39 from 100
done 40 from 100
done 41 from 100
done 42 from 100
done 43 from 100
done 44 from 100
done 45 from 100
done 46 from 100
done 47 from 100
done 48 from 100
done 49 from 100
done 50 from 100
done 51 from 100
done 52 from 100
done 53 from 100
done 54 from 100
done 55 from 100
done 56 from 100
done 57 from 100
done 58 from 100
done 59 from 100
done 6

In [14]:
len(val_dataset)

200

In [16]:
d

{'attention_mask': tensor([  101, 10338,  6342,  9102,  2077,  2017, 18138,  2105,  2006,  2026,
          2147,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     

In [17]:
for i in range(8):
  d = train_dataset[i][0]
  tok = train_dataset.tokenizer
  print(max(d['input_ids']))

tensor(18138)
tensor(29446)
tensor(10055)
tensor(29050)
tensor(18294)
tensor(26668)
tensor(21746)
tensor(22052)
