<a href="https://colab.research.google.com/github/SuminBae97/PytorchStudy/blob/main/BERT_sentenceclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 30.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 28.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [3]:
import pandas as pd
from transformers import BertTokenizer
import torch
import numpy as np
path = '/content/drive/MyDrive/Colab Notebooks/bbc-text.csv'
df = pd.read_csv(path)


In [4]:
df.head(5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [14]:
df.iloc[0].text

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [None]:
df.groupby(['category']).size().plot.bar()

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [7]:
example_text = 'I will watch momentum tonight'
bert_input = tokenizer(example_text,padding='max_length',max_length=10,truncation=True, return_tensors='pt')

In [8]:
ex_text = tokenizer.decode(bert_input.input_ids[0])

In [9]:
ex_text

'[CLS] I will watch momentum tonight [SEP] [PAD] [PAD] [PAD]'

In [10]:
print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

tensor([[  101,   146,  1209,  2824, 11550,  3568,   102,     0,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])


In [21]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [16]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

1780 222 223


In [17]:
df_train

Unnamed: 0,category,text
414,politics,brown and blair face new rift claims for the u...
420,business,small firms hit by rising costs rising fuel ...
1644,entertainment,spirit awards hail sideways the comedy sideway...
416,tech,microsoft releases patches microsoft has warne...
1232,sport,arsenal through on penalties arsenal win 4-2 o...
...,...,...
801,sport,ireland 19-13 england ireland consigned englan...
1774,tech,warning over tsunami aid website net users are...
512,tech,digital guru floats sub-$100 pc nicholas negro...
633,entertainment,gallery unveils interactive tree a christmas t...


<h1>Bert modeling</h1>

In [18]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    
    def __init__(self,dropout=0.5):
        super(BertClassifier,self).__init__()
        
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        #fine tuning with classification layer
        self.linear = nn.Linear(768,5)
        self.relu = nn.ReLU()

    def forward(self,input_id,mask):
        _,pooled_output = self.bert(input_ids=input_id, attention_mask = mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        
        return final_layer



In [19]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                #y value
                train_label = train_label.to(device)

                mask = train_input['attention_mask'].to(device)
                
                input_id = train_input['input_ids'].squeeze(1).to(device)

                # model : bertClassifier()
                # model의 input은 input_id: [100,304,54..........]
                # attention_mask 
                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                   
                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
#train(model, df_train, df_val, LR, EPOCHS)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 890/890 [06:26<00:00,  2.31it/s]


Epochs: 1 | Train Loss:  0.749                 | Train Accuracy:  0.377                 | Val Loss:  0.507                 | Val Accuracy:  0.847


100%|██████████| 890/890 [06:27<00:00,  2.30it/s]


Epochs: 2 | Train Loss:  0.301                 | Train Accuracy:  0.927                 | Val Loss:  0.154                 | Val Accuracy:  0.991


100%|██████████| 890/890 [06:27<00:00,  2.30it/s]


Epochs: 3 | Train Loss:  0.116                 | Train Accuracy:  0.979                 | Val Loss:  0.076                 | Val Accuracy:  0.991


100%|██████████| 890/890 [06:29<00:00,  2.29it/s]


Epochs: 4 | Train Loss:  0.063                 | Train Accuracy:  0.988                 | Val Loss:  0.046                 | Val Accuracy:  0.991


100%|██████████| 890/890 [06:27<00:00,  2.29it/s]


Epochs: 5 | Train Loss:  0.037                 | Train Accuracy:  0.996                 | Val Loss:  0.035                 | Val Accuracy:  0.991


In [23]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.991


In [27]:
train= Dataset(df_train)




# train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
# val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

# use_cuda = torch.cuda.is_available()
# device = torch.device("cuda" if use_cuda else "cpu")

# criterion = nn.CrossEntropyLoss()
# optimizer = Adam(model.parameters(), lr= learning_rate)

# for train_input, train_label in tqdm(train_dataloader):

#     #y value
#     train_label = train_label.to(device)

#     mask = train_input['attention_mask'].to(device)
    
#     input_id = train_input['input_ids'].squeeze(1).to(device)



In [None]:
train.texts[0]

In [74]:
train.labels[0]

4

In [89]:
 train_dataloader = torch.utils.data.DataLoader(train, batch_size=3, shuffle=False)
 l_data = list(train_dataloader)

In [94]:

for x,y in train_dataloader:
    print('x',x)
    print(x['token_type_ids'].shape)
    print(x['input_ids'].shape)
    print(x['input_ids'][0].shape, y)
    print(x['input_ids'][1].shape)
    print(x['input_ids'][2].shape)
    i+=1

    if i==1:
        break


x {'input_ids': tensor([[[ 101, 3058, 1105,  ..., 4064, 1103,  102]],

        [[ 101, 1353, 9780,  ...,    0,    0,    0]],

        [[ 101, 4840, 3745,  ...,    0,    0,    0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])}
torch.Size([3, 1, 512])
torch.Size([3, 1, 512])
torch.Size([1, 512]) tensor([4, 0, 1])
torch.Size([1, 512])
torch.Size([1, 512])
