In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so the
# project directory used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
cd /content/drive/MyDrive/MA

/content/drive/MyDrive/MA


In [None]:
!pip install transformers==4.2.0



In [None]:
from torchtext import data
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import time

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  DEVICE = torch.device("cuda")
else:
  DEVICE = torch.device("cpu")
print(DEVICE)

cuda


In [None]:
# Read the labelled tweets. header=0 tells pandas the first row of the file is a
# header, which is then replaced by the two column names given here.
data_csv = pd.read_csv(
    './labeled_data.csv',
    header=0,
    names=['label', 'tweet'],
)
# Optional sanity check (uncomment to view five random rows whose label is 0):
# data_csv.loc[data_csv.label == 0].sample(5)[['tweet', 'label']]

In [None]:
# Extract the tweet texts and their labels as two parallel arrays.
tweets = data_csv['tweet'].values
labels = data_csv['label'].values

# Print both lengths (labels first, then tweets); they are expected to match.
for column_values in (labels, tweets):
  print(len(column_values))

24783
24783


In [None]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased'
# checkpoint; do_lower_case=True requests lower-casing of the input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Determine the padding length: the token count of the longest tweet, as
# produced by tokenizer.encode() (its default output includes the special tokens).
# `default=0` reproduces the previous result (0) for an empty corpus instead of
# raising, and the generator expression avoids leaking loop variables into the
# notebook namespace.
longest_tweet = max((len(tokenizer.encode(t)) for t in tweets), default=0)

# Never exceed the longest sequence the model accepts; anything longer is
# truncated during encoding anyway, and a larger value would overflow the
# model's position embeddings.
max_length = min(longest_tweet, tokenizer.model_max_length)
print(max_length)

481


In [None]:
# Encode every tweet to exactly `max_length` tokens (padded or truncated) and
# collect the token ids together with the matching attention masks.
id_rows, mask_rows = [], []

for tweet in tweets:
  encoded = tokenizer.encode_plus(
      tweet,
      add_special_tokens=True,
      max_length=max_length,
      truncation=True,
      padding='max_length',
      return_tensors='pt',
  )
  id_rows.append(encoded['input_ids'])
  mask_rows.append(encoded['attention_mask'])

# Join the per-tweet tensors along dim 0 into one tensor each.
input_ids = torch.cat(id_rows, dim=0)
attention_masks = torch.cat(mask_rows, dim=0)

In [None]:
# Convert the class labels to a tensor so they can be bundled with the encoded
# inputs. The tensor is built from data_csv rather than from `labels` itself, so
# re-running this cell yields the same result instead of feeding an existing
# tensor back into torch.tensor().
labels = torch.tensor(data_csv.label.values)

In [None]:
# Bundle token ids, attention masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90 % of the examples are used for training, the remainder for evaluation.
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size

# A dedicated, fixed-seed generator makes the partition identical on every run;
# without it each execution evaluates on a different subset and the reported
# metrics cannot be reproduced or compared between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=split_generator,
)

In [None]:
batch_size = 17

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Evaluation batches are read in dataset order.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Load pretrained BERT with a fresh 3-class classification head.
# The checkpoint must match the model class: loading "distilbert-base-uncased"
# into BertForSequenceClassification discards the checkpoint weights (the
# "weights ... were not used" warning), leaving the encoder randomly initialised.
# "bert-base-uncased" is also the checkpoint the tokenizer was loaded from, so
# the vocabulary ids line up with the embedding table.
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3)
bert_model = bert_model.to(DEVICE)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.

In [None]:
# AdamW over all parameters of the model, with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(bert_model.parameters(),lr = 2e-5, eps = 1e-8)

In [None]:
epochs = 4

# The scheduler is stepped once per training batch, so the schedule covers
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def _train_one_epoch(model, dataloader, optimizer, scheduler, device):
  """Run one optimisation pass over `dataloader`.

  Each batch is moved to `device`, the loss returned by the model is
  back-propagated, gradients are clipped to a norm of 1.0, and both the
  optimizer and the learning-rate scheduler are stepped once.

  Returns the mean loss per batch (0.0 for an empty dataloader).
  """
  model.train(True)
  total_loss = 0.0

  for batch_input_ids, batch_mask, batch_labels in dataloader:
    batch_input_ids = batch_input_ids.to(device)
    batch_mask = batch_mask.to(device)
    batch_labels = batch_labels.to(device)

    model.zero_grad()

    # With labels supplied and return_dict=False the model returns a
    # (loss, logits) tuple; only the loss is needed while training.
    loss, _ = model(input_ids=batch_input_ids, attention_mask=batch_mask,
                    labels=batch_labels, return_dict=False)

    total_loss += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

  return total_loss / max(len(dataloader), 1)


def _evaluate(model, dataloader, device):
  """Score `model` on `dataloader` without tracking gradients.

  Returns a (mean loss per batch, accuracy) tuple. Both values are 0.0 when
  the dataloader yields no examples, instead of raising ZeroDivisionError.
  """
  model.train(False)
  total_loss, num_correct, num_total = 0.0, 0, 0

  with torch.no_grad():
    for batch_input_ids, batch_mask, batch_labels in dataloader:
      batch_input_ids = batch_input_ids.to(device)
      batch_mask = batch_mask.to(device)
      batch_labels = batch_labels.to(device)

      loss, logits = model(input_ids=batch_input_ids, attention_mask=batch_mask,
                           labels=batch_labels, return_dict=False)

      # The predicted class is the index of the largest logit in each row.
      predicted = torch.argmax(logits, dim=1)

      num_correct += (predicted == batch_labels).sum().item()
      num_total += len(batch_labels)
      total_loss += loss.item()

  mean_loss = total_loss / max(len(dataloader), 1)
  accuracy = num_correct / num_total if num_total else 0.0
  return mean_loss, accuracy


# Fine-tune for `epochs` passes. After every pass the model is scored on
# test_dataloader and saved whenever its accuracy beats the best seen so far.
# NOTE(review): the same held-out split is used both to pick the checkpoint and
# to report accuracy, so the reported figure is optimistic; a separate
# validation split would be needed for an unbiased test score.
best_acc = 0
for e in range(epochs):
  print('training {} epoch...'.format(e+1))
  start_time = time.time()

  # Losses are per-batch means (not sums), so the training and evaluation
  # figures are comparable even though the two loaders differ in length.
  train_loss = _train_one_epoch(bert_model, train_dataloader, optimizer, scheduler, DEVICE)
  eval_loss, acc = _evaluate(bert_model, test_dataloader, DEVICE)

  sec = time.time()-start_time
  if acc > best_acc:
    best_acc = acc
    # Serialises the whole model object via pickle; loading it later requires
    # the same class definitions to be importable.
    torch.save(bert_model, 'model.bert')

  print('train_loss: {}, eval_loss: {}, accuracy: {}'.format(train_loss,eval_loss,acc))
  print('{} seconds used......'.format(sec))
      
    




training 1 epoch...
train_loss: 558.6435757577419, eval_loss: 52.7977653965354, accuracy: 0.8914885034288019
2274.631791830063 seconds used......
training 2 epoch...
train_loss: 413.4080745726824, eval_loss: 47.525754917412996, accuracy: 0.890278338039532
2274.6683661937714 seconds used......
training 3 epoch...
train_loss: 368.3461860753596, eval_loss: 45.689723927527666, accuracy: 0.8979427188382412
2274.705092191696 seconds used......
training 4 epoch...
train_loss: 327.3096639504656, eval_loss: 44.77511414140463, accuracy: 0.8967325534489714
2275.289024591446 seconds used......
