<a href="https://colab.research.google.com/github/Suarez94/Kaggle/blob/main/NLP%20with%20Disaster%20Tweets/NLP_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install Hugging Face `transformers` into the running environment.
# NOTE(review): the version is unpinned, while a later cell imports `AdamW` from
# `transformers`; confirm that name exists in the installed release and consider pinning.
!pip install transformers

In [2]:
from google.colab import drive
import os 
# Mount Google Drive under /content/drive; the data folder used later (FOLDER_PATH)
# lives below this mount point.
drive.mount('/content/drive', force_remount=True)
# Marker that the notebook runs on Colab; no other visible cell reads it.
COLAB = True
print("Note: using Google CoLab")

Mounted at /content/drive
Note: using Google CoLab


In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig # model
from transformers import BertTokenizer # tokenizer
from keras.preprocessing.sequence import pad_sequences # add padding
from sklearn.model_selection import train_test_split # split dataset for train and test
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler # create data batches
from transformers import get_linear_schedule_with_warmup # schedule for training BERT (updating weights etc)
import time
import datetime
import random
import re
import string
import tensorflow as tf

def time_elapsed(sec):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Args:
        sec: Non-negative duration in seconds (int or float).

    Returns:
        A string such as ``"0:01:17.91"``. Hours are unpadded; minutes are
        always in the range 00-59 and seconds in the range 00.00-59.99.
    """
    # Work in whole hundredths of a second so that rounding can never
    # produce a displayed seconds field of "60.00".
    total_centis = int(round(sec * 100))
    total_seconds, centis = divmod(total_centis, 100)
    total_minutes, s = divmod(total_seconds, 60)
    # Minutes must wrap at 60 once the duration exceeds one hour.
    h, m = divmod(total_minutes, 60)
    return "{}:{:02d}:{:02d}.{:02d}".format(h, m, s, centis)

In [41]:
# Folder on the mounted Drive that holds the competition CSV files.
FOLDER_PATH = '/content/drive/My Drive/projects/NLP with Disaster Tweets'

# Read both splits in one pass; the order of the file names fixes which
# frame is bound to `train` and which to `test`.
train, test = (
    pd.read_csv(os.path.join(FOLDER_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## 1.0 Data cleaning

In [42]:
# Lower-casing helper used on the tweet text of both splits.
def lower_case(text: str) -> str:
    """Return ``text`` converted to lower case via ``str.lower``."""
    return text.lower()
# Lower-case the tweet text of both splits, element by element.
train["text"] = train["text"].apply(lower_case)
test["text"] = test["text"].apply(lower_case)

def clean_text(text):
    """Strip noise from a tweet before tokenisation.

    The substitutions run in this order:
      1. text enclosed in square brackets,
      2. URLs (``http://``, ``https://`` or ``www.`` prefixed),
      3. angle-bracket tags,
      4. every character in ``string.punctuation``,
      5. newlines (replaced by a single space so neighbouring words stay apart),
      6. any word that contains a digit.

    URLs are removed before punctuation because stripping punctuation first
    would destroy the ``://`` and ``.`` the URL pattern relies on.

    Args:
        text: The tweet as a string.

    Returns:
        The cleaned string. Whitespace left behind by removed fragments is
        not collapsed.
    """
    # Raw strings keep regex escapes such as \S and \w away from Python's own
    # string-escape processing.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A space, not an empty string: otherwise "fire\nnear" becomes "firenear".
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run the regex clean-up over the tweet text of both splits.
train["text"] = train["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)

# In this case I won't use the keyword or location columns, only the tweet text.

## 2.0 Data engineering

In [40]:
# Fill NaN
def fill_cols(df):
    """Return a copy of ``df`` with every missing value replaced by the string 'None'.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Args:
        df: DataFrame whose missing values should be filled.

    Returns:
        A new DataFrame with the same columns and index as ``df`` and no
        missing values.
    """
    return df.fillna('None')

# Replace missing values in both splits with the string 'None' so the string
# concatenation below never meets a NaN.
train = fill_cols(train)
test = fill_cols(test)

# Add "location" to text: each tweet becomes 'location <location>. <text>'.
# NOTE(review): this cell is not idempotent - every re-run prepends another
# 'location ...' prefix because `text` is rebuilt from itself.
# NOTE(review): the data-cleaning cell says keyword/location are not used, and this
# cell's execution count (40) precedes the load cell's (41), so the saved run
# presumably trained without the prefix - confirm which behaviour is intended.
# NOTE(review): in notebook order this runs after lower-casing/cleaning, so the
# location values themselves are never passed through those steps.
train.text = 'location '+train.location + '. ' + train.text
test.text = 'location '+test.location + '. ' + test.text

In [43]:
# Pull the model inputs (tweet text) and targets out of the training frame.
texts = train["text"].values
labels = train["target"].values

In [None]:
# Select the device PyTorch will run on.
#
# Only PyTorch is queried here: the model is a PyTorch module, and probing the GPU
# through TensorFlow as well can make TensorFlow reserve GPU memory that the
# training loop then cannot use. A missing GPU is reported instead of aborting,
# so the CPU fallback below is actually reachable.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU in use:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('CPU in use')


## 3.0 Loading model and tokenizer


In [None]:
# Tokenizer matching the uncased checkpoint; do_lower_case=True lower-cases input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2,                  # The number of output labels -- 2 for binary classification.
    output_attentions=False,       # Whether the model returns attention weights.
    output_hidden_states=False,    # Whether the model returns all hidden states.
)

# Move the model to the device chosen in the device-selection cell rather than
# hard-coding CUDA, so the same cell also works on a CPU-only runtime.
model.to(device)

## 3.1 Format inputs

In [48]:
MAX_LEN = 64
batch_size = 32

# Step 1 - tokenise every tweet into vocabulary ids; add_special_tokens=True
# adds the '[CLS]' and '[SEP]' markers.
input_ids = [
    tokenizer.encode(tweet, add_special_tokens=True)
    for tweet in texts
]

# Step 2 - bring every sequence to exactly MAX_LEN ids: shorter ones are padded
# at the end with id 0, longer ones are cut at the end.
# NOTE(review): cutting at the end presumably also removes the closing '[SEP]'
# of over-long tweets - confirm whether that matters here.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    truncating="post",
    padding="post",
)

# Step 3 - attention masks: 1 for a real token (id > 0), 0 for padding (id 0).
attention_masks = [
    [int(token_id > 0) for token_id in sequence]
    for sequence in input_ids
]

# Step 4 - split ids, masks and labels in a single call so all three are
# shuffled and divided with the same indices (90 % train / 10 % validation).
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=44, test_size=0.1,
)

# Step 5 - convert everything to PyTorch tensors, the format the model consumes.
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches are read sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)

## 3.2. Training

In [51]:
# Fine-tuning hyper-parameters.
epochs = 10
learning_rate = 5e-5
epsilon = 1e-8

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# The training loop advances the scheduler once per batch, so the schedule
# spans every batch of every epoch.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    n_correct = (predicted_classes == expected_classes).sum()
    return n_correct / expected_classes.size

In [52]:
# Seed every random number generator used below: Python's `random`, NumPy,
# and PyTorch (CPU generator and all CUDA devices).
# NOTE(review): the model was loaded in an earlier cell, before these seeds are
# set, so any random initialisation done while loading is presumably not
# covered - confirm if exact reproducibility matters.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in epoch order.
loss_values = []

for epoch_i in range(epochs):

    # ---------------- Training ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (batch 0 is skipped).
        if step % 40 == 0 and step != 0:
            elapsed = time_elapsed(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch holds (input ids, attention mask, labels); move all to the device.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is used as the loss.
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(time_elapsed(time.time() - t0)))

    # ---------------- Validation ----------------
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    # Accumulate correct predictions weighted by batch size, so a smaller final
    # batch does not skew the reported accuracy.
    eval_accuracy = 0
    nb_eval_examples = 0
    for batch in validation_dataloader:

        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels, the first output holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(time_elapsed(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    215.    Elapsed: 0:00:14.55.
  Batch    80  of    215.    Elapsed: 0:00:29.41.
  Batch   120  of    215.    Elapsed: 0:00:44.08.
  Batch   160  of    215.    Elapsed: 0:00:58.53.
  Batch   200  of    215.    Elapsed: 0:01:12.82.

  Average training loss: 0.26
  Training epcoh took: 0:01:17.91

Running Validation...
  Accuracy: 0.83
  Validation took: 0:00:02.83

Training...
  Batch    40  of    215.    Elapsed: 0:00:14.34.
  Batch    80  of    215.    Elapsed: 0:00:28.82.
  Batch   120  of    215.    Elapsed: 0:00:43.34.
  Batch   160  of    215.    Elapsed: 0:00:57.82.
  Batch   200  of    215.    Elapsed: 0:01:12.24.

  Average training loss: 0.18
  Training epcoh took: 0:01:17.37

Running Validation...
  Accuracy: 0.82
  Validation took: 0:00:02.87

Training...
  Batch    40  of    215.    Elapsed: 0:00:14.36.
  Batch    80  of    215.    Elapsed: 0:00:28.78.
  Batch   120  of    215.    Elapsed: 0:00:43.22.
  Batch   160  of    215.    Elapsed: 0:0

## 4.0 Prediction

In [66]:
test_pred = test.text.values

# Tokenise every test tweet; add_special_tokens=True adds '[CLS]' and '[SEP]'.
input_ids = [
    tokenizer.encode(tweet, add_special_tokens=True)
    for tweet in test_pred
]

# Pad / cut every sequence at the end to exactly MAX_LEN ids.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

# Attention masks: 1.0 for a real token (id > 0), 0.0 for padding.
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# One tweet per forward pass, so each entry appended to `predictions` below
# holds the logits of a single example.
batch_size = 1

prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    sampler=prediction_sampler,
    batch_size=batch_size,
)

# Per-batch logits, in test-set order (the sampler is sequential).
predictions = []

model.eval()
for batch in prediction_dataloader:
    b_input_ids, b_input_mask = (t.to(device) for t in batch)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )

    logits = outputs[0].detach().cpu().numpy()
    predictions.append(logits)

In [100]:
# Turn the collected logits into one class id per test example.
# All per-batch logit arrays are stacked along the example axis first, so this
# works for any prediction batch size and avoids assigning a size-1 array into
# a scalar slot.
if predictions:
    pred_labels = np.argmax(np.concatenate(predictions, axis=0), axis=1)
else:
    # No batches were produced: keep an empty label array.
    pred_labels = np.zeros(0)

pred_labels = pred_labels.astype('int8')

In [99]:
# Build the submission frame: the test ids paired with the predicted labels.
submission = pd.DataFrame({
    "id": test.id, 
    "target": pred_labels
})
# Write the file into the same Drive folder as the input CSVs, without the index column.
submission.to_csv(os.path.join(FOLDER_PATH, 'submission.csv'), index=False)

# NOTE(review): re-reading train.csv here replaces `train` with the raw file and
# discards every in-notebook change made to it - looks like a leftover; confirm
# it is still wanted.
train = pd.read_csv(os.path.join(FOLDER_PATH, 'train.csv'))


0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64