In [1]:
#import packages

import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
#Import the libraries
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values
from sklearn.preprocessing import OneHotEncoder         # Encode categorical features

# Import the csv files

In [3]:
# Load the competition data; the `id` column becomes the index of each frame.
df_train=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col='id')
df_test=pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col='id')

The dataset provided has been split into train and test datasets. 
The train dataset has 7613 rows and 4 columns. The target column is part of the train dataset.
The test dataset has 3263 rows and 3 columns. The target column is not part of the test dataset; it needs to be predicted and submitted as part of the competition.

In [4]:
# Report (rows, columns) for both frames.
print(f"size of train dataset {df_train.shape}")
print(f"size of test dataset {df_test.shape}")

size of train dataset (7613, 4)
size of test dataset (3263, 3)


# Test train split

Our model is to be developed on the provided train dataset. So we further split train dataset into train and validation datasets to create and validate different models.

### Split the train dataset into predictor and response variables

In [5]:
# Positional split: this relies on the target being the LAST column of df_train.
X = df_train.iloc[:,:-1] #the predictor columns are all columns except the target column
y = df_train.iloc[:,-1:] #the target column is the last column; the `-1:` slice keeps y as a one-column DataFrame

In [6]:
# Hold out 20% of the labelled data for validation; random_state fixes the shuffle
# so the split is the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, 
                                                                train_size=0.8, 
                                                                test_size=0.2, 
                                                                random_state=0)

In [7]:
# Print the shape of each piece of the train/validation split.
for description, frame in (
    ("X_train: predictors", X_train_full),
    ("X_valid: predictors", X_valid_full),
    ("y_train: response", y_train),
    ("y_valid: response", y_valid),
):
    print(f"size of {description}", frame.shape)

size of X_train: predictors (6090, 3)
size of X_valid: predictors (1523, 3)
size of y_train: response (6090, 1)
size of y_valid: response (1523, 1)


# EDA

Exploratory analysis on our new training dataset

In [8]:
# Column dtypes and non-null counts of the training predictors.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6090 entries, 1999 to 3924
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   6044 non-null   object
 1   location  4075 non-null   object
 2   text      6090 non-null   object
dtypes: object(3)
memory usage: 190.3+ KB


We see that the `keyword` and `location` columns contain missing values, while `text` is always present.

## check the balance of the dataset

In [9]:
# Number of training tweets in each target class.
y_train.groupby(['target']).size()

target
0    3456
1    2634
dtype: int64

So we see that the dataset is reasonably balanced: about 57% of the training tweets are non-disaster (0) and 43% are disaster (1).

## check examples of disaster and non disaster tweets

In [10]:
# First ten tweets labelled 0 (not a disaster).
df_train[df_train['target']==0].head(10)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23,,,What's up man?,0
24,,,I love fruits,0
25,,,Summer is lovely,0
26,,,My car is so fast,0
28,,,What a goooooooaaaaaal!!!!!!,0
31,,,this is ridiculous....,0
32,,,London is cool ;),0
33,,,Love skiing,0
34,,,What a wonderful day!,0
36,,,LOOOOOOL,0


In [11]:
# First ten tweets labelled 1 (disaster).
df_train[df_train['target']==1].head(10)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
8,,,#RockyFire Update => California Hwy. 20 closed...,1
10,,,#flood #disaster Heavy rain causes flash flood...,1
13,,,I'm on top of the hill and I can see a fire in...,1
14,,,There's an emergency evacuation happening now ...,1
15,,,I'm afraid that the tornado is coming to our a...,1


For a person reading the tweets, it's quite easy to tell which ones describe a disaster and which ones do not.

# LSTM model

In [None]:
#import tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

We train our model on only the text column.

In [None]:
# Only the raw tweet text is used as model input; keyword and location are ignored.
training_sentences = X_train_full['text']
validation_sentences = X_valid_full['text']
# Labels are taken unchanged from the earlier train/validation split.
training_labels = y_train
validation_labels = y_valid

In [None]:
# Convert the label containers to NumPy arrays for model.fit.
training_labels_final = np.array(training_labels)
validation_labels_final = np.array(validation_labels)

In [None]:
# Build the vocabulary from the TRAINING tweets only.
# The tokenizer must not be re-created or re-fit on validation text: doing so
# throws away the training vocabulary and leaks held-out data into the features.
# Words that appear only in validation/test tweets are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=100000, oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)

In [None]:
# Map each tweet to a list of word ids, then pad/truncate at the end ('post')
# so every sequence is exactly 25 ids long.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
pad_training = pad_sequences(training_sequences, maxlen=25, padding='post', truncating='post')

# Same transformation for the validation tweets.
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
pad_validation = pad_sequences(validation_sequences, maxlen=25, padding='post', truncating='post')

In [None]:
# create the LSTM model
# Embedding: vocabulary capped at 100000 ids (matches the tokenizer's num_words),
# 16-dimensional vectors. input_length must equal the padded sequence length (25)
# used by pad_sequences; it was previously declared as 20.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(100000, 16, input_length=25),
    # First BiLSTM returns the full sequence so the second BiLSTM can consume it.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    # Single sigmoid unit: probability that the tweet is about a disaster.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# fit the model
# The validation split is evaluated after every epoch; `history` keeps the per-epoch metrics.
history = model.fit(pad_training, training_labels_final, epochs=15, validation_data=(pad_validation, validation_labels_final))

In [None]:
# plot the accuracy
import matplotlib.pyplot as plt
def plot_graph(history, string):
    """Plot a training metric and, when available, its validation counterpart.

    Parameters
    ----------
    history : object with a ``history`` dict mapping metric names to per-epoch
        value lists (as returned by ``model.fit``).
    string : str
        Name of the metric to plot, e.g. ``'accuracy'`` or ``'loss'``. The
        validation curve is looked up under ``'val_' + string``.
    """
    plt.plot(history.history[string], label='train')
    val_key = 'val_' + string
    # The validation series only exists when fit() was given validation data.
    if val_key in history.history:
        plt.plot(history.history[val_key], label='validation')
    plt.xlabel('epoch')
    plt.ylabel(string)
    plt.title(string + ' per epoch')
    plt.legend()
    plt.show()

In [None]:
# Draw one figure per tracked metric.
for metric_name in ('accuracy', 'loss'):
    plot_graph(history, metric_name)

### Predict

In [None]:

# Encode and pad the test tweets with the same tokenizer and length as the training data.
test_sequences = tokenizer.texts_to_sequences(df_test.text)
pad_test = pad_sequences(test_sequences,maxlen=25, padding='post', truncating='post')

In [None]:
# Build the submission frame.
# Score the padded test tweets first, then load the sample submission as a template.
prediction = model.predict(pad_test)
submission = pd.read_csv(
    "/kaggle/input/nlp-getting-started/sample_submission.csv",
    index_col='id',
)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
submission['target'] = (prediction > 0.5).astype(int)

In [None]:
submission

In [12]:
# output the csv file (disabled: the LSTM submission is not written)
# submission.to_csv('submission1_lstm.csv', index=True, header=True)

# BERT

In [None]:
import pandas as pd
# Reload both frames from disk for the BERT section.
df_train=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col='id')
df_test=pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col='id')

Clean the dataset

In [None]:
import re
def clean_text(text):
    """Normalise a tweet to lowercase ASCII letters, apostrophes and spaces.

    Every character that is not ``a-z``, ``A-Z`` or an apostrophe (digits,
    punctuation, whitespace, non-ASCII characters) is replaced by one space,
    so the output has the same length as the input. The result is lowercased.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # This one substitution also covers all non-ASCII characters, so the
    # former second pass that stripped them could never match and was removed.
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    return text.lower()

In [None]:
# Add a cleaned copy of every tweet next to the original text.
df_train['clean_text'] = df_train.text.apply(clean_text)
df_test['clean_text'] = df_test.text.apply(clean_text)

In [None]:
# Keep the columns in a fixed order. The explicit copy makes the result an
# independent frame, so adding columns to it later cannot trigger pandas'
# SettingWithCopyWarning.
df_test = df_test[['keyword','location','text','clean_text']].copy()

In [None]:
# Placeholder labels: the test set has no ground truth, so every row is given 0.
df_test['target']=0

In [None]:
import torch
# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reorder the training columns so that `target` comes last.
df_train=df_train[['keyword','location','text','clean_text','target']]

In [None]:
# Get the lists of sentences and their labels.
# `.values` returns NumPy arrays: cleaned tweet strings and their 0/1 targets.
sentences = df_train.clean_text.values
labels = df_train.target.values

### Load Bert Base Uncased

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
# NOTE: from here on `tokenizer` refers to the BERT tokenizer and no longer
# to the Keras tokenizer used in the LSTM section.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Convert every cleaned training tweet to a list of BERT token ids.
# add_special_tokens=True (the default) wraps each sentence in [CLS] ... [SEP].
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

In [None]:
# Longest encoded tweet, in tokens; used to sanity-check the padding length.
print('Max sentence length: ', max(map(len, input_ids)))

In [None]:
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 64
#Padding the input to the max length that is 64
# Shorter sequences are filled with 0 at the end; longer ones are cut at the end.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

In [None]:
# Creating the attention masks
# 1 marks a real token (id > 0), 0 marks padding; the result is a list of int lists.
attention_masks = (input_ids > 0).astype(int).tolist()

In [None]:
from sklearn.model_selection import train_test_split

# Split ids, attention masks and labels in ONE call so the three stay aligned
# row for row by construction, rather than relying on two separate calls that
# happen to share a random_state. 10% of the data is held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1)

In [None]:
#Converting the input data to the tensor , which can be fed to the model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

# Class labels (0/1) for each tweet.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

# Attention masks aligned with the inputs above.
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#Creating the DataLoader which will help us to load data into the GPU/CPU
batch_size = 32

# Create the DataLoader for our training set.
# RandomSampler reshuffles the training examples each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
# SequentialSampler keeps the validation examples in their original order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

### Load pretrained BERT model

In [None]:
#Loading the pre-trained BERT model from huggingface library

# AdamW is imported from PyTorch: the copy that used to ship with `transformers`
# is deprecated and missing from recent releases. Note that torch's AdamW
# defaults to weight_decay=0.01, whereas the transformers class defaulted to 0.0.
from torch.optim import AdamW
from transformers import BertForSequenceClassification, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,              # binary task: disaster / not disaster
    output_attentions = False,   # do not return attention weights
    output_hidden_states = False, )  # do not return all hidden states

# Move the model to the device selected earlier, so the model and the batches
# sent with `.to(device)` always live on the same device.
model.to(device)

In [None]:
# AdamW over all model parameters with a small fine-tuning learning rate.
# weight_decay is pinned to 0.0 explicitly so the training recipe does not
# silently change with the default of whichever AdamW implementation is in scope.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8,
                  weight_decay = 0.0
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 4
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_dataloader) * epochs

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)
scheduler

In [None]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds  -- 2-D array of per-class scores, one row per example.
    labels -- NumPy array of true class ids (flattened before comparison).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### Model training

In [None]:
import random

# Seed every random number generator in play so runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, kept for plotting the learning curve.
loss_values = []

for epoch_i in range(0, epochs):

    # ---------------- Training ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Training mode (affects layers such as dropout).
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels); move all to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss as the first output.
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        loss = outputs[0]

        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ---------------- Validation ----------------
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode (disables dropout).
    model.eval()

    # Tracking variables: number of correctly classified examples, number of
    # batches seen and number of examples seen.
    eval_correct = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to the device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():

            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Without `labels`, the first output is the logits.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch by its size: the last batch is usually smaller, so a
        # plain mean of per-batch accuracies would over-weight its examples.
        batch_examples = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run (example-weighted).
    eval_accuracy = eval_correct / max(nb_eval_examples, 1)
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

In [None]:
# Average training loss of each epoch, in order.
print(loss_values) 

In [None]:
# Reuse `sentences`/`labels` for the test set; the labels are the placeholder
# zeros stored in df_test['target'], not real ground truth.
sentences = df_test.clean_text.values
labels = df_test.target.values

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# add_special_tokens=True adds '[CLS]' at the start and '[SEP]' at the end.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

In [None]:
# Pad our input tokens (same settings as the training data: pad with 0 and
# pad/truncate at the end so every sequence has MAX_LEN ids).
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

# Create attention masks: 1 for each real token, 0 for padding.
# Integer masks are used, matching the masks built for the training data.
attention_masks = []

for seq in input_ids:
    seq_mask = [int(token_id > 0) for token_id in seq]
    attention_masks.append(seq_mask)


In [None]:
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# Placeholder labels (all zeros), carried along so batches unpack into three tensors.
prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  

In [None]:
# Create the DataLoader.
# SequentialSampler keeps the test rows in their original order, so the
# predictions can later be matched back to df_test.index.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Switch the model to evaluation mode before predicting.
model.eval()

# Accumulators filled batch by batch in the next cell. Re-run this cell before
# re-running the prediction loop, otherwise results are appended twice.
predictions , true_labels = [], []

In [None]:
# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  # No labels were passed to the model, so the first output holds the logits.
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  # (the "true" labels here are the placeholder values from the test frame)
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
# Stack the per-batch logit arrays into one (n_examples, n_classes) array and
# take the highest-scoring class for every test tweet.
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

In [None]:
# Sanity check: there should be one prediction per test tweet.
len(flat_predictions)

In [None]:
# Attach the test ids as the index. NOTE: this rebinds `predictions` from the
# list of per-batch logits to a DataFrame.
predictions=pd.DataFrame(flat_predictions,index=df_test.index)

In [None]:
# The single unnamed column (0) becomes `target`.
predictions=predictions.rename(columns={0: "target"})

In [None]:
# write output predictions
# index=True writes the `id` index as the first column. The previous `index=id`
# passed Python's builtin `id` function, which only worked because it is truthy.
predictions.to_csv('submission2_bert.csv', index=True, header=True)