# Natural Language Processing with Disaster Tweets

*In this Notebook I have worked on the Natural Language Processing with Disaster tweets in which we do the sentiment analysis of whether a tweet is a Disaster based or not by building models based on neural network model  **distilBERT** which is a smaller version of BERT, a state of the art Language Processing model*

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
!pip install transformers
#from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel

[0m

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

*We clean the tweet messages with the stop words*

In [6]:
nltk.download('stopwords')
def clean_stopwords_shortwords(w):
    stopwords_list= stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


*The unicode strings are converted to ascii strings*

In [9]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


*All special characters are removed with the function below*

In [10]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w

In [11]:
train['text'] = train['text'].map(preprocess_sentence)

In [12]:
train['text']

0               deeds reason earthquake may allah forgive
1                      forest fire near ronge sask canada
2       residents asked shelter place notified officer...
3       people receive wildfires evacuation orders cal...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    aria ahrary thetawniest control wild fires cal...
7610                   utc volcano hawaii http zdtoyd ebj
7611    police investigating bike collided car little ...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

*The  tokenizer of the **distilbert-base-uncased** is imported from the transformers package and pretrained model is downloaded from the transformers package*

In [13]:
from transformers import DistilBertTokenizer
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-05-26 18:30:16.142848: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-26 18:30:16.144653: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-26 18:30:16.145687: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-26 18:30:16.147065: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [15]:
max_len=32
sentences=train['text']
labels=train['target']
len(sentences),len(labels)

(7613, 7613)

In [16]:
dbert_tokenizer.tokenize(sentences[0])

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive']

In [17]:
dbert_inp=dbert_tokenizer.encode_plus(sentences[0],add_special_tokens = True,max_length =20,pad_to_max_length = True,truncation=True)
dbert_inp



{'input_ids': [101, 15616, 3114, 8372, 2089, 16455, 9641, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

*The pretrained model needs two inputs, input id as tokens and attention masks, so every input needs to be padded, so this is done on a single sentence and we take a look at it*

In [18]:
id_inp=np.asarray(dbert_inp['input_ids'])
mask_inp=np.asarray(dbert_inp['attention_mask'])
out=dbert_model([id_inp.reshape(1,-1),mask_inp.reshape(1,-1)])
type(out),out

(transformers.modeling_tf_outputs.TFBaseModelOutput,
 TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 20, 768), dtype=float32, numpy=
 array([[[-0.17400928,  0.05031742, -0.25087613, ..., -0.007099  ,
           0.32299748,  0.13289091],
         [ 0.26510975,  0.2327715 ,  0.22081394, ...,  0.0275039 ,
          -0.03581193,  0.10404292],
         [-0.43689352,  0.18955605,  0.25972164, ..., -0.1418427 ,
          -0.06919396,  0.14695333],
         ...,
         [ 0.0622292 ,  0.17262122,  0.17678715, ...,  0.15874153,
          -0.04708431, -0.08581738],
         [-0.11230349,  0.3852726 ,  0.34410006, ..., -0.10992186,
          -0.05016324, -0.10346158],
         [ 0.03663736,  0.2028245 ,  0.23218796, ...,  0.00811235,
           0.00435018, -0.0390332 ]]], dtype=float32)>, hidden_states=None, attentions=None))

In [None]:
id_inp

*We create a NN model based by adding the distilbert layer to the model*

In [19]:
def create_model():
    inps = Input(shape = (max_len,), dtype='int64')
    masks= Input(shape = (max_len,), dtype='int64')
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout= Dropout(0.5)(dense)
    pred = Dense(2, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    print(model.summary())
    return model   

In [20]:
model=create_model()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           tf_distil_bert_model[0][0]   

*We create input token ids and attention masks for all the sentences and take it into two variables **input_ids** and **attention_masks** variables*

In [21]:
input_ids=[]
attention_masks=[]

for sent in sentences:
    dbert_inps=dbert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =max_len,pad_to_max_length = True,return_attention_mask = True,truncation=True)
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)

In [None]:
  len(input_ids),len(attention_masks),len(labels)

In [22]:
#train test split is done 

train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))



Train inp shape (6090, 32) Val input shape (1523, 32)
Train label shape (6090,) Val label shape (1523,)
Train attention mask shape (6090, 32) Val attention mask shape (1523, 32)


In [23]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [24]:
#callbacks= [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]
model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [25]:
history=model.fit([train_inp,train_mask],train_label,batch_size=16,epochs=5,validation_data=([val_inp,val_mask],val_label))


2022-05-26 18:31:17.554520: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5


  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
#input_ids.shape
attention_masks.shape

(7613, 32)

In [34]:
input_ids_test=[]
attention_masks_test=[]


for sent in test['text']:
    dbert_inps_test=dbert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =max_len,pad_to_max_length = True,return_attention_mask = True,truncation=True)
    input_ids_test.append(dbert_inps_test['input_ids'])
    attention_masks_test.append(dbert_inps_test['attention_mask'])

In [36]:
input_ids_test=np.asarray(input_ids_test)
attention_masks_test=np.array(attention_masks_test)
y_predict = model.predict([input_ids_test,attention_masks_test])

In [37]:
predict = np.argmax(y_predict,axis=1) 

In [38]:
predict

array([1, 1, 1, ..., 1, 0, 1])

In [39]:
df_submission = pd.DataFrame({'id':test['id'].values,
                             'target':predict})

In [40]:
df_submission.to_csv('submission.csv', index=False)