<a href="https://colab.research.google.com/github/VeereshShringari/NLP-DHS-SRK/blob/master/8_TextClassification_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Deep Learning Models using Word Embeddings

In [4]:
# Mount Google Drive at /content/drive so the data files referenced by the
# later cells (paths under "/content/drive/My Drive/...") can be read.
# Running this cell prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, GRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

### Data Preparation

In [6]:
data_path = "/content/drive/My Drive/DHS2019_Workshop/Disaster/"
df = pd.read_csv(data_path + "socialmedia_disaster_tweets.csv", encoding='iso-8859-1')

# Keep only the annotation column and the tweet text; the task is to predict
# whether a tweet is labelled as relevant.
df = df[["choose_one", "text"]].rename(columns={"choose_one": "label"})

# Drop rows whose label is anything other than the two target classes.
is_target_class = df["label"].isin(["Relevant", "Not Relevant"])
df = df.loc[is_target_class].reset_index(drop=True)
df.head()

Unnamed: 0,label,text
0,Relevant,Just happened a terrible car crash
1,Relevant,Our Deeds are the Reason of this #earthquake M...
2,Relevant,"Heard about #earthquake is different cities, s..."
3,Relevant,"there is a forest fire at spot pond, geese are..."
4,Relevant,Forest fire near La Ronge Sask. Canada


### Tokenization

In [0]:
## some config values
# in this first model the embedding layer is learned from scratch
# (no pre-trained vectors)
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a tweet to use
# for short texts 20-30 is usually enough; otherwise pick roughly the
# 95th percentile of the text lengths

## fill up the missing values
# fillna() has to run BEFORE astype(str): astype(str) turns NaN into the
# literal string "nan", after which there is nothing left for fillna to replace.
X = df["text"].fillna("_na_").astype(str).values

## Tokenize the sentences
# Only the `max_features` most frequent words get their own index; everything
# else is mapped to the '<UNK>' token.
tokenizer = Tokenizer(num_words=max_features, oov_token='<UNK>')
tokenizer.fit_on_texts(list(X))
X = tokenizer.texts_to_sequences(X)

## Pad the sentences
# Every sequence is brought to exactly `maxlen` entries (zeros are used as filler).
X = pad_sequences(X, maxlen=maxlen)

In [11]:
# Display the padded integer sequences produced by the tokenization cell.
X


array([[    0,     0,     0, ...,  1516,   132,    97],
       [    0,     0,     0, ...,  3840,    89,    42],
       [    0,     0,     0, ...,   651,  1517,   275],
       ...,
       [    0,     0,     0, ...,     2,     3, 29298],
       [    0,     0,     0, ...,     2,     3,  8058],
       [    0,     0,     0, ...,    76,   260,  3221]], dtype=int32)

In [0]:
# Encode the two class names as integers: 0 = "Not Relevant", 1 = "Relevant".
label_map = {"Not Relevant":0, "Relevant":1}
y = df["label"].map(label_map).values

### GPU Availability

Setting up GPU Colab: https://colab.research.google.com/notebooks/gpu.ipynb

In [10]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device and stop right away
# if it is not the expected '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


### Model Function

In [0]:
def run_model(X, y, val_X, val_y, batch_size=128, n_epochs=1):
  """Build, train and score a bidirectional-GRU binary classifier.

  The embedding layer is created with `max_features` rows of width
  `embed_size` and is trained together with the rest of the network.
  `maxlen`, `max_features` and `embed_size` are read from module-level
  variables.

  Args:
    X, y: training inputs and targets passed to `model.fit`.
    val_X, val_y: validation inputs and targets; used as `validation_data`
      while fitting, and `val_X` is scored afterwards.
    batch_size: mini-batch size used for training.
    n_epochs: number of training epochs.

  Returns:
    Tuple `(model, pred_val_y)`: the fitted model and its predictions
    for `val_X`.
  """
  tokens_in = Input(shape=(maxlen,))
  embedded = Embedding(max_features, embed_size)(tokens_in)
  # Read the sequence in both directions and keep one output per time step.
  encoded = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)
  # Global max pooling collapses the time axis before the dense head.
  pooled = GlobalMaxPool1D()(encoded)
  hidden = Dense(16, activation="relu")(pooled)
  hidden = Dropout(0.1)(hidden)
  prob = Dense(1, activation="sigmoid")(hidden)

  model = Model(inputs=tokens_in, outputs=prob)
  # Binary classification -> binary cross-entropy on the sigmoid output.
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  # The batch size is a hyper-parameter that is worth tuning.
  model.fit(
      X, y,
      batch_size=batch_size,
      epochs=n_epochs,
      validation_data=(val_X, val_y),
  )

  pred_val_y = model.predict([val_X], batch_size=1024, verbose=1)
  return model, pred_val_y

In [13]:
# 5-fold cross-validation: each row is predicted exactly once, by the model
# that did not see it during training (out-of-fold predictions).
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
cv_preds = np.zeros(len(X))
for dev_index, val_index in kf.split(X):
    dev_X, dev_y = X[dev_index], y[dev_index]
    val_X, val_y = X[val_index], y[val_index]

    model, preds_val = run_model(dev_X, dev_y, val_X, val_y)
    # Flatten the model output to 1-D and store it at the rows of this fold.
    cv_preds[val_index] = preds_val.reshape(-1)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples


In [14]:
from sklearn import metrics
# ROC AUC of the out-of-fold predictions in cv_preds against the labels y.
metrics.roc_auc_score(y, cv_preds)

0.8647377160320866

In [15]:
# Threshold the predicted scores at 0.5 to get hard class predictions for the report.
print(metrics.classification_report(y, (cv_preds>0.5)))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84      6187
           1       0.81      0.71      0.76      4673

    accuracy                           0.80     10860
   macro avg       0.81      0.79      0.80     10860
weighted avg       0.80      0.80      0.80     10860



### Pre-trained Embeddings

In [0]:
EMBEDDING_FILE = "/content/drive/My Drive/DHS2019_Workshop/Disaster/w2vec.txt"

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')

# Each line is split on single spaces: first field is the word, the rest is
# its vector. The `with` block makes sure the file handle is closed again.
with open(EMBEDDING_FILE) as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack needs a real sequence of arrays; passing the dict view directly is
# deprecated and rejected by newer NumPy versions.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding value), so the matrix needs
# len(word_index) + 1 rows to have a row for the highest index.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep a random vector drawn with the same
# mean / standard deviation as the pre-trained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


In [0]:
def run_model(X, y, val_X, val_y, batch_size=128, n_epochs=1):
  """Build, train and score the bidirectional-GRU classifier with pre-trained embeddings.

  Same architecture as the earlier `run_model`, except that the embedding
  layer is sized from, and initialised with, the module-level
  `embedding_matrix`. `maxlen` and `embed_size` are also read from
  module-level variables.

  Args:
    X, y: training inputs and targets passed to `model.fit`.
    val_X, val_y: validation inputs and targets; used as `validation_data`
      while fitting, and `val_X` is scored afterwards.
    batch_size: mini-batch size used for training.
    n_epochs: number of training epochs.

  Returns:
    Tuple `(model, pred_val_y)`: the fitted model and its predictions
    for `val_X`.
  """
  tokens_in = Input(shape=(maxlen,))
  # One embedding row per row of embedding_matrix, initialised from it.
  vocab_rows = embedding_matrix.shape[0]
  embedded = Embedding(vocab_rows, embed_size, weights=[embedding_matrix])(tokens_in)
  encoded = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)
  pooled = GlobalMaxPool1D()(encoded)
  hidden = Dense(16, activation="relu")(pooled)
  hidden = Dropout(0.1)(hidden)
  prob = Dense(1, activation="sigmoid")(hidden)

  model = Model(inputs=tokens_in, outputs=prob)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(
      X, y,
      batch_size=batch_size,
      epochs=n_epochs,
      validation_data=(val_X, val_y),
  )

  pred_val_y = model.predict([val_X], batch_size=1024, verbose=1)
  return model, pred_val_y

In [0]:
# Same 5-fold split as before (identical random_state), so the two models
# are scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=2019)
cv_preds = np.zeros(len(X))
for dev_index, val_index in kf.split(X):
    dev_X, dev_y = X[dev_index], y[dev_index]
    val_X, val_y = X[val_index], y[val_index]

    model, preds_val = run_model(dev_X, dev_y, val_X, val_y)
    # Flatten the model output to 1-D and store it at the rows of this fold.
    cv_preds[val_index] = preds_val.reshape(-1)

Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples
Train on 8688 samples, validate on 2172 samples


In [0]:
from sklearn import metrics
# ROC AUC of the out-of-fold predictions in cv_preds against the labels y.
metrics.roc_auc_score(y, cv_preds)

0.8596032471252014

In [0]:
# Threshold the predicted scores at 0.5 to get hard class predictions for the report.
print(metrics.classification_report(y, (cv_preds>0.5)))

              precision    recall  f1-score   support

           0       0.79      0.89      0.83      6187
           1       0.82      0.68      0.75      4673

    accuracy                           0.80     10860
   macro avg       0.80      0.79      0.79     10860
weighted avg       0.80      0.80      0.80     10860

