In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import nltk
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w


In [4]:
data_file='Documents/TWITTERDS.csv'
data=pd.read_csv(data_file,encoding='ISO-8859-1')
len(data)


10076

In [5]:
print('Available labels: ',data.TAR.unique())
data['text']=data['DESCRIPTION'].map(preprocess_sentence)

num_classes=len(data.TAR.unique())

data.head()

Available labels:  [0 1]


Unnamed: 0,TAR,ID,DATE,QUERY,NAME,DESCRIPTION,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com awww bummer should...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball managed save re...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [6]:
num_classes

2

In [7]:
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')   

In [8]:
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

In [9]:
max_len=32
sentences=data['text']
print(sentences)
labels=data['TAR']
len(sentences),len(labels)

0        switchfoot http twitpic com awww bummer should...
1        upset update facebook texting might cry result...
2        kenichan dived many times ball managed save re...
3                         whole body feels itchy like fire
4                         nationwideclass behaving mad see
                               ...                        
10071            iamdiddy working sunday hello philippines
10072    johnmayerlyrics whats difference twitter page ...
10073                think need akai headrush got one give
10074    looks like marginal good thermals today gettin...
10075    sammutimer saviour last one season unless cour...
Name: text, Length: 10076, dtype: object


(10076, 10076)

In [10]:
dbert_tokenizer.tokenize(sentences[0])

['switch',
 '##foot',
 'http',
 't',
 '##wi',
 '##tp',
 '##ic',
 'com',
 'aw',
 '##w',
 '##w',
 'bum',
 '##mer',
 'should',
 '##a',
 'got',
 'david',
 'carr',
 'third',
 'day']

In [11]:
dbert_inp=dbert_tokenizer.encode_plus(sentences[0],add_special_tokens = True,max_length =20,pad_to_max_length = True,truncation=True)
dbert_inp

{'input_ids': [101, 6942, 13064, 8299, 1056, 9148, 25856, 2594, 4012, 22091, 2860, 2860, 26352, 5017, 2323, 2050, 2288, 2585, 12385, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
dbert_inp['input_ids']

[101,
 6942,
 13064,
 8299,
 1056,
 9148,
 25856,
 2594,
 4012,
 22091,
 2860,
 2860,
 26352,
 5017,
 2323,
 2050,
 2288,
 2585,
 12385,
 102]

In [13]:
id_inp=np.asarray(dbert_inp['input_ids'])
mask_inp=np.asarray(dbert_inp['attention_mask'])
out=dbert_model([id_inp.reshape(1,-1),mask_inp.reshape(1,-1)])
type(out),out

(tuple,
 (<tf.Tensor: shape=(1, 20, 768), dtype=float32, numpy=
  array([[[ 0.01274627, -0.07876229, -0.15836816, ..., -0.10738077,
            0.35887986,  0.42442632],
          [ 0.22567368, -0.3502166 ,  0.3199818 , ...,  0.07449917,
            0.55056846, -0.18079731],
          [ 0.21035773,  0.16083698,  0.00888642, ...,  0.15392008,
            0.02060527,  0.01076494],
          ...,
          [ 0.04942653,  0.12789088,  0.21721028, ..., -0.37591136,
            0.09543035, -0.08466615],
          [ 0.0212442 ,  0.00257167,  0.0693761 , ..., -0.18621549,
           -0.17315246,  0.02489012],
          [ 0.98219776,  0.2841664 , -0.39852607, ..., -0.02572096,
           -0.36131865, -0.17120071]]], dtype=float32)>,))

In [14]:
out[0][:,0,:]

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[ 1.27462726e-02, -7.87622929e-02, -1.58368155e-01,
        -1.28659427e-01, -1.97247729e-01, -1.68835819e-01,
         3.88083041e-01,  4.46289718e-01, -1.92302316e-01,
        -1.34538710e-01,  1.54768839e-01, -1.87388524e-01,
        -9.57167894e-02,  2.19261020e-01, -8.07700679e-02,
         1.70546710e-01, -3.33832204e-01,  2.25277394e-01,
         1.36261553e-01, -4.15344909e-02,  2.60007620e-01,
        -1.72635004e-01, -2.99528427e-02, -2.39733353e-01,
        -8.55646431e-02,  5.08861281e-02, -1.23486415e-01,
        -1.57683849e-01, -3.45814787e-02,  3.88169326e-02,
         1.46441117e-01,  8.02083984e-02,  1.36188000e-01,
        -2.96066284e-01,  8.79974663e-02, -1.78958308e-02,
         1.85671076e-01, -9.04827565e-03,  2.90242583e-01,
         2.34690994e-01, -1.61147237e-01, -6.57327473e-05,
         1.28944188e-01, -6.62079826e-03,  8.98613036e-02,
        -8.81270319e-02, -2.52564907e+00,  2.04021662e-01,
      

In [15]:
dbert_tokenizer.decode(dbert_inp['input_ids'])

'[CLS] switchfoot http twitpic com awww bummer shoulda got david carr [SEP]'

In [16]:
def create_model():
    inps = Input(shape = (max_len,), dtype='int64')
    masks= Input(shape = (max_len,), dtype='int64')
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout= Dropout(0.5)(dense)
    pred = Dense(2, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    print(model.summary())
    return model   

In [17]:
model=create_model()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

 dense (Dense)                  (None, 512)          393728      ['tf.__operators__.getitem[0][0]'
                                                                 ]                                
                                                                                                  
 dropout_19 (Dropout)           (None, 512)          0           ['dense[0][0]']                  
                                                                                                  
 dense_1 (Dense)                (None, 2)            1026        ['dropout_19[0][0]']             
                                                                                                  
Total params: 66,757,634
Trainable params: 66,757,634
Non-trainable params: 0
__________________________________________________________________________________________________
None


In [18]:
input_ids=[]
attention_masks=[]

for sent in sentences:
    dbert_inps=dbert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =max_len,pad_to_max_length = True,return_attention_mask = True,truncation=True)
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)

In [19]:
len(input_ids),len(attention_masks),len(labels)

(10076, 10076, 10076)

In [20]:
print('Preparing the pickle file.....')

pickle_inp_path='./dbert_inp.pkl'
pickle_mask_path='./dbert_mask.pkl'
pickle_label_path='./dbert_label.pkl'

pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((labels),open(pickle_label_path,'wb'))


print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)

Preparing the pickle file.....
Pickle files saved as  ./dbert_inp.pkl ./dbert_mask.pkl ./dbert_label.pkl


In [21]:
print('Loading the saved pickle files..')

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))
print(input_ids.shape)
print(attention_masks.shape)
#print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape))


Loading the saved pickle files..
(10076, 32)
(10076, 32)


In [22]:
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))


log_dir='dbert_model'
model_save_path='./dbert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=[metric])



Train inp shape (8060, 32) Val input shape (2016, 32)
Train label shape (8060,) Val label shape (2016,)
Train attention mask shape (8060, 32) Val attention mask shape (2016, 32)


In [23]:
callbacks= [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]
model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [24]:
history=model.fit([train_inp,train_mask],train_label,batch_size=1500,epochs=5,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


  return dispatch_target(*args, **kwargs)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
trained_model = create_model()
trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  ((None, 32, 768),   66362880    ['input_3[0][0]',                
 BertModel)                     )                                 'input_4[0][0]']                
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None, 768)         0           ['tf_distil_bert_model[1][0

In [26]:
preds = trained_model.predict([val_inp,val_mask],batch_size=1500)
pred_labels = preds.argmax(axis=1)
f1 = f1_score(val_label,pred_labels)
f1

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


0.7265002655337227

In [27]:
target_names=['negative','positive']
print('F1 score',f1)
print('Classification Report')
print(classification_report(val_label,pred_labels,target_names=target_names))
scores = model.evaluate([val_inp,val_mask],val_label, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))
print('Training and saving built model.....')   
from sklearn.metrics import confusion_matrix, classification_report
labels = ['negative','positive']
print(classification_report(val_label, pred_labels))
pd.DataFrame(confusion_matrix(val_label, pred_labels), index=labels, columns=labels)

F1 score 0.7265002655337227
Classification Report
              precision    recall  f1-score   support

    negative       0.70      0.83      0.76       979
    positive       0.81      0.66      0.73      1037

    accuracy                           0.74      2016
   macro avg       0.75      0.75      0.74      2016
weighted avg       0.75      0.74      0.74      2016

Accuracy: 74.45%
Training and saving built model.....
              precision    recall  f1-score   support

           0       0.70      0.83      0.76       979
           1       0.81      0.66      0.73      1037

    accuracy                           0.74      2016
   macro avg       0.75      0.75      0.74      2016
weighted avg       0.75      0.74      0.74      2016



Unnamed: 0,negative,positive
negative,817,162
positive,353,684
