<a href="https://colab.research.google.com/github/Varun-Mulchandani/Reddit-Flair-Classifier/blob/master/BERT_uncased(24%2C_1024%2C_16).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [3]:
!pip install sentencepiece



In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [0]:
def bert_encode(texts, tokenizer, max_len = 512):
  all_tokens = []
  all_masks = []
  all_segments = []

  for text in texts:
    text = tokenizer.tokenize(text)
    text = text[:max_len - 2]
    input_sequence = ['[CLS]'] + text + ['[SEP]']
    pad_len = max_len - len(input_sequence)

    tokens = tokenizer.convert_tokens_to_ids(input_sequence)

    tokens += [0]* pad_len
    pad_masks = [1] * len(input_sequence) + [0] * pad_len
    segment_ids = [0] * max_len

    all_tokens.append(tokens)
    all_masks.append(pad_masks)
    all_segments.append(segment_ids)

  return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


In [0]:
def build_model(bert_layer, max_len = 512):
  input_word_ids = Input(shape=(max_len,), dtype = tf.int32, name='input_word_ids')
  input_mask = Input(shape=(max_len,), dtype = tf.int32, name='input_mask')
  segment_ids = Input(shape=(max_len,), dtype=tf.int32, name='segment_ids')

  _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
  clf_output = sequence_output[:, 0, :]
  dense_layer1 = Dense(units=256, activation='relu')(clf_output)
  dense_layer1 = Dropout(0.4)(dense_layer1)
  dense_layer2 = Dense(units=128, activation='relu')(dense_layer1)
  dense_layer2 = Dropout(0.4)

  out = Dense(12, activation = 'softmax')(clf_output)

  model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = out)
  model.compile(Adam(lr=2e-6), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

  return model

In [0]:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1'
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [0]:
train = pd.read_csv('reddddddit.csv')

In [9]:
flairs = []
for i in list(train['flair']):
  if i in flairs:
    pass
  else:
    flairs.append(i)
print(flairs)
count = 0
label_to_id = {}
for i in flairs:
  label_to_id[i] = count
  count += 1
print(label_to_id)

['AskIndia', 'Non-Political', '[R]eddiquette', 'Scheduled', 'Photography', 'Science/Technology', 'Politics', 'Business/Finance', 'Policy/Economy', 'Sports', 'Food', 'AMA']
{'AskIndia': 0, 'Non-Political': 1, '[R]eddiquette': 2, 'Scheduled': 3, 'Photography': 4, 'Science/Technology': 5, 'Politics': 6, 'Business/Finance': 7, 'Policy/Economy': 8, 'Sports': 9, 'Food': 10, 'AMA': 11}


In [0]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [0]:
train_input = bert_encode(train.title.values, tokenizer, max_len = 160)
train_labels = train.flair.values
trainl = []
for i in train_labels:
  trainl.append(label_to_id[i])
trainl = np.array(trainl)

In [12]:
train_input

(array([[  101,  2342, 12247, ...,     0,     0,     0],
        [  101,  8307,  2215, ...,     0,     0,     0],
        [  101,  6059,  2634, ...,     0,     0,     0],
        ...,
        [  101,  2172,  4748, ...,     0,     0,     0],
        [  101, 25933,  8874, ...,     0,     0,     0],
        [  101, 25933,  2172, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [13]:
trainl

array([ 0,  0,  0, ..., 11, 11, 11])

In [14]:
model = build_model(bert_layer, max_len = 160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [15]:
train_history = model.fit(
    train_input, trainl,
    validation_split=0.2,
    epochs=3,
    batch_size=16
)

model.save(model.h5)

Epoch 1/3
Epoch 2/3
Epoch 3/3


AttributeError: ignored

In [0]:
model.save('model.h5')

In [0]:
def generate_flair(sentence):
  sentence = tokenizer.tokenize(sentence)
  sentence = sentence[:512 - 2]
  input_sequence = ['[CLS]'] + sentence + ['[SEP]']
  pad_len = 512 - len(input_sequence)
  tokens = tokenizer.convert_tokens_to_ids(input_sequence)
  tokens += [0]* pad_len
  pad_masks = [1] * len(input_sequence) + [0] * pad_len
  segment_ids = [0] * 512

  input_s = (np.array(tokens), np.array(pad_masks), np.array(segment_ids))
  outp = model.predict(input_s)
  return outp

In [33]:
generate_flair('Zomato delivery man killed')

array([[0.13498396, 0.1654714 , 0.3762691 , ..., 0.00997647, 0.08732126,
        0.02405257],
       [0.05218387, 0.24571384, 0.06196211, ..., 0.12916678, 0.06740196,
        0.16390336],
       [0.07201799, 0.1924517 , 0.13861717, ..., 0.13038906, 0.07013833,
        0.1624523 ],
       ...,
       [0.03216378, 0.24066208, 0.14364594, ..., 0.01297687, 0.0549913 ,
        0.08683963],
       [0.03216293, 0.24065708, 0.14364897, ..., 0.01297657, 0.05499127,
        0.08684047],
       [0.03216293, 0.24065708, 0.14364897, ..., 0.01297657, 0.05499127,
        0.08684047]], dtype=float32)