In [None]:
!pip install transformers

In [None]:
import pandas as pd
# Loading Dependencies
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
#from kaggle_datasets import KaggleDatasets
import transformers

from transformers import DistilBertTokenizer, RobertaTokenizer
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

import os
from tqdm import tqdm,trange

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Pick the tf.distribute strategy that matches the available hardware.
try:
    # Without arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when it cannot locate a TPU.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Path of the News Category dataset inside the Kaggle input directory.
# It is defined once and handed to the reader below, so the variable and the
# file that is actually loaded cannot drift apart.
data_file_address = '/kaggle/input/news-category-dataset/News_Category_Dataset_v2.json'

# lines=True makes pandas parse the file as one JSON object per line.
df_data = pd.read_json(data_file_address, lines=True)

In [None]:
# Preview the first ten raw records.
df_data.head(n=10)

In [None]:
# Restrict the experiment to these five categories.
selected_categories = ['POLITICS','ENTERTAINMENT','BUSINESS','SPORTS','TECH']
category_mask = df_data['category'].isin(selected_categories)
data = df_data.loc[category_mask]

In [None]:
# Keep only the label and the text column. The explicit .copy() gives `data`
# its own storage, so later column assignments do not write into a slice of
# `df_data` (which is what triggers pandas' SettingWithCopyWarning).
data = data[['category','headline']].copy()

In [None]:
# Preview the reduced frame (head() shows five rows by default).
data.head()

In [None]:
# Encode the labels: map every category name to its integer class id.
label_index = {'POLITICS':0,'ENTERTAINMENT':4,'BUSINESS':2,'SPORTS':3,'TECH':1}

# Vectorised lookup; values missing from `label_index` come back as NaN.
encoded_category = data['category'].map(label_index)
unknown_labels = data.loc[encoded_category.isna(), 'category'].unique()
if len(unknown_labels) > 0:
    # Fail like a plain dict lookup would (e.g. when this cell is re-run on
    # already-encoded labels) instead of silently keeping NaN labels.
    raise KeyError("No label id for categories: {}".format(list(unknown_labels)))

# assign() builds a new frame rather than writing into a filtered slice.
data = data.assign(category=encoded_category.astype('int64'))

In [None]:
# Order the rows by encoded class id (ascending is the default) and preview them.
data_ = data.sort_values('category')
data_.head(n=10)

In [None]:
# Number of rows per class before trimming.
data_['category'].value_counts()

In [None]:
# Drop the first 18,000 rows of the category-sorted frame.
# NOTE(review): presumably this undersamples class 0 (POLITICS), which sorts
# first; confirm against the value_counts output. Re-running this cell drops
# another 18,000 rows because it overwrites `data_`.
rows_to_skip = 18000
data_ = data_.iloc[rows_to_skip:]

In [None]:
# Number of rows per class after trimming.
data_['category'].value_counts()

In [None]:
# Continue with the trimmed frame under the name `data`
# (this rebinds the name to the same object; no copy is made).
data = data_

In [None]:
# Shuffle the rows so the classes are no longer grouped by the earlier sort.
# A fixed random_state (the same seed as the train/test splits) keeps the row
# order, and therefore every downstream result, reproducible across runs.
data = shuffle(data, random_state=42)
data.head(20)

In [None]:
# Hugging Face checkpoint names. Only `distil_bert` is used in this notebook.
distil_bert = 'distilbert-base-uncased' # Pick any desired pre-trained model
# RoBERTa is published as 'roberta-base'; 'roberta-base-uncase' is not a
# valid checkpoint id and would fail as soon as it was passed to from_pretrained().
roberta = 'roberta-base'

# Defining the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)


In [None]:
def tokenize(sentences, tokenizer,MAX_LENGTH=512):
    """Encode sentences into fixed-length integer arrays.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        MAX_LENGTH: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of int32 NumPy
        arrays, one row per sentence.
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            # Truncate explicitly so over-long inputs cannot yield ragged rows
            # (and to avoid the library's "truncation not activated" warning).
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return (
        np.asarray(input_ids, dtype='int32'),
        np.asarray(input_masks, dtype='int32'),
        np.asarray(input_segments, dtype='int32'),
    )


In [None]:
# Tokenize every headline (cast to str first); MAX_LENGTH keeps its default of 512.
headlines = data['headline'].astype(str)
input_ids, input_masks, input_segments = tokenize(headlines, tokenizer)


In [None]:
# Integer class ids as a NumPy array, in the same row order as the headlines.
labels = data['category'].values
print(labels)

In [None]:
# Shared training configuration.

# Passed to Dataset.prefetch() so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Fixed batch size; the replica-scaled alternative
# (16 * strategy.num_replicas_in_sync) is not used.
BATCH_SIZE = 16

# NOTE(review): the model.fit() calls in this notebook pass epochs=5 directly,
# so EPOCHS is not read by them.
EPOCHS = 3

In [None]:
# Show the batch size that the dataset pipelines below will use.
print(BATCH_SIZE)

In [None]:
# Hold out 30% of the examples; ids, masks, segments and labels are split
# together so their rows stay aligned.
split_70_30 = train_test_split(
    input_ids, input_masks, input_segments, labels,
    test_size=0.3, random_state=42,
)
(x_train, x_test,
 train_masks, test_masks,
 train_segs, test_segs,
 y_train, y_test) = split_70_30

In [None]:
# Split the held-out 30% in half: validation and test.
# The *_test names are overwritten, so re-running this cell halves the test
# set again.
split_50_50 = train_test_split(
    x_test, test_masks, test_segs, y_test,
    test_size=0.5, random_state=42,
)
(x_val, x_test,
 val_masks, test_masks,
 val_segs, test_segs,
 y_val, y_test) = split_50_50

In [None]:
# Training pipeline: repeats indefinitely (fit() therefore needs
# steps_per_epoch), shuffles with a 2048-element buffer, batches, prefetches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: fixed order, batched, then cached and prefetched.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: token ids only (no labels), batched.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
def build_model(transformer_model,max_len=512):
    """Build a 5-class classifier on the transformer's first-token vector.

    Args:
        transformer_model: Callable applied to the token-id input; element [0]
            of its result is used (presumably the last hidden state of shape
            (batch, max_len, hidden) -- confirm).
        max_len: Length of the int32 token-id input.

    Returns:
        An uncompiled tf.keras.Model mapping token ids to a 5-way softmax output.

    Note: only token ids are fed to the transformer; no attention mask is passed.
    Later cells in this notebook redefine `build_model`.
    """
    token_ids = tf.keras.layers.Input(shape=(max_len,), name='input_token', dtype='int32')

    sequence_output = transformer_model(token_ids)[0]
    # Position 0 of every sequence is used as the sentence representation.
    first_token = sequence_output[:, 0, :]
    class_probs = tf.keras.layers.Dense(5, activation='softmax')(first_token)

    return tf.keras.Model(inputs=[token_ids], outputs=class_probs)

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model = build_model(transformer_layer, max_len=512)
model.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# train_dataset repeats forever, so one epoch is defined as the number of full
# batches in the training split.
n_steps = len(x_train) // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

In [None]:
# evaluate() returns the loss followed by the compiled accuracy metric.
accuracy = model.evaluate(x_test,y_test)
test_loss, test_acc = accuracy[0], accuracy[1]
print("Test Loss -> ",test_loss)
print("Accuracy -> ",test_acc)

Model 2

In [None]:
def build_model(transformer_model,max_len=512):
    """Build a 5-class classifier with a BiLSTM head on the transformer output.

    Args:
        transformer_model: Callable applied to the token-id input; element [0]
            of its result is used (presumably the per-token hidden states --
            confirm).
        max_len: Length of the int32 token-id input.

    Returns:
        An uncompiled tf.keras.Model mapping token ids to a 5-way softmax
        output, with the first three entries of `model.layers` set to
        non-trainable.

    Note: this definition replaces any earlier `build_model` in the notebook.
    No attention mask is passed to the transformer.
    """
    token_ids = tf.keras.layers.Input(shape=(max_len,), name='input_token', dtype='int32')

    sequence_output = transformer_model(token_ids)[0]

    recurrent = tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    features = tf.keras.layers.Bidirectional(recurrent)(sequence_output)
    # Max over the sequence axis collapses the per-token features to one vector.
    features = tf.keras.layers.GlobalMaxPool1D()(features)
    features = tf.keras.layers.Dense(50, activation='relu')(features)
    features = tf.keras.layers.Dropout(0.2)(features)
    class_probs = tf.keras.layers.Dense(5, activation='softmax')(features)

    model = tf.keras.Model(inputs=[token_ids], outputs=class_probs)

    # NOTE(review): the first three layers are presumably the input layer, the
    # transformer and the BiLSTM -- confirm with model.summary().
    for frozen_layer in model.layers[:3]:
        frozen_layer.trainable = False

    return model

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model2 = build_model(transformer_layer, max_len=512)
model2.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model2.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# One epoch = the number of full batches in the training split
# (train_dataset itself repeats indefinitely). `train_history` is overwritten.
n_steps = len(x_train) // BATCH_SIZE
train_history = model2.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

In [None]:
# evaluate() returns the loss followed by the compiled accuracy metric.
accuracy2 = model2.evaluate(x_test,y_test)
test_loss, test_acc = accuracy2[0], accuracy2[1]
print("Test Loss -> ",test_loss)
print("Accuracy -> ",test_acc)

Model 2 with the first three layers frozen (`trainable = False`)

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model4 = build_model(transformer_layer, max_len=512)
model4.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model4.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# One epoch = the number of full batches in the training split
# (train_dataset itself repeats indefinitely). `train_history` is overwritten.
n_steps = len(x_train) // BATCH_SIZE
train_history = model4.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

Model 3

In [None]:
def build_model(transformer_model,max_len=512):
    """Build a 5-class classifier that max-pools the transformer output.

    Args:
        transformer_model: Callable applied to the token-id input; element [0]
            of its result is used (presumably the per-token hidden states --
            confirm).
        max_len: Length of the int32 token-id input.

    Returns:
        An uncompiled tf.keras.Model mapping token ids to a 5-way softmax output.

    Note: this definition replaces any earlier `build_model` in the notebook.
    No attention mask is passed to the transformer.
    """
    token_ids = tf.keras.layers.Input(shape=(max_len,), name='input_token', dtype='int32')

    sequence_output = transformer_model(token_ids)[0]
    # Max over the sequence axis, followed by dropout before the output layer.
    pooled = tf.keras.layers.GlobalMaxPool1D()(sequence_output)
    pooled = tf.keras.layers.Dropout(0.2)(pooled)
    class_probs = tf.keras.layers.Dense(5, activation='softmax')(pooled)

    return tf.keras.Model(inputs=[token_ids], outputs=class_probs)

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model3 = build_model(transformer_layer, max_len=512)
model3.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model3.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# One epoch = the number of full batches in the training split
# (train_dataset itself repeats indefinitely). `train_history` is overwritten.
n_steps = len(x_train) // BATCH_SIZE
train_history = model3.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

In [None]:
# evaluate() returns the loss followed by the compiled accuracy metric.
accuracy3 = model3.evaluate(x_test,y_test)
test_loss, test_acc = accuracy3[0], accuracy3[1]
print("Test Loss -> ",test_loss)
print("Accuracy -> ",test_acc)

# Using a smaller batch size

In [None]:
def build_model(transformer_model,max_len=512):
    """Build a 5-class classifier that max-pools the transformer output.

    Args:
        transformer_model: Callable applied to the token-id input; element [0]
            of its result is used (presumably the per-token hidden states --
            confirm).
        max_len: Length of the int32 token-id input.

    Returns:
        An uncompiled tf.keras.Model mapping token ids to a 5-way softmax output.

    Note: this definition replaces any earlier `build_model` in the notebook.
    No attention mask is passed to the transformer.
    """
    token_ids = tf.keras.layers.Input(shape=(max_len,), name='input_token', dtype='int32')

    sequence_output = transformer_model(token_ids)[0]
    # Max over the sequence axis, followed by dropout before the output layer.
    pooled = tf.keras.layers.GlobalMaxPool1D()(sequence_output)
    pooled = tf.keras.layers.Dropout(0.2)(pooled)
    class_probs = tf.keras.layers.Dense(5, activation='softmax')(pooled)

    return tf.keras.Model(inputs=[token_ids], outputs=class_probs)

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model5 = build_model(transformer_layer, max_len=512)
model5.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model5.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# One epoch = the number of full batches in the training split
# (train_dataset itself repeats indefinitely). `train_history` is overwritten.
n_steps = len(x_train) // BATCH_SIZE
train_history = model5.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

In [None]:
# Evaluate the model trained in this section (model5). The previous version
# evaluated model3 and therefore re-reported the earlier model's scores.
# evaluate() returns the loss followed by the compiled accuracy metric.
accuracy45 = model5.evaluate(x_test,y_test)
print("Test Loss -> ",accuracy45[0])
print("Accuracy -> ",accuracy45[1])

Training the First Model with Batch Size 16

In [None]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-uncased')
    )
    model6 = build_model(transformer_layer, max_len=512)
model6.summary()

#bert_model = build_model(transformer_model,MAX_LENGTH)
#bert_model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #, epsilon=1e-08, clipnorm=1.0)
# build_model() ends in Dense(5, activation='softmax'), so the network emits
# probabilities rather than logits. from_logits must be False; with True the
# loss would apply a second softmax on top of the model output.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model6.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# One epoch = the number of full batches in the training split
# (train_dataset itself repeats indefinitely). `train_history` is overwritten.
n_steps = len(x_train) // BATCH_SIZE
train_history = model6.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=5,
)

In [None]:
# Evaluate the model trained in this section (model6). The previous version
# evaluated model3 and therefore re-reported the earlier model's scores.
# evaluate() returns the loss followed by the compiled accuracy metric.
accuracy6 = model6.evaluate(x_test,y_test)
print("Test Loss -> ",accuracy6[0])
print("Accuracy -> ",accuracy6[1])

In [None]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer


# Load the GPT-2 tokenizer and language-model head.
# Note: this rebinds `tokenizer` and `model`, which earlier cells used for the
# DistilBERT tokenizer and the first classifier.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Use the EOS token as the PAD token to avoid warnings during generation.
eos_token_id = tokenizer.eos_token_id
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=eos_token_id)

In [None]:
# Prompt that the generation is conditioned on, encoded as a TensorFlow tensor.
# Note: this rebinds `input_ids`, which earlier held the tokenized headlines.
prompt = 'We should stay home now and enjoy because'
input_ids = tokenizer.encode(prompt, return_tensors='tf')

# Default (greedy) decoding; stops once prompt + continuation reach 50 tokens.
greedy_output = model.generate(input_ids, max_length=50)
greedy_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)

print("Output:\n" + 100 * '-')
print(greedy_text)

In [None]:
# Fix the TensorFlow seed so the sampled continuations are reproducible;
# change it to obtain different samples.
tf.random.set_seed(0)

# Sampling settings: top-k 50 combined with top-p 0.95, three sequences.
sampling_kwargs = dict(
    do_sample=True,
    max_length=50,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)
sample_outputs = model.generate(input_ids, **sampling_kwargs)



In [None]:
# Show the greedy continuation first, then each sampled continuation.
separator = 100 * '-'

print(" Greedy Output:\n" + separator)
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))
print('\n')

print("After modifying the parameters - Output:\n" + separator)
for i, sample_output in enumerate(sample_outputs):
    decoded_sample = tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {} \n".format(i, decoded_sample))

In [None]:
# Same report as the previous cell: greedy continuation, then the samples.
separator = 100 * '-'

print(" Greedy Output:\n" + separator)
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))
print('\n')

print("After modifying the parameters - Output:\n" + separator)
for i, sample_output in enumerate(sample_outputs):
    decoded_sample = tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {} \n".format(i, decoded_sample))