In [23]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import os
import tensorflow_datasets as tfds
from tqdm import tqdm
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from sklearn.model_selection import train_test_split
print("Hello")

Hello


In [24]:
train_df=pd.read_csv("All_Label_Covid_Headlines.csv")
print(train_df.shape)
train_df,test_df=train_test_split(train_df,test_size=50,shuffle=True,random_state=0)
print(train_df.shape)
print(test_df.shape)

(10727, 5)
(10677, 5)
(50, 5)


In [25]:
print(train_df.Sentiment.value_counts())
print(compute_class_weight('balanced',[0,1],train_df.Sentiment))

1    5345
0    5332
Name: Sentiment, dtype: int64
[1.00121905 0.99878391]


9840     0
1454     0
7224     1
9116     0
        ..
9225     0
4859     0
3264     1
9845     0
2732     1
Name: Sentiment, Length: 10677, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [26]:
def convert_example_to_feature(x):
    
    length=len(x)
    threshold=int(0.3*length)
    x=x[-threshold:]+" "+x
    bert_input = tokenizer.encode_plus(
                        x,                      
                        add_special_tokens = True,
                        max_length = 300,
                        pad_to_max_length = True,
                        return_attention_mask = True)
    return bert_input

In [27]:
def map_example_to_dict(input_ids, attention_masks, Sentiment):
    return {
      "input_ids": input_ids,
      "attention_mask": attention_masks
    }, Sentiment

In [28]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(train_df['Headline_Clean'].values, tf.string),
            tf.cast(train_df['Sentiment'].values, tf.int32)
        )
    )
)

In [29]:
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(test_df['Headline_Clean'].values, tf.string),
            tf.cast(test_df['Sentiment'].values, tf.int32)
        )
    )
)

In [30]:
def encode_examples(ds, limit=-1):
  # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  attention_mask_list = []
  label_list = []
  if (limit > 0):
      ds = ds.take(limit)
    
  for Headline, Sentiment in tqdm(tfds.as_numpy(ds)):
    bert_input = convert_example_to_feature(Headline.decode())
  
    input_ids_list.append(bert_input['input_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([Sentiment])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list,label_list)).map(map_example_to_dict)

In [31]:
test_ds = encode_examples(test_dataset).batch(16)

0it [00:00, ?it/s]Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
50it [00:00, 424.50it/s]


In [32]:
train_ds = encode_examples(training_dataset).shuffle(10000).batch(16)

10677it [00:09, 1153.60it/s]


In [33]:
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
learning_rate = 1e-5
number_of_epochs = 3
optimizer = tf.keras.optimizers.Nadam(learning_rate=learning_rate)


loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

early=EarlyStopping(monitor='val_accuracy',
                              min_delta=0,
                              patience=3,
                              verbose=1, mode='auto')
checkpoint=ModelCheckpoint(
    "/Users/piyushghasiya/PycharmProjects/News/checkpoint-{epoch:02d}-{val_accuracy:.4f}.h5", monitor='val_accuracy', verbose=1, save_best_only=True,
    save_weights_only=False, mode='auto', save_freq='epoch')

model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [35]:
model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [36]:
bert_history = model.fit(train_ds, epochs=number_of_epochs, validation_data=test_ds,
                         callbacks=[early,checkpoint],class_weight={0:1.00121905,1:0.99878391})

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.82000, saving model to /Users/piyushghasiya/PycharmProjects/News/checkpoint-01-0.8200.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.82000 to 0.86000, saving model to /Users/piyushghasiya/PycharmProjects/News/checkpoint-02-0.8600.h5
Epoch 3/3
Epoch 00003: val_accuracy improved from 0.86000 to 0.90000, saving model to /Users/piyushghasiya/PycharmProjects/News/checkpoint-03-0.9000.h5


In [65]:
df=pd.read_csv("Preprocessed_Headline_Japan.csv")
df.head()

Unnamed: 0,Month,Date,Headline,Headline_Clean
0,August,14 Aug 2020,Japan and Malaysia may resume travel in early ...,japan malaysia may resume travel early septemb...
1,August,15 Aug 2020,Japan marks 75th surrender anniversary in sole...,japan mark surrender anniversary solemn ceremo...
2,August,15 Aug 2020,Indonesia to close doors to tourists until vac...,indonesia close doors tourists vaccine find
3,August,15 Aug 2020,How COVID-19 has reshaped Japan's drinking cul...,reshape japan drink culture
4,August,15 Aug 2020,COVID-19 ruins plans to spend time at the beac...,ruin plan spend time beach among things


In [66]:
input_data={}
input_data['input_ids']=[]
# input_data['token_type_ids']=[]
input_data['attention_mask']=[]

for i in tqdm(df.Headline_Clean):
    bert_output=convert_example_to_feature(i)
    input_data['input_ids'].append(bert_output['input_ids'])
#     input_data['token_type_ids'].append(bert_output['token_type_ids'])
    input_data['attention_mask'].append(bert_output['attention_mask'])

100%|██████████| 21038/21038 [00:06<00:00, 3451.43it/s]


In [67]:
input_data['input_ids']=np.array(input_data['input_ids'])
# input_data['token_type_ids']=np.array(input_data['token_type_ids'])
input_data['attention_mask']=np.array(input_data['attention_mask'])

In [68]:
from tensorflow.keras.models import load_model
model.load_weights("/Users/piyushghasiya/PycharmProjects/News/checkpoint-03-0.9000.h5")
pred=model.predict(input_data)

In [69]:
final_preds=np.argmax(pred[0],axis=1)

In [70]:
df['Sentiment']=final_preds

In [71]:
df

Unnamed: 0,Month,Date,Headline,Headline_Clean,Sentiment
0,August,14 Aug 2020,Japan and Malaysia may resume travel in early ...,japan malaysia may resume travel early septemb...,1
1,August,15 Aug 2020,Japan marks 75th surrender anniversary in sole...,japan mark surrender anniversary solemn ceremo...,1
2,August,15 Aug 2020,Indonesia to close doors to tourists until vac...,indonesia close doors tourists vaccine find,0
3,August,15 Aug 2020,How COVID-19 has reshaped Japan's drinking cul...,reshape japan drink culture,1
4,August,15 Aug 2020,COVID-19 ruins plans to spend time at the beac...,ruin plan spend time beach among things,0
...,...,...,...,...,...
21033,September,"September 9, 2020 at 16:17 JST",China's CanSino defends coronavirus vaccine ca...,china cansino defend coronavirus vaccine candi...,0
21034,September,"September 9, 2020 at 17:15 JST",U.S. firms in China increasingly fear bilatera...,firm china increasingly fear bilateral tension...,0
21035,September,"September 9, 2020 at 17:15 JST",South Korea president to hold emergency meetin...,south korea president hold emergency meet thur...,0
21036,September,"September 9, 2020 at 17:18 JST",Tokyo in July sees more people moving out than...,tokyo july see people move due virus,0


In [72]:
output=df[['Month','Date','Headline','Headline_Clean','Sentiment']]
output.to_csv("Japan_Headline_Clean_Covid-19_BERT_90.csv",index=None)