<a href="https://colab.research.google.com/github/WajithAAA/My-Project/blob/master/bert_risk_classifier_TPU_aggressive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import logging
from tensorflow.keras.layers import (
    Dense,
    Flatten,
    Conv1D,
    Dropout,
    Input,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from tensorflow.keras import regularizers
from transformers import BertTokenizer, TFBertModel
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()
import re
import random

### Setting up TPU

In [None]:
# Use a TPU when the runtime exposes one; TPUClusterResolver raises ValueError
# when no TPU is attached, in which case we fall back to the default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on every path (it used to be printed only from the
# fallback branch, so a successful TPU connection printed nothing).
print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [None]:
# Hyper-parameters shared by tokenisation, dataset construction and the model.
max_length, batch_size = 512, 16  # padded sequence length, examples per batch
dev_size = 0.1                    # fraction handed to train_test_split as test_size
num_class = 2                     # width of the one-hot labels and of the softmax layer

In [None]:
# Checkpoint name used for both the tokenizer here and the encoder built later.
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)

In [None]:
# Colab-only: mount Google Drive so that the dataset, tokenizer pickle and
# saved-model paths under /content/drive/MyDrive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the labelled training tweets from Drive and show the raw class balance.
train_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/final_data/final_aggressive_train_data.csv'
)
train_df['label'].value_counts()

1    102360
0     95970
Name: label, dtype: int64

In [None]:
# Balance the classes by down-sampling every label to the size of the rarest
# one. This works whichever label is the majority, and the fixed seed makes the
# selection reproducible across kernel restarts.
minority_size = train_df['label'].value_counts().min()
train_df = pd.concat(
    [
        label_rows.sample(n=minority_size, random_state=42)
        for _, label_rows in train_df.groupby('label')
    ]
)


In [None]:
# Sanity check: each label should now have the same number of rows.
train_df['label'].value_counts()

1    95970
0    95970
Name: label, dtype: int64

In [None]:
# Keep a random 80% of the balanced rows (this also shuffles them). The fixed
# seed makes the subset, and therefore every downstream number, reproducible.
train_df = train_df.sample(frac=0.8, random_state=42).reset_index(drop=True)

In [None]:
# Class balance after the 80% subsample.
train_df['label'].value_counts()

0    76789
1    76763
Name: label, dtype: int64

In [None]:

# Cast the text column to str so the tokenizer only ever sees strings, drop
# exact duplicate rows, then hold out `dev_size` of the rows for validation.
train_df['TweetText'] = train_df['TweetText'].astype(str)
train_df.drop_duplicates(inplace=True)
train, dev = train_test_split(
    train_df,
    random_state=42,
    test_size=dev_size,
)

In [None]:
# (rows, columns) of the de-duplicated frame, i.e. train and dev rows together.
train_df.shape

(141476, 2)

### Loading data and processing it

In [None]:
import pickle

# Persist the tokenizer object to Drive with the highest pickle protocol.
with open(
    '/content/drive/MyDrive/tokenizers/aggressive_behaviour_tokenizer.pickle',
    mode='wb',
) as handle:
    pickle.dump(obj=tokenizer, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Restore the tokenizer from the pickle on Drive.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that this notebook (or another trusted source) produced.
with open(
    '/content/drive/MyDrive/tokenizers/aggressive_behaviour_tokenizer.pickle',
    mode='rb',
) as handle:
    tokenizer = pickle.load(handle)

In [None]:
def bert_encode(data):
    """Tokenise a batch of texts into a fixed-width tensor of BERT token ids.

    Args:
        data: Iterable of texts forwarded as-is to
            ``tokenizer.batch_encode_plus`` (callers in this notebook pass a
            ``TweetText`` column).

    Returns:
        ``tf.constant`` built from the tokenizer's ``input_ids``; every row is
        padded or truncated to ``max_length`` tokens. Only the ids are
        returned; any other tokenizer outputs are discarded.
    """
    encoding = tokenizer.batch_encode_plus(
        data,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    input_ids = encoding["input_ids"]
    return tf.constant(input_ids)

In [None]:

# Tokenise both splits into fixed-length token-id tensors.
train_encoded = bert_encode(train.TweetText)
dev_encoded = bert_encode(dev.TweetText)

# One-hot encode the integer labels (num_class columns).
train_labels = tf.keras.utils.to_categorical(train.label.values, num_classes=num_class)
dev_labels = tf.keras.utils.to_categorical(dev.label.values, num_classes=num_class)

# Input pipeline notes:
# * No `.cache()`: the tensors already live in memory, and caching *after*
#   `.shuffle()` replays the first epoch's order on every later epoch.
# * A larger shuffle buffer (capped to bound memory) mixes examples properly;
#   it is reshuffled on every epoch by default.
# * The dev set is not shuffled: order does not affect validation metrics.
shuffle_buffer = min(len(train), 10_000)

train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_encoded, train_labels))
    .shuffle(shuffle_buffer)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

dev_dataset = (
    tf.data.Dataset.from_tensor_slices((dev_encoded, dev_labels))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

### Model

In [None]:
def bert_tweets_model():
    """Build the BERT-based tweet classifier.

    The model takes a single int32 input named ``input_ids`` of shape
    ``(max_length,)`` and returns a ``num_class``-wide softmax distribution.

    Architecture: pretrained ``model_name`` encoder -> flattened last hidden
    states -> Dense(512, relu) -> Dropout(0.3) -> Dense(440, relu) ->
    Dropout(0.3) -> Dense(num_class, softmax).

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # Attention maps are never consumed below, so they are not requested from
    # the encoder (asking for them only costs memory).
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Every sequence is padded to max_length, and without an explicit mask the
    # encoder treats [PAD] positions as real tokens. Derive the mask from the
    # ids so the model keeps its single-input interface.
    attention_mask = tf.cast(
        tf.math.not_equal(input_word_ids, tokenizer.pad_token_id), tf.int32
    )
    # Index 0 of the encoder output is the last hidden state.
    last_hidden_states = bert_encoder(
        input_word_ids, attention_mask=attention_mask
    )[0]
    clf_output = Flatten()(last_hidden_states)
    net = Dense(512, activation="relu")(clf_output)
    net = Dropout(0.3)(net)
    net = Dense(440, activation="relu")(net)
    net = Dropout(0.3)(net)
    output = Dense(num_class, activation="softmax")(net)
    model = Model(inputs=input_word_ids, outputs=output)
    return model

### Compiling Model

In [None]:
# Build and compile inside the strategy scope so the model and optimizer
# variables are created under the selected distribution strategy.
with strategy.scope():
    model = bert_tweets_model()
    adam_optimizer = Adam(learning_rate=1e-5)
    # Targets are one-hot vectors and the output layer is a num_class-wide
    # softmax, so the matching loss is categorical (not binary) cross-entropy.
    model.compile(
        loss="categorical_crossentropy",
        optimizer=adam_optimizer,
        metrics=["accuracy"],
    )

model.summary()

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, 512)]             0         
                                                                 
 tf_bert_model (TFBertModel)  TFBaseModelOutputWithPoo  177853440
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             512, 768),                          
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=((None, 12, None,             
                             512),                               
                              (None, 12, None, 512),         

### Training Model

In [None]:
# `train_dataset` and `dev_dataset` are already batched tf.data pipelines, so
# `batch_size` must not be passed to fit(): Keras documents that it should be
# omitted for dataset inputs.
history = model.fit(
    train_dataset,
    epochs=2,
    validation_data=dev_dataset,
    verbose=1,
)

NameError: ignored

In [None]:
#model.save_weights('/content/drive/MyDrive/save_model/bert_model_weights_for_drug.h5', overwrite=True)

In [None]:
# Write the trained model to Drive as a single HDF5 file, replacing any
# existing file at that path.
model.save(
    '/content/drive/MyDrive/save_model/bert_model_for_aggressive.h5',
    overwrite=True,
)

In [None]:
# Reload the saved model; TFBertModel is not a built-in Keras layer, so it is
# supplied through custom_objects for deserialisation.
new_model = tf.keras.models.load_model(
    '/content/drive/MyDrive/save_model/bert_model_for_aggressive.h5',
    custom_objects={"TFBertModel": TFBertModel},
)

### Display Loss and Accuracy

In [None]:

import matplotlib.pyplot as plt


def plot_graphs(history, string, title='Aggressive behaviour model'):
    """Plot a training metric and, when present, its validation counterpart.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: Metric name to plot, e.g. ``"accuracy"`` or ``"loss"``.
        title: Figure title. The default names this notebook's model (the
            previous hard-coded title was a leftover from another notebook).
    """
    curve_names = [string]
    plt.plot(history.history[string])
    # The validation curve only exists when fit() was given validation data.
    val_name = "val_" + string
    if val_name in history.history:
        plt.plot(history.history[val_name])
        curve_names.append(val_name)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(curve_names)
    plt.show()


plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
# Read the held-out test tweets from Drive and display the frame.
test = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/final_data/final_aggressive_test_data.csv'
)
test

In [None]:
# Same preprocessing as the training split: cast the text column to str, then
# tokenise to fixed-length token ids.
test['TweetText'] = test['TweetText'].astype(str)
test_encoded = bert_encode(test['TweetText'])

In [None]:
# Feature-only, unshuffled batches so predictions stay aligned with `test` rows.
test_slices = tf.data.Dataset.from_tensor_slices(test_encoded)
test_dataset = test_slices.batch(batch_size)

In [None]:
# Ground-truth labels as a plain Python list, in the same row order as `test`.
test_labels = test['label'].to_numpy().tolist()

In [None]:
# `test_dataset` is already batched, so `batch_size` is not passed to
# predict() (Keras documents that it should be omitted for dataset inputs).
predicted_tweets = new_model.predict(test_dataset)
# Predicted class = index of the highest score in each row.
predicted_tweets_binary = np.argmax(predicted_tweets, axis=-1)

In [None]:
# Predicted class index for each test tweet (argmax over the model outputs).
predicted_tweets_binary

In [None]:

# The model's output layer already applies a softmax, so `predicted_tweets`
# holds class probabilities. Applying tf.nn.softmax a second time would squash
# every score into roughly [0.27, 0.73], so the predictions are used as-is.
aggressive_Probability = predicted_tweets

# Score of class index 1 for every test tweet
# (presumably the "aggressive" label -- confirm against the dataset).
aggressive_list = aggressive_Probability[:, 1].tolist()
#test_df['aggressive_score'] = aggressive_list
#print(test_df['aggressive_score'])

Aggressive_avg_score = np.mean(aggressive_list)

print('Aggressive Avg score ',Aggressive_avg_score)

In [None]:
# Display the per-class scores assigned in the previous cell.
aggressive_Probability

In [None]:
from sklearn import metrics

# Fraction of test tweets whose predicted class equals the ground-truth label.
accuracy = metrics.accuracy_score(
    y_true=test_labels,
    y_pred=predicted_tweets_binary,
)
print('Accuracy: {:.3f}'.format(accuracy))

In [None]:
# Per-class metrics from sklearn's classification_report on the test set.
print('Classification report')
report_text = metrics.classification_report(
    y_true=test_labels,
    y_pred=predicted_tweets_binary,
)
print(report_text)