In [1]:
# import libray
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.feature_extraction import _stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report



In [2]:
# read data from csv file
data = pd.read_csv("/kaggle/input/dataset100/CleaneSet_2_Class.csv")
data.head()

Unnamed: 0,label,text
0,1,disabled vehicle westbound highway emily drive...
1,0,new teacher lunch amp training marker wars w s...
2,0,spot uhaultrends canadian destination cites co...
3,0,years ago today mlk gave historic dream speech
4,0,aww itâs hard say goodbye whatâs favorite ...


In [3]:
# count the number of label
data["label"].value_counts()

label
1    25549
0    25549
Name: count, dtype: int64

In [None]:
# data preprocessing using spacy
import spacy
import string

nlp = spacy.load('en_core_web_lg')

stopwords = _stop_words.ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

# data cleaning function
def clean(doc):
    text_no_namedentities = []
    document = nlp(doc)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
            
    doc = (" ".join(text_no_namedentities))
    doc = doc.lower().strip()
    doc = doc.replace("</br>", " ") 
    doc = doc.replace("-", " ") 
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords])    
    doc = "".join([lemmatizer.lemmatize(word) for word in doc])
    
    return doc

In [None]:
# cleaning the tweet data
data['text'] = data['text'].apply(clean)
data.head()

In [4]:
# load the data in x and y variable
X = data['text']
y = data['label']
print(X.shape, y.shape)

(51098,) (51098,)


In [5]:
from sklearn.model_selection import train_test_split

# spliting the data in trainig set and test set
SEED=123
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40878,) (40878,)
(10220,) (10220,)


In [6]:
# import bert preprocessor and encoder layer from kaggle
preprocessor = hub.KerasLayer(
    "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3")

encoder = hub.KerasLayer(
    "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/4",
    trainable=False)

Attaching model 'tensorflow/bert/tensorflow2/en-uncased-preprocess/3' to your Kaggle notebook...
Attaching model 'tensorflow/bert/tensorflow2/en-uncased-l-12-h-768-a-12/4' to your Kaggle notebook...


In [7]:
# preprocess and encode the text input data
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
encoder_inputs = preprocessor(text_input)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

In [8]:
# Neural network layer
f = tf.keras.layers.Dropout(0.2,name='dropout')(pooled_output)
f = tf.keras.layers.Dense(128,activation='relu',name='hidden')(f)
f = tf.keras.layers.Dense(64,activation='relu',name='hiddenn')(f)
f = tf.keras.layers.Dense(8, activation='relu',name='hiddennn')(f)
f = tf.keras.layers.Dense(1,activation='sigmoid',name='output')(f)

In [9]:
# create the model
model=tf.keras.Model(inputs=[text_input],outputs=[f])

In [10]:
# convert the dataset to list to load in model
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

In [11]:
# compiling the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [12]:
# loading the train data and training the model
model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ad1d1f7e6e0>

In [13]:
# evaluating the accracy and loss of the model
bert_loss, bert_accuracy = model.evaluate(X_test, y_test)

# testing the model on test dataset
y_predict=model.predict(X_test)
# y_pred = np.where(y_predict>0.5,1,0) 
y_pred = (y_predict > 0.5).astype(int)

print(f"\nBERT Accuracy: {bert_accuracy}")
print(f"\nBERT Loss: {bert_loss}")
print(f"\nBERT Classification Report: ")

# classification report of the model on test dataset
classification_rep = classification_report(y_test, y_pred)
print(classification_rep)


BERT Accuracy: 0.961154580116272

BERT Loss: 0.10389368236064911

BERT Classification Report: 
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      5110
           1       0.98      0.94      0.96      5110

    accuracy                           0.96     10220
   macro avg       0.96      0.96      0.96     10220
weighted avg       0.96      0.96      0.96     10220

