<a href="https://colab.research.google.com/github/Prasad-py/Twitter_Sentiment_Detection/blob/main/Twitter_sentiment_analysis_BERT_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.utils import shuffle

In [None]:
!pip install tensorflow_text
!pip install sentencepiece

In [None]:
# Show the installed TensorFlow version string (`tf.version` alone is the
# version submodule and would only display a module repr).
tf.__version__

In [None]:
# Download the BERT `tokenization.py` helper into the working directory so that
# it can be imported as the `tokenization` module.
# NOTE(review): the URL tracks the `master` branch, so the file can change or
# move over time — consider pinning to a specific commit or tag.
!wget  https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tokenization

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into the three fixed-length BERT input arrays.

    Each text is tokenized, truncated so that the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
        max_len: Length of every encoded row, including ``[CLS]`` and ``[SEP]``.

    Returns:
        A tuple ``(token_ids, attention_masks, segment_ids)`` of integer arrays,
        each of shape ``(len(texts), max_len)``. The mask is 1 on real tokens
        and 0 on padding; segment ids are all 0 (single-sentence input).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room for
            the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for raw_text in texts:
        # Reserve two positions for the special tokens added below.
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The reshape keeps the (n, max_len) layout even when `rows` is empty,
        # where a plain np.array([]) would be a 1-D float array.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second of which is the per-token sequence output.
        max_len: Sequence length of each of the three integer inputs.

    Returns:
        A compiled ``Model`` with three ``(max_len,)`` int32 inputs and a single
        sigmoid output, trained with binary cross-entropy and tracking accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    model_inputs = [input_word_ids, input_mask, segment_ids]

    # Only the second (per-token) output is used; the first is discarded.
    _, sequence_output = bert_layer(model_inputs)
    # Position 0 of every sequence is used as the sentence representation.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=model_inputs, outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        optimizer=Adam(learning_rate=2e-6),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# TF Hub handle of the pretrained BERT encoder to fine-tune
# (per the handle name: English, uncased, L-24 / H-1024 / A-16, version 1).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True so the encoder weights are updated during training as well,
# not only the classification head added in `build_model`.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Build the WordPiece tokenizer from the assets shipped with the loaded hub
# module, so tokenization uses the same vocabulary and casing as the encoder.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [None]:
# The CSV lives on Google Drive, so make sure Drive is mounted before reading
# it; calling mount again on an already-mounted Drive is a no-op.
from google.colab import drive
drive.mount('/content/drive')

TRAIN_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/training_data (2).csv"

# The file is decoded as latin-1.
train_data = pd.read_csv(TRAIN_CSV_PATH, encoding='latin-1')
# test_data = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/testdata.manual.2009.06.14.csv")

In [None]:
# Display one randomly chosen row as a quick sanity check of the loaded columns.
train_data.sample()

In [None]:
# Keep only the raw label and the tweet text, and derive the binary target:
# `senti` is 1 where `Sentiment` equals 4 and 0 for every other value.
# `.assign` creates the column on a fresh frame, which avoids pandas'
# SettingWithCopyWarning from writing into a column subset, and the comparison
# is vectorised instead of a per-row `apply`.
train_data = train_data[['Sentiment', 'text']].assign(
    senti=lambda frame: (frame['Sentiment'] == 4).astype(int)
)
# test_data = test_data[['Sentiment','text']]
# test_data['senti']=test_data['Sentiment'].apply(lambda x: 1 if x==4 else 0)
train_data.head()

In [None]:
SEED = 42               # fixed seed so the sampled subset is reproducible
N_TRAIN_SAMPLES = 3500  # number of tweets kept for fine-tuning

train_labels = train_data['senti'].tolist()
train_text = train_data['text'].tolist()
# test_labels = test_data['Sentiment'].tolist()
# test_text = test_data['text'].tolist()

# Shuffle labels and texts together so the pairs stay aligned, then keep the
# first N_TRAIN_SAMPLES (slicing from 0, so no shuffled sample is dropped).
train_labels, train_text = shuffle(train_labels, train_text, random_state=SEED)
train_text = train_text[:N_TRAIN_SAMPLES]
train_labels = train_labels[:N_TRAIN_SAMPLES]

# The full frame is no longer needed; release it to free memory.
del train_data
# del test_data

In [None]:
# Encode the sampled tweets into BERT inputs with 160 positions per tweet.
# `bert_encode` returns three arrays, in order: token ids, padding masks,
# segment ids.
train_input = bert_encode(train_text, tokenizer, max_len=160)
# test_input = bert_encode(test_text, tokenizer, max_len=160)
# Show the token-id array (first of the three) as a sanity check.
train_input[0]

In [None]:
# Build the classifier with the same sequence length (160) that was used when
# encoding the text, then print its layer/parameter summary.
model = build_model(bert_layer, max_len=160)
model.summary()
# NOTE(review): the line below cannot work as written — `build_model` returns a
# functional `Model`, which has no `.add`, and `Dropout` is not imported. Any
# dropout would have to be inserted inside `build_model`, before the Dense head.
# model.add(Dropout(0.2))


In [None]:
# Regroup the encoded inputs per sample. `train_input` is a tuple of three
# (n_samples, max_len) arrays (ids, masks, segments); moving the sample axis to
# the front gives one (3, max_len) slab per sample, i.e. an array of shape
# (n_samples, 3, max_len).
# A transpose is used rather than `np.resize` with a hard-coded sample count:
# `np.resize` only re-chunks the flat buffer, which would mix ids, masks and
# segments of different samples.
# NOTE(review): `train_input_new` is not consumed by the training cell, which
# feeds `train_input` directly — confirm whether this cell is still needed.
train_text_new = np.asarray(train_input)
train_text_new_1 = np.transpose(train_text_new, (1, 0, 2))
train_input_new = np.ascontiguousarray(train_text_new_1)

In [None]:
# Lemmatization & Stemming
# Instantiates an NLTK Porter stemmer and a WordNet lemmatizer.
# NOTE(review): neither `pst` nor `lemmatizer` is referenced by any other cell
# shown in this notebook — confirm whether this preprocessing is still planned
# or the cell can be removed.
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
pst=PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Fine-tune the model on the encoded tweets. The three encoded arrays are fed
# in the order the model's inputs were declared (ids, masks, segments), and
# 30% of the samples are held out by Keras for validation.
fit_inputs = [np.asarray(train_input[idx]) for idx in range(3)]
fit_targets = np.asarray(train_labels)

train_history = model.fit(
    fit_inputs,
    fit_targets,
    batch_size=16,
    epochs=5,
    validation_split=0.3,
    verbose=1,
)

In [None]:
# Mount Google Drive at /content/drive, the location the training CSV is read
# from. This only works inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import matplotlib.pyplot as plt

model.summary()

# Keras stores each metric under the name it was compiled with; the model is
# compiled with metrics=['accuracy'], so the keys are 'accuracy' and
# 'val_accuracy'. Older Keras versions used the short 'acc' form, so fall back
# to it when the long name is absent.
history = train_history.history
acc_key = 'accuracy' if 'accuracy' in history else 'acc'

fig, ax = plt.subplots()
ax.plot(history[acc_key], label='train')
ax.plot(history['val_' + acc_key], label='validation')
ax.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();

In [None]:
from sklearn.metrics import confusion_matrix

# `confusion_matrix` expects (true labels, predicted labels). The model emits a
# sigmoid probability per tweet, thresholded at 0.5 to get hard 0/1 labels.
# NOTE(review): these inputs include the samples the model was trained on, so
# the matrix is optimistic — swap in a held-out set once one is encoded.
pred_probs = model.predict(list(train_input), batch_size=16)
pred_labels = (pred_probs.ravel() > 0.5).astype(int)
confusion_matrix(train_labels, pred_labels)

In [None]:
# `predict` needs input data; show the predicted positive-class probability for
# the first few encoded tweets (ids, masks and segments sliced consistently).
model.predict([part[:5] for part in train_input])