In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install transformers



In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from nltk.corpus import stopwords

In [8]:
# Fetch the NLTK "stopwords" corpus that the text-cleaning step looks up via
# stopwords.words("english"). The call is safe to repeat: when the corpus is
# already present, NLTK only reports that the package is up-to-date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# The CSV has no header row, so pandas numbers the columns 0..5; give them
# readable names by position.
column_names = ['Sentiment', 'id', 'datetime', 'query', 'user', 'Text']
data = pd.read_csv(
    '/kaggle/input/dataset/dataset.csv',
    encoding='latin-1',
    header=None,
)
data = data.rename(columns=dict(enumerate(column_names)))
data.shape

(1600000, 6)

In [9]:
# Convert the raw sentiment codes (0, 2, 4) into contiguous class ids (0, 1, 2),
# which is what a 3-label classifier with a sparse categorical loss expects.
SENTIMENT_CODE_TO_CLASS = {0: 0, 2: 1, 4: 2}

# Keep the untouched codes in their own column the first time this cell runs.
# Mapping from that column makes the cell idempotent: re-running it no longer
# maps already-mapped labels (which would silently turn class 2 into class 1).
if "SentimentRaw" not in data.columns:
    data["SentimentRaw"] = data["Sentiment"]

mapped_sentiment = data["SentimentRaw"].map(SENTIMENT_CODE_TO_CLASS)

# Series.map yields NaN for any code missing from the dictionary; fail loudly
# instead of passing NaN labels on to training.
unmapped_rows = mapped_sentiment.isna()
if unmapped_rows.any():
    unexpected_codes = data.loc[unmapped_rows, "SentimentRaw"].unique().tolist()[:10]
    raise ValueError(f"Unexpected sentiment codes (first 10 shown): {unexpected_codes}")

data["Sentiment"] = mapped_sentiment.astype("int64")

In [10]:
# Define a function for text preprocessing
# Cache for the NLTK stopword list, so it is loaded once per kernel instead of
# once per call (the function is applied to every row of the dataset).
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def preprocess_text(text, stop_words=None):
    """Lowercase a text, keep only ASCII letters and drop stopwords.

    Args:
        text: Raw text to clean. Non-string values (e.g. ``None`` or NaN from
            a missing cell) are treated as empty text.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stopword list is used (loaded once and
            cached).

    Returns:
        The remaining words joined by single spaces; ``""`` if nothing is left.
    """
    if not isinstance(text, str):
        return ""

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Guarantee O(1) membership tests for caller-supplied lists/tuples.
        stop_words = frozenset(stop_words)

    # Convert to lowercase
    text = text.lower()

    # Replace everything that is not a letter (digits, punctuation, symbols)
    # with a space so that adjacent tokens are not glued together.
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Whitespace tokenisation, then stopword filtering.
    words = [word for word in text.split() if word not in stop_words]

    return " ".join(words)

In [11]:
# Clean every row of the "Text" column with preprocess_text and store the
# cleaned strings back in the same column.
cleaned_text = data["Text"].apply(preprocess_text)
data["Text"] = cleaned_text

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X, y = data["Text"], data["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classification head is created with three output labels.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Both splits are tokenized with identical settings: truncate to at most 64
# tokens, pad, and return TensorFlow tensors.
tokenizer_options = dict(truncation=True, padding=True, max_length=64, return_tensors='tf')
train_encodings = tokenizer(X_train.to_list(), **tokenizer_options)
test_encodings = tokenizer(X_test.to_list(), **tokenizer_options)

In [16]:
# Pair each example's encoded features with its label so Keras can consume
# (features, label) elements.
train_features, train_labels = dict(train_encodings), y_train.to_list()
test_features, test_labels = dict(test_encodings), y_test.to_list()

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

In [18]:
# Configure training: Adam with a small learning rate, and a sparse
# categorical cross-entropy computed on raw logits (labels are integer class ids).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

In [19]:
# Print the layer list and the trainable / non-trainable parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


In [20]:
# The dataset is batched explicitly with .batch(16). Keras' `batch_size`
# argument must not be combined with tf.data.Dataset input (the dataset already
# yields batches), so it is not passed here.
# The History object is kept so the per-epoch loss/accuracy can be inspected later.
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=2)
history

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7c823b01c1f0>

In [21]:
# Score the held-out split; evaluate returns the loss followed by the
# compiled metric (accuracy).
test_batches = test_dataset.batch(16)
results = model.evaluate(test_batches)
print("Test loss, Test accuracy:", results)

Test loss, Test accuracy: [0.4145992696285248, 0.8150812387466431]


In [None]:
# Run inference on the test split, then take the index of the largest logit
# in each row as the predicted class.
y_pred_logits = model.predict(test_dataset.batch(16))
test_logits = y_pred_logits.logits
y_pred_classes = np.argmax(test_logits, axis=1)

In [29]:
# Display the predicted class ids (NumPy abbreviates long arrays with "...").
y_pred_classes

array([0, 2, 2, ..., 2, 2, 0])

In [24]:
# Precision, recall and F1 are averaged across classes weighted by class
# support; accuracy needs no averaging.
weighted_average = {"average": "weighted"}
precision = precision_score(y_test, y_pred_classes, **weighted_average)
recall = recall_score(y_test, y_pred_classes, **weighted_average)
f1 = f1_score(y_test, y_pred_classes, **weighted_average)
accuracy = accuracy_score(y_test, y_pred_classes)

In [25]:
# Assemble the four-line metric report and emit it with a single print.
metric_report = "\n".join([
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    f"Accuracy: {accuracy}",
])
print(metric_report)

Precision: 0.815138287834419
Recall: 0.81508125
F1-Score: 0.8150674314214681
Accuracy: 0.81508125


In [43]:
# Persist the fine-tuned model under the Kaggle working directory.
model.save('/kaggle/working/saved_models/bert')

In [44]:
import shutil

# Parent directory of the model folder written by
# model.save('/kaggle/working/saved_models/bert').
# (The previous value '/saved_models' pointed at a different directory, so the
# archive did not contain the saved model.)
folder_path = '/kaggle/working/saved_models'

# Archive only the `bert` sub-folder. Restricting the archive to base_dir keeps
# the resulting bert.zip, which is written next to that folder, from being
# swept up into its own contents.
shutil.make_archive(
    "/kaggle/working/saved_models/bert",
    'zip',
    root_dir=folder_path,
    base_dir='bert',
)

'/kaggle/working/saved_models/bert.zip'

In [39]:
# Clean the sample sentence with the same function that was applied to the
# dataset text.
input_text = "This is a bad product! I hate it."
preprocessed_input = preprocess_text(text=input_text)

In [40]:
# Encode the cleaned sentence with the same tokenizer settings used for the
# train and test splits.
input_encodings = tokenizer(
    preprocessed_input,
    max_length=64,
    truncation=True,
    padding=True,
    return_tensors='tf',
)

In [41]:
# Turn the tokenizer output into a plain dict of tensors, as was done when
# building the tf.data datasets.
input_dict = dict(input_encodings)

# Predict, then pick the class with the highest logit.
input_logits = model.predict(input_dict)
sample_logits = input_logits.logits
input_class = np.argmax(sample_logits, axis=1)



In [42]:
# Translate the class id of the single input sentence into its label.
sentiment_labels = {
    0: "Negative",
    1: "Neutral",
    2: "Positive",
}
predicted_class = input_class[0]
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Predicted Sentiment for the input text: {predicted_sentiment}")

Predicted Sentiment for the input text: Negative
