In [None]:
# Column names handed to `pd.read_csv(..., names=...)` when the raw CSV is loaded.
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
# Text encoding used to decode the raw CSV.
DATASET_ENCODING = "ISO-8859-1"

In [None]:
import re
import pickle
import numpy as np
import pandas as pd
# Plot libraries
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# TensorFlow and Hugging Face libraries
import tensorflow as tf
from transformers import AutoTokenizer

# scikit-learn utilities
from sklearn.model_selection import train_test_split



In [None]:
# Load the raw CSV; the column names are supplied through `names`.
dataset = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS,
)

In [None]:
# Keep only the label and the tweet text, shuffle reproducibly, and continue
# with the first 100,000 rows of the shuffled frame.
# `.copy()` detaches the subset from the shuffled frame, so adding columns to
# `dataset` in later cells does not raise pandas' SettingWithCopyWarning.
dataset = (
    dataset[['sentiment', 'text']]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(100000)
    .copy()
)

# Class balance of the subset.
label_counts = dataset['sentiment'].value_counts()
print(label_counts)

In [None]:
def change_label(value):
    """Map the label 4 to 1 and return every other label unchanged."""
    return 1 if value == 4 else value

In [None]:
# Regex patterns used for tweet cleaning.
# All of them are raw strings so that `\s`, `\.` and `\1` reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = r'@[^\s]+'
hashtagPattern    = r'#[^\s]+'   # not used by preprocess_apply; kept in case other cells rely on it
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"


def preprocess_apply(tweet):
    """Normalise one tweet before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete URLs (``http://``, ``https://`` or ``www.`` up to the next space);
      3. delete ``@user`` mentions;
      4. collapse any character repeated three or more times down to two;
      5. put a space on each side of every ``/``.

    Whitespace left behind by deleted URLs/mentions is not trimmed.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    tweet = tweet.lower()

    # Remove URLs entirely (they are deleted, not replaced by a placeholder).
    tweet = re.sub(urlPattern, '', tweet)
    # Remove @mentions entirely.
    tweet = re.sub(userPattern, '', tweet)

    # Collapse runs of 3+ identical characters (letters, punctuation, spaces) to 2.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Pad '/' with spaces to separate words; done after URL removal so that
    # URLs are not broken apart before they can be matched.
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

In [None]:
# Clean every tweet, then remap the sentiment labels with `change_label`.
dataset['processed_text'] = dataset['text'].map(preprocess_apply)
dataset['sentiment'] = dataset['sentiment'].map(change_label)

In [None]:
# Number of whitespace-separated tokens in each cleaned tweet.
dataset['word_count'] = dataset['processed_text'].map(lambda text: len(str(text).split()))

# Show the distribution of tweet lengths as a violin plot.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(x=dataset['word_count'], ax=ax)
ax.set_title('Violin Plot of Word Count per Tweet')
ax.set_xlabel('Word Count')
plt.show()

In [None]:
# Display how many distinct sentiment labels the frame contains.
dataset['sentiment'].nunique()

In [None]:
# Tokenizer for the 'distilbert-base-cased' checkpoint.
# NOTE(review): preprocess_apply lower-cases every tweet, so the cased
# vocabulary is never exercised — consider whether an uncased checkpoint
# would be a better fit (the model checkpoint would have to change with it).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

In [None]:
# Plain Python lists: the cleaned tweets for the tokenizer and their labels.
texts = dataset['processed_text'].tolist()
labels = list(dataset['sentiment'])

In [None]:
encoded_inputs = tokenizer(
    texts,
    max_length=40,            # Set maximum sequence length to 40 tokens
    padding='max_length',     # Pad sequences to the maximum length
    truncation=True,          # Truncate sequences longer than the maximum length
    return_tensors='np'       # Return NumPy arrays (use 'tf' for TensorFlow tensors directly)
)


In [None]:
# Pull the two model inputs out of the tokenizer output.
input_ids, attention_mask = (
    encoded_inputs[key] for key in ('input_ids', 'attention_mask')
)

In [None]:
# Sanity check: show the encoding of the first tweet.
for caption, encoded in (("Input IDs:", input_ids), ("Attention Mask:", attention_mask)):
    print(caption, encoded[0])

In [None]:
# One row per tweet; each cell of the first two columns holds that tweet's
# full token-id / mask array, and `labels` holds its sentiment label.
model_inputs = {
    'input_ids': list(input_ids),
    'attention_mask': list(attention_mask),
    'labels': labels,
}
df = pd.DataFrame(model_inputs)

In [None]:
# 80 % of the rows go to training; the held-out 20 % is then halved into
# validation and test sets (10 % each). Both splits use the same seed.
df_train, df_temp = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
def create_tf_dataset_from_df(df, batch_size=32):
    """Turn a frame of encoded tweets into a batched ``tf.data.Dataset``.

    Args:
        df: Frame with ``input_ids``, ``attention_mask`` and ``labels``
            columns. The first two are expected to hold one sequence per
            cell (assumed equal length so they stack into a 2-D array).
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``({'input_ids', 'attention_mask'}, labels)``
        batches, with every tensor cast to ``int32``. Row order is kept.
    """
    def _sequence_column(column):
        # Cells hold per-row sequences; go through a Python list to stack them.
        return tf.convert_to_tensor(np.array(df[column].tolist()), dtype=tf.int32)

    features = {
        'input_ids': _sequence_column('input_ids'),
        'attention_mask': _sequence_column('attention_mask'),
    }
    targets = tf.convert_to_tensor(np.array(df['labels']), dtype=tf.int32)

    return tf.data.Dataset.from_tensor_slices((features, targets)).batch(batch_size)

In [None]:
# Build one batched dataset per split, in train / validation / test order.
train_dataset_tf, val_dataset_tf, test_dataset_tf = (
    create_tf_dataset_from_df(split_df, batch_size=32)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
from transformers import TFDistilBertForSequenceClassification

# Load the 'distilbert-base-cased' checkpoint configured for two output labels.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

In [None]:
# Fine-tuning setup: Adam with a small learning rate, and a sparse
# cross-entropy loss that consumes the model's raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)

# Train for two epochs, scoring against the validation split.
fit_options = {'epochs': 2, 'validation_data': val_dataset_tf}
history = model.fit(train_dataset_tf, **fit_options)

In [None]:
# Score the fine-tuned model on the held-out test split.
test_metrics = model.evaluate(test_dataset_tf)
test_loss, test_accuracy = test_metrics

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Collect the ground-truth labels from the test dataset, batch by batch.
# The loop variable is deliberately NOT called `labels`: that name already holds
# the notebook-level label list used to build `df`, and overwriting it would
# break any re-run of that earlier cell.
actual_labels = []
for _features, batch_labels in test_dataset_tf:
    actual_labels.extend(batch_labels.numpy())

# Convert actual labels to a NumPy array
actual_labels = np.array(actual_labels)

# Step 2: Generate predictions; the predicted class is the arg-max over the logits.
predictions = model.predict(test_dataset_tf)
predicted_labels = np.argmax(predictions.logits, axis=-1)

# Step 3: Calculate the confusion matrix (rows = true labels, columns = predictions)
conf_matrix = confusion_matrix(actual_labels, predicted_labels)

# Step 4: Plot the confusion matrix
class_names = ['Class 0', 'Class 1']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Step 5: Print the per-class precision / recall / F1 report
class_report = classification_report(actual_labels, predicted_labels, target_names=class_names)
print("Classification Report:")
print(class_report)

In [None]:
# Write the fine-tuned model to ./sentiment_bert (relative to the working directory).
model.save_pretrained("./sentiment_bert")

In [None]:
# Reload the fine-tuned model from the same relative directory it was saved to,
# rather than a hard-coded absolute path that only exists on Kaggle.
load = TFDistilBertForSequenceClassification.from_pretrained("./sentiment_bert")

In [None]:
sentence = 'Great, another monday...'

# Run the sentence through the same cleaning as the training tweets so the
# model sees inference text in the form it was fine-tuned on.
encoded_sent = tokenizer(
    preprocess_apply(sentence),
    max_length=40,            # Same fixed sequence length as used for training
    padding='max_length',     # Pad sequences to the maximum length
    truncation=True,          # Truncate sequences longer than the maximum length
    return_tensors='np'       # Return NumPy arrays (use 'tf' for TensorFlow tensors directly)
)

# The tokenizer returns a BatchEncoding; convert it to a plain dict before
# handing it to Keras. Element [0] of the model output holds the logits.
tf_output = load.predict(dict(encoded_sent))[0]

# Softmax over the class axis gives the class probabilities for the sentence.
tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
tf_prediction

As you can see, the model was still able to assign the sarcastic sentence to the correct class, 0, which indicates negative sentiment.