In [1]:
%pip install keras-core --upgrade
%pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

Collecting keras-core
  Using cached keras_core-0.1.5-py3-none-any.whl.metadata (4.0 kB)
Collecting absl-py (from keras-core)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting numpy (from keras-core)
  Using cached numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Collecting rich (from keras-core)
  Using cached rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting namex (from keras-core)
  Using cached namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting h5py (from keras-core)
  Using cached h5py-3.11.0-cp38-cp38-win_amd64.whl.metadata (2.5 kB)
Collecting dm-tree (from keras-core)
  Using cached dm_tree-0.1.8-cp38-cp38-win_amd64.whl.metadata (2.0 kB)
Collecting markdown-it-py>=2.2.0 (from rich->keras-core)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->keras-core)
  Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
Using cached keras_core-0.1.5-py3-none-a

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Report the library versions in use so the run can be reproduced later.
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")

TensorFlow version: 2.10.1
KerasNLP version: 0.6.1


In [124]:
from sklearn.model_selection import train_test_split


def build_model_input(frame):
    """Return one text per row: '<target> Statement: <statement> | Tweet: <tweet>'.

    The separators carry their surrounding spaces so the fields do not run
    into each other (e.g. 'True Statement: ...' rather than 'TrueStatement: ...').
    """
    return (
        frame['target'].astype(str)
        + ' Statement: ' + frame['statement']
        + ' | Tweet: ' + frame['tweet']
    )


df = pd.read_csv("Truth_Seeker_Model_Dataset.csv")

# Text and binary target for the full dataset.
sentences = build_model_input(df)
labels = df["BinaryNumTarget"].values

# 80/20 split with a fixed seed so the partition is reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Prepare train sentences and labels
train_sentences = build_model_input(df_train)
train_labels = df_train["BinaryNumTarget"].values

# Prepare test sentences and labels
test_sentences = build_model_input(df_test)
test_labels = df_test["BinaryNumTarget"].values

# Keep a full copy of the test split (majority-answer columns included)
# before those columns are removed from df_test.
df2_test = df_test.copy()
df_test = df_test.drop(columns=['5_label_majority_answer', '3_label_majority_answer'])

print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))

Training Set Shape = (107358, 9)
Training Set Memory Usage = 7.47 MB
Test Set Shape = (26840, 7)
Test Set Memory Usage = 1.46 MB


In [125]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
26587,26587,Miriam Valverde,"""We've had Muslims in America since George Was...",True,1.0,"Muslims, America, George Washington",OMG radical Islamic. That's disgusting. Muslim...,Mostly Disagree,Disagree
36010,36010,Louis Jacobson,"""According to the Centers for Disease Control ...",True,1.0,"CDC, overdose",@joshdcaplan Each of Drug overdose deaths were...,Agree,Agree
80031,80031,Clara Hendrickson,"""Michigan has requested a full forensic audit.""",False,0.0,"Michigan,audit,forensic",@MIAttyGen I'm looking forward to a forensic a...,Agree,Agree
130086,130086,Tom Kertscher,"""Black Lives Matter is a terrorist organization.""",False,0.0,"Black Lives Matter, terrorist organization",@hoosier_patrick @SteveHinnefeld 99% of people...,Agree,Agree
54842,54842,Gabrielle Settles,There were no guns whatsoever at the Capitol r...,False,0.0,capitol gun jan,"""As the pro-Trump mob seized Mike Fanone, a D....",NO MAJORITY,Disagree


In [126]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet
122451,122451,Daniel Funke,At 17 years old Kyle (Rittenhouse) was perfect...,False,0.0,"Kyle,possess,rifle,legal",@fattycattie @robquinnpc @SammyTMcCarty @WCCO ...
14029,14029,Jill Terreri,"""I actually in 2005 ran on Medicare for all.""",True,1.0,"2005, ran on Medicare","never forget, Matt Santos ran on Medicare for ..."
51449,51449,Miriam Valverde,COVID-19 vaccinations are a violation of the N...,False,0.0,"vaccine,Nuremberg code,viloation",@BilldeBlasio You are mandating an experimenta...
17981,17981,Manuela Tobias,"""You know what Amazon paid in federal income t...",True,1.0,"Amazon, federal income taxes, zero",@SenSanders The working class is literally bei...
66284,66284,Julie Kliegman,"""Crimea became part of Ukraine only in 1954. C...",True,1.0,"Crimea, historically, Russia",@PeterTong9 @timand2037 @ClimateAudit But none...


In [127]:
def generate_truthfulness_4way(row):
    """Map a row's 5-label majority answer to a four-level truthfulness name.

    Reads ``row['target']`` and ``row['5_label_majority_answer']``. When the
    target compares equal to True, agreement maps to "True"/"Mostly True" and
    disagreement to "False"/"Mostly False"; otherwise the mapping is mirrored.
    Any other majority answer yields None.
    """
    verdict_if_target_true = {
        'Agree': "True",
        'Disagree': "False",
        'Mostly Agree': "Mostly True",
        'Mostly Disagree': "Mostly False",
    }
    verdict_if_target_false = {
        'Agree': "False",
        'Disagree': "True",
        'Mostly Agree': "Mostly False",
        'Mostly Disagree': "Mostly True",
    }

    # `== True` (not `is True`) is kept on purpose: it also accepts values that
    # merely compare equal to True.
    if row['target'] == True:
        lookup = verdict_if_target_true
    else:
        lookup = verdict_if_target_false
    return lookup.get(row['5_label_majority_answer'])

def generate_truthfulness_2way(row):
    """Map a row's 3-label majority answer to "True" or "False".

    Reads ``row['target']`` and ``row['3_label_majority_answer']``. When the
    target compares equal to True, 'Agree' gives "True" and 'Disagree' gives
    "False"; otherwise the two are swapped. Any other majority answer yields
    None.
    """
    majority = row['3_label_majority_answer']
    if majority not in ('Agree', 'Disagree'):
        return None

    agrees = majority == 'Agree'
    # `== True` (not `is True`) is kept on purpose: it also accepts values that
    # merely compare equal to True.
    if row['target'] == True:
        return "True" if agrees else "False"
    return "False" if agrees else "True"

In [None]:
# Label a copy so the frame produced by the split is left as it was.
df2_train = df_train.copy()

# The 4-way variant is disabled; it would store generate_truthfulness_4way's
# result in '4-way-label' and encode it with
# {'True': 0, 'False': 1, 'Mostly True': 2, 'Mostly False': 3}.

# Row-wise 2-way verdict, then encoded as class ids: 'True' -> 0, 'False' -> 1.
# Rows for which the helper returns None are not touched by the replacement.
df2_train['2-way-label'] = df2_train.apply(generate_truthfulness_2way, axis=1)
df2_train['2-way-label'] = df2_train['2-way-label'].replace({'True': 0, 'False': 1})


In [129]:
# Point df_train at the labelled frame. This is a rebinding, not a copy:
# df_train and df2_train refer to the same DataFrame object after this line.
df_train = df2_train

In [130]:
# Remove the two majority-answer columns from the training frame; the result
# is a new DataFrame bound to df_train.
df_train = df_train.drop(columns=['5_label_majority_answer', '3_label_majority_answer'])

In [131]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,2-way-label
26587,26587,Miriam Valverde,"""We've had Muslims in America since George Was...",True,1.0,"Muslims, America, George Washington",OMG radical Islamic. That's disgusting. Muslim...,1
36010,36010,Louis Jacobson,"""According to the Centers for Disease Control ...",True,1.0,"CDC, overdose",@joshdcaplan Each of Drug overdose deaths were...,0
80031,80031,Clara Hendrickson,"""Michigan has requested a full forensic audit.""",False,0.0,"Michigan,audit,forensic",@MIAttyGen I'm looking forward to a forensic a...,1
130086,130086,Tom Kertscher,"""Black Lives Matter is a terrorist organization.""",False,0.0,"Black Lives Matter, terrorist organization",@hoosier_patrick @SteveHinnefeld 99% of people...,1
54842,54842,Gabrielle Settles,There were no guns whatsoever at the Capitol r...,False,0.0,capitol gun jan,"""As the pro-Trump mob seized Mike Fanone, a D....",0


In [132]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet
122451,122451,Daniel Funke,At 17 years old Kyle (Rittenhouse) was perfect...,False,0.0,"Kyle,possess,rifle,legal",@fattycattie @robquinnpc @SammyTMcCarty @WCCO ...
14029,14029,Jill Terreri,"""I actually in 2005 ran on Medicare for all.""",True,1.0,"2005, ran on Medicare","never forget, Matt Santos ran on Medicare for ..."
51449,51449,Miriam Valverde,COVID-19 vaccinations are a violation of the N...,False,0.0,"vaccine,Nuremberg code,viloation",@BilldeBlasio You are mandating an experimenta...
17981,17981,Manuela Tobias,"""You know what Amazon paid in federal income t...",True,1.0,"Amazon, federal income taxes, zero",@SenSanders The working class is literally bei...
66284,66284,Julie Kliegman,"""Crimea became part of Ukraine only in 1954. C...",True,1.0,"Crimea, historically, Russia",@PeterTong9 @timand2037 @ClimateAudit But none...


In [133]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

MAX_SENTENCE_LENGTH = 410

# Fetch the uncased DistilBERT tokenizer from the Hugging Face hub/cache.
print('Loading BERT tokenizer...')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Loading BERT tokenizer...


In [134]:
# Training hyper-parameters and split sizes.
BATCH_SIZE = 40
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# Number of full batches in the training share. The product with the float
# TRAIN_SPLIT is truncated to int *before* the floor division so the result is
# an int (the previous expression produced a float such as 2147.0).
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 2
# tf.data.AUTOTUNE is the non-experimental name of the same constant.
AUTO = tf.data.AUTOTUNE

In [135]:
from sklearn.model_selection import train_test_split


# Model input: '<target> Statement: <statement> | Tweet: <tweet>' per row.
X = df_train["target"].astype(str) + ' Statement: ' + df_train['statement'] + ' | Tweet: ' + df_train['tweet']
y = df_train["2-way-label"]
# y = df_train["4-way-label"]

# Hold out VAL_SPLIT of the training frame for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SPLIT, random_state=42)

X_test = df_test["target"].astype(str) + ' Statement: ' + df_test['statement'] + ' | Tweet: ' + df_test['tweet']
# Test labels must use the same encoding as y (0 = 'True', 1 = 'False' from the
# 2-way verdict). BinaryNumTarget is a different quantity with 1.0 for a true
# statement, so it cannot be compared with the classifier's predictions.
# df2_test is the copy of the test split that still has the majority-answer
# columns and shares df_test's index, so y_test lines up with X_test.
y_test = (
    df2_test.apply(generate_truthfulness_2way, axis=1)
    .replace({'True': 0, 'False': 1})
)

In [136]:
# Preset name shared by the preprocessor and the classifier below.
preset = "distil_bert_base_en_uncased"

# Preprocessor from the preset, with a shorter sequence length (160).
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Pretrained two-class classifier with the preprocessor above attached.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    num_classes=2,
    preprocessor=preprocessor,
)

classifier.summary()

# Train your own model, fine-tuning BERT

In [138]:
import tensorflow as tf
# NOTE(review): this rebinds the name `keras`, which the import cell bound to
# keras_core, to tf.keras for every cell executed after this one. Confirm that
# tf.keras is the implementation the classifier was actually built with.
from tensorflow import keras

# Compile the model
classifier.compile(
    # from_logits=True makes the loss treat the model outputs as raw logits.
    # NOTE(review): assumes the classifier emits logits — confirm.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), 
    metrics=["accuracy"]
)

# Fit the model
# Inputs are the text/label splits produced by train_test_split; validation
# metrics are computed on (X_val, y_val) at the end of each epoch.
history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS, 
    validation_data=(X_val, y_val)
)

Epoch 1/2
  46/2148 [..............................] - ETA: 9:46:32 - loss: 0.2023 - accuracy: 0.9495

KeyboardInterrupt: 

In [None]:
def displayConfusionMatrix(y_true, y_pred, dataset, display_labels=("True", "False")):
    """Plot a binary confusion matrix and put the F1 score in its title.

    Args:
        y_true: Ground-truth class ids (0 or 1), one per example.
        y_pred: 2-D array of per-class scores, one row per example; the
            predicted class is the argmax over axis 1.
        dataset: Name of the split, inserted into the plot title.
        display_labels: Tick labels for class 0 and class 1, in that order.
            The default matches the '2-way-label' encoding used for training
            (0 = "True", 1 = "False").

    The F1 score treats class 1 as the positive class. It is reported as 0.0
    when there are no true positives, false positives or false negatives.
    """
    # Compute the predicted classes once and reuse them for plot and score.
    predicted = np.argmax(y_pred, axis=1)
    # Fixing the label set keeps the matrix 2x2 even if a class is absent from
    # this split, so the four-way unpacking below cannot fail.
    class_ids = [0, 1]

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted,
        labels=class_ids,
        display_labels=list(display_labels),
        cmap=plt.cm.Blues
    )

    tn, fp, fn, tp = confusion_matrix(y_true, predicted, labels=class_ids).ravel()
    denominator = tp + (fn + fp) / 2
    f1_score = float(tp / denominator) if denominator else 0.0

    disp.ax_.set_title("Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1_score, 2)))

In [None]:
# Score every training example, then plot the confusion matrix for the split.
# displayConfusionMatrix takes the argmax over axis 1 of these predictions.
# NOTE(review): this runs inference over the whole training split, which is
# presumably slow on the hardware used for the interrupted fit — confirm.
y_pred_train = classifier.predict(X_train)

displayConfusionMatrix(y_train, y_pred_train, "Training")

In [None]:
# Score the held-out validation examples and plot their confusion matrix.
# displayConfusionMatrix takes the argmax over axis 1 of these predictions.
y_pred_val = classifier.predict(X_val)

displayConfusionMatrix(y_val, y_pred_val, "Validation")

# Generate the submission file 

For each tweet in the test set, we predict the class the classifier was trained on (the `2-way-label`): 0 if the tweet is judged "True" and 1 if it is judged "False".

The `submission.csv` file uses the following format:
`id,target`

In [None]:
# Build the submission skeleton from this notebook's own test split instead of
# a Kaggle sample file: it then has exactly one row per test example, so a
# prediction array computed from the test split can be assigned to "target".
# Columns follow the `id,target` format; the id is the test row's index label.
sample_submission = pd.DataFrame({"id": df_test.index, "target": 0})
sample_submission.head()

In [None]:
# Predicted class id per test example (argmax over the per-class scores).
# The assignment needs sample_submission to have one row per entry of X_test.
sample_submission["target"] = np.argmax(classifier.predict(X_test), axis=1)

In [None]:
# Summary statistics of the submission frame as a quick sanity check.
sample_submission.describe()

In [None]:
# Write the submission to the working directory without the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)