In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [None]:
import numpy as np
import pandas as pd
import keras_core as keras
import tensorflow as tf
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Record the library versions this run used, one per line.
for library in (tf, keras_nlp):
    print(library.__version__)

In [None]:
# Read the train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv("/kaggle/input/nlp-getting-started/train.csv"),
    pd.read_csv("/kaggle/input/nlp-getting-started/test.csv"),
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

In [None]:
# Store the character count of each "text" entry in a new "length" column.
df_train["length"] = df_train["text"].map(len)
df_test["length"] = df_test["text"].map(len)

# Summary statistics of the lengths; the empty string yields the blank
# separator line between the two reports.
print("Train Length Stats", df_train["length"].describe(), "", sep="\n")
print("Test Length Stats", df_test["length"].describe(), sep="\n")

In [None]:
# Training configuration shared by the cells below.
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# Number of whole batches in the training share of the data. The product is
# truncated to an int *before* the floor division so the result is an int
# (casting only NUM_TRAINING_EXAMPLES left the whole expression a float).
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 1
# tf.data.AUTOTUNE is the stable name of tf.data.experimental.AUTOTUNE.
AUTO = tf.data.AUTOTUNE

In [None]:
from sklearn.model_selection import train_test_split

# Text to score at the end; it takes no part in the split below.
X_test = df_test["text"]

# Features are the raw text, labels are the "target" column.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the labelled rows for validation; the fixed
# random_state keeps the partition the same across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

In [None]:
# Preset shared by the preprocessor and the classifier so both are built
# from the same named configuration.
preset = "distil_bert_base_en_uncased"

# Preprocessing layer configured with sequence_length=160.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Two-class classifier with the preprocessor attached, so raw strings can be
# passed directly to fit/predict.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)
classifier.summary()

In [None]:
# Compile and fine-tune the classifier.
# The optimizer is taken from the same `keras` alias that provides the loss,
# so the model, loss and optimizer all come from one Keras implementation
# instead of mixing in an optimizer from the standalone `keras` package.
classifier.compile(
    # The loss is configured for raw logits and integer class labels.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    metrics=["accuracy"],
)

# Train on the training split and evaluate on the held-out validation split
# after each epoch; `history` keeps the object returned by fit().
history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
)

In [None]:
def displayConfusionMatrix(y_true, y_pred, dataset):
    """Plot a confusion matrix for binary predictions, with F1 in the title.

    Args:
        y_true: True class labels (0 or 1), one per example.
        y_pred: Per-class scores, one row per example; the predicted class
            is the argmax over axis 1.
        dataset: Name of the split, inserted into the plot title
            (e.g. "Training").

    Returns:
        None. The figure is drawn as a side effect.
    """
    # Compute the hard predictions once and reuse them for plot and counts.
    predicted_labels = np.argmax(y_pred, axis=1)

    # Fixing the label set keeps the matrix 2x2 (and aligned with
    # display_labels) even if one class is absent from this split.
    class_labels = [0, 1]

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted_labels,
        labels=class_labels,
        display_labels=["Not Disaster", "Disaster"],
        cmap=plt.cm.Blues,
    )

    tn, fp, fn, tp = confusion_matrix(
        y_true, predicted_labels, labels=class_labels
    ).ravel()

    # F1 = tp / (tp + (fn + fp) / 2); report 0.0 when there are no positives
    # at all (true or predicted) instead of dividing by zero.
    denominator = tp + (fn + fp) / 2
    f1 = tp / denominator if denominator else 0.0

    disp.ax_.set_title(
        "Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(np.round(f1, 2))
    )


In [None]:
# Model outputs for the training split, used for the confusion matrix below.
y_pred_train = classifier.predict(X_train)



In [None]:
# Confusion matrix for the training split.
displayConfusionMatrix(y_train, y_pred_train, "Training")

In [None]:
# Model outputs for the held-out validation split.
y_pred_val = classifier.predict(X_val)


In [None]:
# Confusion matrix for the validation split.
displayConfusionMatrix(y_val, y_pred_val, "Validation")

In [None]:
# Load the sample submission file and preview its first rows.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission.head()

In [None]:
# Fill the submission frame with the model's predicted class for each test
# text; without this step the frame still holds the sample file's values.
# NOTE(review): assumes sample_submission has a "target" column and that its
# rows are in the same order as df_test -- confirm against the data files.
y_pred_test = classifier.predict(X_test)
sample_submission["target"] = np.argmax(y_pred_test, axis=1)

sample_submission.describe()

In [None]:
# Write the submission frame to submission.csv, omitting the index column.
sample_submission.to_csv("submission.csv", index = False)