In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded URL>" entries. The download loop
# splits each entry on ':', URL-decodes the second part and extracts the archive into
# KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): both URLs are signed links carrying X-Goog-Date=20240705... and
# X-Goog-Expires=259200, so they have presumably stopped working; the notebook likely has
# to be re-exported to obtain fresh links -- confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'nlp-getting-started:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F17777%2F869809%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240705%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240705T071439Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0cdf74b9f0f329282192713a25441027daf5c70a5bdb032286b65162448f0311b729c990d33cd9740762875b82ea0fd9bc39553f7a26181a71e5fef2c68f88c12066e10300f16b806ea2fd30c383c2f4ff8d0fd131e7b56eb3cb6f874301e2d5f53703b2c7803eb5bf1d64bba8e42314c684906bae6a9991975284076fb04f9c7dd3fb48d7e71919479a6138339279fd509c4dbdedcd7bde550ae13718ec240425a2864d8744ee50ba42efd42c41cd2707ec881f856aa8ac5c1705015f6a345a1003b19bd2a1f6cd4b058703d593a1d65212127a61d21d930ee07de8eed01a586a142ce1459f9dbb45585e1104ce34e25818a58b960976d4469fdd6e35fa97dd,distil_bert/keras/distil_bert_base_en_uncased/2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F4689%2F6068%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240705%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240705T071440Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D17be917872b1b9a5e4fd45243a741bcdf3fa0f7e39d524a8cd6a5c7bffa64eeef29e8cd30cd887e6d55ec232bfb22d474a3f138079a8ca3300af651bc955d88d063c1ecfd86247dfbecbfb2fd392fba1910671701fd524508dc6591442343493920f0b00a4ce2b5b30a0e439cbc1f9eff1d84645f76aa3c45196ce5198a95a16891d471d7b6ae7ff2c8178dd339307f6e4e1c92975d6ab15e323194449ba3362fe4b0d0fa10d2d5019ce5bd1389271ba7448b43f2bd5025e372ca9851efb5b57990279a6fe212667760f4fe456217db64cf807d28b80a3f3629987c1d2979e24193d7a559d4199d7b6463f79f21ea2a72c1fb25db3e14f019c3fbcbe3c3c690f'

# Directories this cell creates: archives are extracted below the input path, and the
# working path is created alongside it.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failing source is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal colon in the URL part cannot break
    # the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a usable value the progress bar is
            # skipped instead of crashing on int(None) or dividing by zero.
            content_length = fileres.headers['content-length']
            if content_length and content_length.strip().isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            size_text = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # Clamp to the bar width in case more bytes arrive than announced.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes to the file and rewind before handing the same handle
            # to the archive reader; otherwise the tail of the download may be missing.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive object so the `tarfile` module
                # is not rebound, and read via fileobj rather than reopening by name.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError must be handled before OSError because it is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        file_path = os.path.join(root, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [None]:
# Import function libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = map(
    pd.read_csv,
    (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    ),
)


### Basic information about the data

In [None]:
# Report the (rows, columns) shape and the memory footprint in MB of each DataFrame.
for split_label, frame in (('Training', df_train), ('Test', df_test)):
    megabytes = frame.memory_usage().sum() / 1024**2
    print(f'{split_label} Set Shape = {frame.shape}')
    print(f'{split_label} Set Memory Usage = {megabytes:.2f} MB')


In [None]:
# Bar chart of how many training rows carry each value of `target`.
fig, ax = plt.subplots()
sns.countplot(x='target', data=df_train, ax=ax)
ax.set_title('Distribution of Target Variable')
plt.show()

In [None]:
# Visualize the length of tweets
df_train['length'] = df_train['text'].apply(len)
df_test['length'] = df_test['text'].apply(len)

In [None]:
# Histogram (50 bins, with a KDE overlay) of tweet lengths in the training set.
fig, ax = plt.subplots()
sns.histplot(df_train['length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Tweet Lengths in Training Set')
plt.show()

In [None]:
# Histogram (50 bins, with a KDE overlay) of tweet lengths in the test set.
fig, ax = plt.subplots()
sns.histplot(df_test['length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Tweet Lengths in Test Set')
plt.show()

### Data Preprocessing

In [None]:
# Pre-process the data
batch_size = 32
num_training_examples = df_train.shape[0]
train_split = 0.8
val_split = 0.2
steps_per_epoch = int(num_training_examples) * train_split // batch_size

epochs = 2
auto = tf.data.experimental.AUTOTUNE

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Inputs are the raw tweet text; labels come from the `target` column.
X, y = df_train["text"], df_train["target"]

# Hold out a `val_split` fraction of the labelled rows for validation; the fixed
# random_state keeps the split the same across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=val_split,
    random_state=42,
)

# Text of the unlabelled test rows, scored later for the submission file.
X_test = df_test["text"]

### DistilBERT model from KerasNLP

In [None]:
# Name of the KerasNLP preset used for both the preprocessor and the classifier.
preset = "distil_bert_base_en_uncased"

# Preprocessor with an explicit sequence length.
# NOTE(review): 160 is presumably chosen to cover tweet-sized text -- confirm against
# the length histograms above.
preprocessor_options = {"sequence_length": 160, "name": "preprocessor_4_tweets"}
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset, **preprocessor_options)

# Classifier built from the same preset, wired to the preprocessor above, with two
# output classes.
classifier_options = {"preprocessor": preprocessor, "num_classes": 2}
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset, **classifier_options)

classifier.summary()

### Model Training

In [None]:
# Step 1: compile. The loss is constructed with from_logits=True.
# NOTE(review): the loss and optimizer come from `tf.keras`, while the notebook
# imports `keras_core as keras` for the multi-backend API -- confirm that these
# objects are compatible with the installed keras-nlp version.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(1e-5)
classifier.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])

# Step 2: fit on the training split, evaluating on the held-out validation split
# after every epoch. `history` is read by the accuracy report below.
fit_options = {
    "batch_size": batch_size,
    "epochs": epochs,
    "validation_data": (X_val, y_val),
}
history = classifier.fit(x=X_train, y=y_train, **fit_options)

In [None]:
# Pull the accuracy lists recorded by fit() out of the history object.
recorded_metrics = history.history
train_accuracy, val_accuracy = recorded_metrics['accuracy'], recorded_metrics['val_accuracy']

# Report the last entry of each list.
final_train_accuracy = train_accuracy[-1]
final_val_accuracy = val_accuracy[-1]
print(f"Final Training Accuracy: {final_train_accuracy:.4f}")
print(f"Final Validation Accuracy: {final_val_accuracy:.4f}")

In [None]:
# Define confusion matrix function
def displayConfusionMatrix(y_true, y_pred, dataset):
    """Plot a two-class confusion matrix and put the F1 score in its title.

    Args:
        y_true: Ground-truth class ids. Assumed to be 0 (shown as "Non-disaster")
            and 1 (shown as "Disaster") -- confirm against the caller's labels.
        y_pred: Per-class scores with one row per sample; the predicted class is
            the arg-max along axis 1.
        dataset: Name of the data split, interpolated into the plot title.
    """
    # Reduce the scores to class ids once and reuse them for the plot and the counts.
    predicted_labels = np.argmax(y_pred, axis=1)
    class_ids = [0, 1]

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted_labels,
        labels=class_ids,
        display_labels=["Non-disaster", "Disaster"],
        cmap=plt.cm.Purples
    )

    # Pinning `labels` keeps the matrix 2x2 even when one class is missing from both
    # y_true and the predictions, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, predicted_labels, labels=class_ids).ravel()

    # F1 = tp / (tp + (fn + fp) / 2). Report 0.0 when there are no true positives,
    # false positives or false negatives, instead of dividing by zero.
    denominator = tp + ((fn + fp) / 2)
    f1_score = float(tp / denominator) if denominator else 0.0

    disp.ax_.set_title("Confusion matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1_score, 2)))
    plt.show()  # Ensure the plot is displayed

In [None]:
# Score the training split, then plot those predictions against the training labels.
y_pred_train = classifier.predict(X_train)
displayConfusionMatrix(y_true=y_train, y_pred=y_pred_train, dataset="Training")

In [None]:
# Score the validation split, then plot those predictions against the validation labels.
y_pred_val = classifier.predict(X_val)
displayConfusionMatrix(y_true=y_val, y_pred=y_pred_val, dataset="Validation")

### Submission File

In [None]:
# Load the sample submission file; its `target` column is overwritten with
# predictions in a later cell.
submission_template_path = "/kaggle/input/nlp-getting-started/sample_submission.csv"
sample_submission = pd.read_csv(submission_template_path)

# Preview the first rows.
sample_submission.head()

In [None]:
# Score the test tweets and store the arg-max class of each row as the prediction.
test_scores = classifier.predict(X_test)
sample_submission["target"] = np.argmax(test_scores, axis=1)



In [None]:
# Write the submission file to the current working directory, without the index column.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)