# **Imports**

In [None]:
!pip install -q keras-core --upgrade
!pip install -q keras-nlp --upgrade
!pip install -q tensorflow-text

In [None]:
!pip install -q contractions

In [None]:
import contractions
from textblob import TextBlob

In [None]:
# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
# The variable is set in this cell, ahead of the later cell that imports
# keras_core and keras_nlp.
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [None]:
!pip install -q --upgrade keras

You might have to restart the kernel here so that the upgraded Keras package is picked up by the imports below.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, FileLink

from tqdm import tqdm
tqdm.pandas()

In [None]:
import tensorflow as tf
import keras_core as keras
import keras_nlp

from tensorflow.keras.callbacks import EarlyStopping


# **Load the data**

In [None]:
# Directories
# Folder that holds train.csv and test.csv; the trailing slash matters because
# file names are appended to it by plain string concatenation.
data_dir = '/kaggle/input/nlp-getting-started/'

In [None]:
# Read both competition files from the data directory.
train_df = pd.read_csv(f"{data_dir}train.csv")
test_df = pd.read_csv(f"{data_dir}test.csv")

# Report how many rows each set contains.
print(f'Training Set Size: {len(train_df)}')
print(f'Test Set Size: {len(test_df)}')

In [None]:
# Total number of training rows, then the number of rows that have a keyword.
print(train_df.shape[0])
print(int(train_df['keyword'].notna().sum()))

In [None]:
# Preview the first rows of the test set.
test_df.head()

# **Clean the data**

In [None]:
# Show a few rows of each column that still contain the URL-encoded space '%20'.
for column in ('keyword', 'text'):
    print(f"{column} column")
    has_encoded_space = train_df[column].str.contains('%20', na = False)
    display(train_df[has_encoded_space].head())

# Turn '%20' back into a plain space in the keyword column.
# '%20' has no regex metacharacters, so a literal replacement is equivalent.
train_df['keyword'] = train_df['keyword'].str.replace('%20', ' ', regex = False)

In [None]:
# Prefix each tweet with its keyword, separated by a single space.
# Rows without a keyword get an empty prefix, so the text starts with the space.
# NOTE: re-running this cell prepends the keyword again.
keyword_prefix = train_df['keyword'].fillna('')
train_df['text'] = keyword_prefix.str.cat(train_df['text'], sep = ' ')

# Verify the results on rows that actually have a keyword
train_df.loc[train_df['keyword'].notna()].head()

In [None]:
# Remove URLs
# train_df['text'] = train_df['text'].str.replace(r'http\S+|www\S+|https\S+', '', regex=True)

In [None]:
# Remove mentions
# train_df['text'] = train_df['text'].str.replace(r'@\w+', '', regex=True)

In [None]:
# Remove hashtags
# train_df['text'] = train_df['text'].str.replace(r'#\w+', '', regex=True)

In [None]:
# # Expand contractions
# train_df['text'] = train_df['text'].apply(lambda x: contractions.fix(x))

In [None]:
# # Handle misspellings
# train_df['text'] = train_df['text'].progress_apply(lambda x: str(TextBlob(x).correct()))

In [None]:
# Simple cleaning
# Step 1: drop every character that is not an English letter (either case) or whitespace.
# Step 2: lower-case what is left.
# Note: '#BigStory' becomes 'bigstory' -- only the '#' is removed, the word after it stays.
letters_only = train_df['text'].str.replace(r'[^a-zA-Z\s]', '', regex = True)
train_df['text'] = letters_only.str.lower()

In [None]:
# Show the cleaned data frame, then write it (without the index) to the working directory.
cleaned_csv_path = '/kaggle/working/train_df.csv'
display(train_df.head())
train_df.to_csv(cleaned_csv_path, index = False)

In [None]:
# Reload the cleaned data frame from the working directory and preview it.
cleaned_csv_path = "/kaggle/working/train_df.csv"
train_df = pd.read_csv(cleaned_csv_path)
display(train_df.head())

# **Mislabeled samples**

In [None]:
# Tweets whose identical text appears with more than one distinct target are
# treated as mislabeled and removed from the training data.
print(len(train_df))

# Count distinct values per text; a 'target' count above 1 means conflicting labels.
mislabeled_df = train_df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
mislabeled_tweets = mislabeled_df.index[mislabeled_df['target'] > 1].tolist()

# .copy() makes the filtered frame independent of the original, so that later
# cells which add columns to train_df do not assign into a slice
# (which raises pandas' SettingWithCopyWarning).
train_df = train_df[~train_df['text'].isin(mislabeled_tweets)].copy()
print(len(train_df))

# **Exploratory Data Analysis**

In [None]:
# Compare the tweet-length (character count) distribution of the two classes.
train_df['length'] = train_df['text'].apply(len)

for label in (0, 1):
    if label:
        # Blank line between the two reports
        print()
    print(f"Tweet-length stats: class {label}")
    print(train_df.loc[train_df['target'] == label, 'length'].describe())



In [None]:
# Temporary helper column: mean target per keyword, used only to order the plot.
train_df['target_mean'] = train_df.groupby('keyword')['target'].transform('mean')

# Sort once and reuse the result for both the y values and the hue.
rows_by_target_mean = train_df.sort_values(by='target_mean', ascending=False)

fig = plt.figure(figsize=(8, 72), dpi=100)

sns.countplot(y=rows_by_target_mean['keyword'], hue=rows_by_target_mean['target'])

for axis_name, font_size in (('x', 15), ('y', 12)):
    plt.tick_params(axis=axis_name, labelsize=font_size)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()

# Remove the helper column again so train_df keeps its original columns.
train_df.drop(columns=['target_mean'], inplace=True)

# **Take a sample of the data**

To speed up early hyper-parameter tuning, we'll start by training the model on just a portion of the training set, perhaps 10%. As we continue fine-tuning the hyper-parameters, we'll increase this portion to 25%, 50%, and finally to 100%.

In [None]:
# Fraction of the training data to use. Start with 0.1 for quick experiments,
# then raise it to 0.25, 0.5, and finally 1.
sample_fraction = 1
train_df_to_use = train_df.sample(frac = sample_fraction, random_state = 42)

# Verify the sample by comparing row counts
for description, frame in (('Full training set', train_df),
                           ('Sampled training set', train_df_to_use)):
    print(f'{description}: {len(frame)}')

# **Split the data**

In [None]:
# Fraction of the (sampled) training data held out for validation
VAL_SPLIT = 0.2

# The "text" col holds the tweets;
# the "target" col has 0 (not disaster-related) or 1 (disaster-related)
X = train_df_to_use["text"]
y = train_df_to_use["target"]

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = VAL_SPLIT, random_state = 42)

# Build the test tweets with the SAME preprocessing that was applied to the
# training tweets, so the model sees identically formatted input at
# prediction time:
#   1. '%20' in the keyword becomes a space,
#   2. the keyword (or an empty string) plus a space is prepended to the text,
#   3. everything except English letters and whitespace is removed,
#   4. the result is lower-cased.
# test_df itself is left untouched (its "id" column is needed for the submission).
test_keyword_prefix = test_df["keyword"].str.replace('%20', ' ', regex = False).fillna('')
X_test = (
    (test_keyword_prefix + ' ' + test_df["text"])
    .str.replace(r'[^a-zA-Z\s]', '', regex = True)
    .str.lower()
)

# **Model**

In [None]:
# Number of samples per gradient step
BATCH_SIZE = 16

# Upper bound on training epochs
EPOCHS = 20

# Sentinel that lets tf.data choose parallelism / buffer sizes dynamically
AUTO = tf.data.AUTOTUNE

In [None]:
# Load a DistilBERT model.
preset = "distil_bert_base_en_uncased"

# Get the max-length of the tweets (in characters)
max_length = int(train_df['text'].str.len().max())
print("The longest tweet is", max_length, "characters long.")

# The character count is used as the token sequence length.
# NOTE(review): this is presumably a loose upper bound on the token count -- confirm.
# It is capped at 512 because the DistilBERT base model provides at most 512
# position embeddings; a longer sequence_length cannot be fed to the model.
sequence_length = min(max_length, 512)

# Preprocessor: tokenizes raw strings and pads/truncates them to sequence_length
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length = sequence_length,
    name = "preprocessor_4_tweets",
)

# Pretrained classifier with a 2-class head (not disaster / disaster).
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor = preprocessor,
    num_classes = 2,
)

classifier.summary()

In [None]:
# Compile
# The classifier outputs raw logits for 2 classes and the targets are integer
# labels, hence sparse categorical cross-entropy with from_logits = True.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits = True)

# NOTE(review): the loss comes from keras_core (imported as `keras`) while the
# optimizer comes from tf.keras -- confirm both work with the installed versions.
optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-6)

classifier.compile(loss = loss_fn, optimizer = optimizer, metrics = ["accuracy"])


# **Model training**

In [None]:
# Early stopping: watch the validation loss, allow 3 epochs without improvement,
# and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor = 'val_loss',
    patience = 3,
    restore_best_weights = True,
)

# Fit
fit_options = {
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'validation_data': (X_val, y_val),
    'callbacks': [early_stopping],
}
history = classifier.fit(x = X_train, y = y_train, **fit_options)

# **Evaluation**

In [None]:
# Evaluate the model on the validation set.
# evaluate() returns the loss followed by the compiled metric (accuracy).
validation_results = classifier.evaluate(X_val, y_val, verbose = 2)
score, accuracy = validation_results
print(f"Validation Accuracy: {accuracy}")

# **Prediction and Confusion Matrix**

In [None]:
# Function that outputs a confusion matrix
def displayConfusionMatrix(y_true, y_pred, dataset):
    """Plot a binary confusion matrix and show the F1 score in its title.

    Args:
        y_true: True class labels (0 = not disaster, 1 = disaster).
        y_pred: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1.
        dataset: Name of the dataset, used in the plot title.

    Returns:
        None. The plot is drawn as a side effect.
    """
    class_ids = [0, 1]

    # Convert the per-class scores to predicted labels once and reuse them.
    predicted_labels = np.argmax(y_pred, axis = 1)

    # Passing `labels` explicitly keeps the matrix 2x2 even when only one class
    # occurs in y_true / y_pred, so the display labels and the 4-way unpacking
    # below always line up.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted_labels,
        labels = class_ids,
        display_labels = ["Not Disaster", "Disaster"],
        cmap = plt.cm.Blues
    )

    tn, fp, fn, tp = confusion_matrix(y_true, predicted_labels, labels = class_ids).ravel()

    # F1 = tp / (tp + (fn + fp) / 2); report 0.0 when there are no positives
    # at all (neither true nor predicted) instead of dividing by zero.
    denominator = tp + ((fn + fp) / 2)
    f1_score = float(tp / denominator) if denominator > 0 else 0.0

    disp.ax_.set_title("Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1_score, 2)))

In [None]:
# Per-class scores for every validation tweet
y_val_pred = classifier.predict(X_val)

# Plot the confusion matrix (with F1 score) for the validation data
displayConfusionMatrix(y_true = y_val, y_pred = y_val_pred, dataset = "Validation")

# **Predict on test data and generate submission file**

In [None]:
# Score the test tweets, then take the highest-scoring class for each one.
test_scores = classifier.predict(X_test)
test_pred = np.argmax(test_scores, axis = 1)

In [None]:
# Pair every test id with its predicted target.
test_ids = test_df["id"]
submission = pd.DataFrame({'id': test_ids, 'target': test_pred})

# Take a look at the results
print(submission.head())

# Timestamped file name, so repeated runs do not overwrite earlier submissions
timestamp = datetime.now().strftime('%Y-%m-%d_%H%M')
filename = f'dtsa-5511-m4-submission_{timestamp}.csv'

# Write the submission without the index column
submission.to_csv(filename, index = False)

# Generate a download link (displayed as the cell output).
FileLink(filename)