In [1]:
import tensorflow as tf

# Ask TensorFlow once which GPUs it can see, using the stable (non-experimental) API,
# and reuse that list for both the count and the device listing.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

# Show the device descriptors, or say so explicitly when no GPU is visible.
if gpus:
    print(gpus)
else:
    print("No GPU available.")

Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Download the zipped dataset archive from GitHub into the current working directory.
!wget https://github.com/TheZarif/disaster-tweets/raw/main/nlp-getting-started.zip

--2023-11-29 01:08:50--  https://github.com/TheZarif/disaster-tweets/raw/main/nlp-getting-started.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/TheZarif/disaster-tweets/main/nlp-getting-started.zip [following]
--2023-11-29 01:08:50--  https://raw.githubusercontent.com/TheZarif/disaster-tweets/main/nlp-getting-started.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp-getting-started.zip’


2023-11-29 01:08:50 (29.6 MB/s) - ‘nlp-getting-started.zip’ saved [607343/607343]



In [3]:
!unzip nlp-getting-started.zip -d data

Archive:  nlp-getting-started.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


In [4]:
import pandas as pd

# Read the CSVs from the same relative "data" directory the archive was extracted into,
# instead of assuming the working directory is Colab's /content.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Report the shape and in-memory size (in MB) of each split.
for split_name, frame in (("Training", df_train), ("Test", df_test)):
    print('{} Set Shape = {}'.format(split_name, frame.shape))
    print('{} Set Memory Usage = {:.2f} MB'.format(split_name, frame.memory_usage().sum() / 1024**2))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [7]:
# Fraction of the labelled training rows held out for validation.
VAL_SPLIT = 0.2

In [8]:
from sklearn.model_selection import train_test_split

# Model input is the tweet text; the label comes from the "target" column.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=VAL_SPLIT,
)

# Text of the unlabelled test set, used later for prediction.
X_test = df_test["text"]

In [13]:
def preprocess(text):
    """Normalise user mentions and links in a tweet.

    The text is split on whitespace. Every token longer than one character is
    rewritten as follows:
      * a token that starts with '@' and contains exactly one '@' becomes '@user';
      * a token that starts with 'http' becomes 'http'.
    All other tokens (including single-character ones) are kept as they are.

    Args:
        text: The tweet as a string.

    Returns:
        The tokens re-joined with single spaces.
    """
    def _normalise(token):
        # Single-character (or empty) tokens are never rewritten.
        if len(token) <= 1:
            return token
        if token.startswith('@') and token.count('@') == 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return ' '.join(_normalise(token) for token in text.split())

# X_train, X_val and X_test are each a Series taken from the single "text" column, so the
# normaliser is applied to them directly. Indexing a Series with ['text'] would be a label
# lookup on its index and raise KeyError.
X_train = X_train.apply(preprocess)
X_val = X_val.apply(preprocess)
X_test = X_test.apply(preprocess)

In [24]:
# Sanity check: show the Python type of the first preprocessed training example.
first_example = X_train.to_list()[0]
print(type(first_example))


<class 'str'>


In [44]:
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL = "cardiffnlp/twitter-roberta-base-2021-124m"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# TensorFlow sequence classifier with a two-label head, loaded from the checkpoint's
# PyTorch weights (from_pt=True).
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL,
    from_pt=True,
    num_labels=2,
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [45]:
# Stop training after one epoch without improvement in validation loss, and restore the
# weights from the best epoch. ('val_accuracy' is an alternative quantity to monitor.)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=1,
    monitor='val_loss',
)

In [46]:
# Training configuration: accuracy as the reported metric, sparse categorical cross-entropy
# computed on raw logits (integer class labels), and Adam with a 5e-5 learning rate.
metrics = ['accuracy']
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [35]:
def _encode(texts):
    """Tokenize a Series of strings into TensorFlow tensors with padding and truncation enabled."""
    return tokenizer(texts.to_list(), padding=True, truncation=True, return_tensors="tf")


# Tokenize a 20-row sample of the training text plus the full train, validation and test sets.
encoded_inputs_dummy = _encode(X_train.head(20))
encoded_inputs = _encode(X_train)
encoded_inputs_val = _encode(X_val)
encoded_inputs_test = _encode(X_test)

# Pull the token ids and the matching attention masks out of each encoding.
X_train_process_dum, attention_mask_dum = (
    encoded_inputs_dummy["input_ids"],
    encoded_inputs_dummy["attention_mask"],
)
X_train_process, attention_mask = (
    encoded_inputs["input_ids"],
    encoded_inputs["attention_mask"],
)
X_val_process, attention_mask_val = (
    encoded_inputs_val["input_ids"],
    encoded_inputs_val["attention_mask"],
)
X_test_process, attention_mask_test = (
    encoded_inputs_test["input_ids"],
    encoded_inputs_test["attention_mask"],
)

# Labels as constant tensors; the test set has no labels, so none are built for it.
y_train_process_dum = tf.constant(y_train.head(20).to_list())
y_train_process = tf.constant(y_train.to_list())
y_val_process = tf.constant(y_val.to_list())

In [47]:
# Model inputs are passed as dicts keyed by the names the tokenizer produced.
train_features = {"input_ids": X_train_process, "attention_mask": attention_mask}
val_features = {"input_ids": X_val_process, "attention_mask": attention_mask_val}

# Train for up to 10 epochs in batches of 32; the early-stopping callback may end training sooner.
history = model.fit(
    x=train_features,
    y=y_train_process,
    validation_data=(val_features, y_val_process),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping_callback],
)

Epoch 1/10
Epoch 2/10


In [48]:
import numpy as np

# Run the model on the tokenized test set.
test_features = {'input_ids': X_test_process, 'attention_mask': attention_mask_test}
predictions = model.predict(test_features)

# Convert logits to class probabilities, then take the most probable class per row.
probabilities = tf.nn.softmax(predictions.logits, axis=-1)
predicted_labels = np.argmax(probabilities, axis=1)



In [51]:
# Store the predicted class for each test row in a new "target" column of df_test (modified in place).
df_test['target'] = predicted_labels

In [57]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem so results can be written to it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [58]:
import os

# Write the id/target pairs as the submission CSV on the mounted Drive.
# to_csv does not create missing parent folders, so make sure the output directory exists first.
submission_path = '/content/drive/My Drive/disaster-tweets/twitter-roberta.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_test[['id', 'target']].to_csv(submission_path, index=False)