In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split



In [2]:
# Locations of the two CSV inputs, relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

# Read each CSV into its own DataFrame; later cells use the 'text', 'target'
# and 'id' columns of these frames.
traindata = pd.read_csv(train_csv_path)
testdata = pd.read_csv(test_csv_path)

In [5]:
# Fetch the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_texts(texts):
    """Tokenize a list of strings with the notebook-level ``tokenizer``.

    Every sequence is padded or truncated to exactly 128 tokens (a token
    count, not a character count) and the result is returned as TensorFlow
    tensors, so this call requires TensorFlow to be installed.
    """
    return tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )


# Encode the raw 'text' column of each split with identical settings
X_train_tokens = encode_texts(traindata['text'].tolist())
X_test_tokens = encode_texts(testdata['text'].tolist())

# Training labels, taken from the 'target' column as an array
y_train = traindata['target'].values

ImportError: Unable to convert output to TensorFlow tensors format, TensorFlow is not installed.

In [None]:
# Build the BERT sequence-classification model from the 'bert-base-uncased'
# checkpoint. from_pt=True requests loading from the PyTorch weights
# (NOTE(review): this presumably needs PyTorch installed as well - confirm).
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    from_pt=True,
)

# Training configuration. The model emits raw logits, hence from_logits=True;
# labels are integer class ids, hence the "sparse" loss and metric variants.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# A small learning rate is used for fine-tuning the pre-trained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss, metrics=[metric], optimizer=optimizer)

In [None]:
# Fine-tune the model
#
# Keras' `validation_split` holds out the *last* fraction of the rows before
# any shuffling, so an ordered CSV would yield a biased validation set.
# Instead, draw a seeded, shuffled, label-stratified 10% hold-out and pass it
# explicitly through `validation_data`.
VAL_FRACTION = 0.1  # share of the training rows held out for validation
SPLIT_SEED = 42     # fixed seed so the split is reproducible across runs

# Split row positions rather than the tensors themselves, so the same index
# arrays can be applied to every model input and to the labels.
train_idx, val_idx = train_test_split(
    np.arange(len(y_train)),
    test_size=VAL_FRACTION,
    stratify=y_train,
    random_state=SPLIT_SEED,
)


def select_rows(encodings, row_idx):
    """Return the model-input dict restricted to the rows in ``row_idx``.

    Only 'input_ids' and 'attention_mask' are forwarded to the model.
    """
    return {
        key: tf.gather(encodings[key], row_idx)
        for key in ('input_ids', 'attention_mask')
    }


# batch_size=16 is a good starting point for BERT on a standard GPU
history = model.fit(
    select_rows(X_train_tokens, train_idx),
    y_train[train_idx],
    validation_data=(select_rows(X_train_tokens, val_idx), y_train[val_idx]),
    batch_size=16,
    epochs=2,
)

In [None]:
# Run the fine-tuned model over the encoded test set, feeding the same two
# inputs that were used for training.
test_inputs = {
    key: X_test_tokens[key] for key in ('input_ids', 'attention_mask')
}
test_predictions = model.predict(test_inputs)

# The prediction output exposes per-class scores as `.logits`; the predicted
# class for each row is the index of its largest score.
test_logits = test_predictions.logits
final_predictions = np.argmax(test_logits, axis=1)

# Assemble the submission table (one row per test id) and write it to disk
# without the DataFrame index.
submission_columns = {
    'id': testdata['id'],
    'target': final_predictions,
}
submission_bert = pd.DataFrame(submission_columns)
submission_bert.to_csv('submission_bert.csv', index=False)

# Confirm the write and preview the first rows
print("\nSubmission file 'submission_bert.csv' created successfully!")
print(submission_bert.head())