In [1]:
import pandas as pd
import tensorflow as tf # Import tensorflow

# Load the datasets
try:
    train_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
    test_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
    sample_submission_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv')

    print("Data loaded successfully!")
    print("\nTraining data head (showing 'body' as the text column):")
    print(train_df[['row_id', 'body', 'rule_violation']].head())
    print("\nTraining data info:")
    train_df.info()

    print("\nTest data head (showing 'body' as the text column):")
    print(test_df[['row_id', 'body']].head())
    print("\nTest data info:")
    test_df.info()

    print("\nSample Submission head:")
    print(sample_submission_df.head())

except FileNotFoundError:
    print("Error: Make sure 'train.csv', 'test.csv', and 'sample_submission.csv' are in the same directory.")
except Exception as e:
    print(f"An error occurred while loading data: {e}")

# Verify GPU availability
print("\nChecking for GPU availability:")
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
if len(tf.config.experimental.list_physical_devices('GPU')) > 0:
    print("GPU is available and will be used.")
else:
    print("No GPU available. Training will be slow on CPU.")



2025-07-24 01:12:32.577447: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753319552.859512      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753319552.944269      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Data loaded successfully!

Training data head (showing 'body' as the text column):
   row_id                                               body  rule_violation
0       0  Banks don't want you to know this! Click here ...               0
1       1  SD Stream [ ENG Link 1] (http://www.sportsstre...               0
2       2  Lol. Try appealing the ban and say you won't d...               1
3       3  she will come your home open her legs with  an...               1
4       4  code free tyrande --->>> [Imgur](http://i.imgu...               1

Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2029 entries, 0 to 2028
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              2029 non-null   int64 
 1   body                2029 non-null   object
 2   rule                2029 non-null   object
 3   subreddit           2029 non-null   object
 4   positive_example_1  2029 non-null  

2025-07-24 01:12:49.600335: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Feature Extraction(for Transformers)

In [2]:
from transformers import AutoTokenizer

# Choose a pre-trained transformer model
# Good options: 'distilbert-base-uncased', 'bert-base-uncased', 'roberta-base'
# 'distilbert-base-uncased' is a good balance of speed and performance for a start.
model_name = 'distilbert-base-uncased'
max_length = 128 # Max sequence length for the transformer. Adjust based on average comment length.

# Load the tokenizer
print(f"Loading tokenizer for {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer loaded.")

# Prepare training data inputs
print("Tokenizing training data...")
train_encodings = tokenizer(
    train_df['body'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf'
)
y_train = train_df['rule_violation'].values # Get labels as a NumPy array

# Prepare test data inputs
print("Tokenizing test data...")
test_encodings = tokenizer(
    test_df['body'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf'
)

# Create TensorFlow Datasets for efficient training and prediction
# Inputs are dicts containing 'input_ids', 'attention_mask', (and sometimes 'token_type_ids')
# Labels are y_train for the training dataset
BATCH_SIZE = 16 # Batch size for training. Can be 16, 32, 64 etc. Adjust based on GPU memory.

print("\nCreating TensorFlow Datasets...")
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).shuffle(1000).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(BATCH_SIZE)

print(f"Train dataset created with batch size: {BATCH_SIZE}")
print("Test dataset created.")

print("\nTransformer-specific feature extraction complete.")


Loading tokenizer for distilbert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded.
Tokenizing training data...
Tokenizing test data...

Creating TensorFlow Datasets...
Train dataset created with batch size: 16
Test dataset created.

Transformer-specific feature extraction complete.


Transfomer Model Definition and Fine-Tuning

In [3]:
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf # Ensure tensorflow is imported

# Load the pre-trained model with a classification head
# For binary classification, num_labels=1 for sigmoid output
print(f"Loading pre-trained model for fine-tuning: {model_name}...")
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, from_pt=False)
print("Model loaded.")

# Define optimizer (critical for transformers)
# Use a very low learning rate for fine-tuning
learning_rate = 5e-5 # Common learning rate for transformer fine-tuning
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Define loss function and metrics
# from_logits=True if the model output is logits (raw scores), False if it's probabilities (sigmoid activated)
# TFAutoModelForSequenceClassification typically outputs logits, so we use from_logits=True
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.AUC(name='auc')] # AUC is the competition metric

# Compile the model
print("Compiling the model...")
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
model.summary()
print("Model compiled.")

# Fine-tune the model
epochs = 3 # Number of epochs. 2-5 is common for fine-tuning transformers.
print(f"\nFine-tuning the model for {epochs} epochs...")

# For validation during training, you would split train_dataset into train_subset and val_subset.
# For simplicity and direct training for submission, we'll train on the full 'train_dataset'.
# In a real scenario, you'd use a validation split or K-fold cross-validation
# to monitor overfitting and pick the best epoch.

history = model.fit(
    train_dataset,
    epochs=epochs,
    # validation_data=val_dataset, # Uncomment if you create a validation_dataset
    verbose=1
)
print("Model fine-tuning complete.")

# Optional: You can plot training history (loss and AUC) if you had a validation set
# (requires modifications to 'model.fit' to include validation_data)
# import matplotlib.pyplot as plt
# if 'val_loss' in history.history: # Check if validation data was used
#     plt.figure(figsize=(12, 5))
#     plt.subplot(1, 2, 1)
#     plt.plot(history.history['loss'], label='Train Loss')
#     plt.plot(history.history['val_loss'], label='Validation Loss')
#     plt.title('Loss over Epochs')
#     plt.xlabel('Epoch')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.subplot(1, 2, 2)
#     plt.plot(history.history['auc'], label='Train AUC')
#     plt.plot(history.history['val_auc'], label='Validation AUC')
#     plt.title('AUC over Epochs')
#     plt.xlabel('Epoch')
#     plt.ylabel('AUC')
#     plt.legend()
#     plt.tight_layout()
#     plt.show()



Loading pre-trained model for fine-tuning: distilbert-base-uncased...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Model loaded.
Compiling the model...
Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66954241 (255.41 MB)
Trainable params: 66954241 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model com

Prediction And Submission 

In [4]:
import numpy as np # Import numpy for array operations

# Make predictions on the preprocessed test data
print("Generating predictions on the test data...")
# The model outputs logits, which need to be converted to probabilities using sigmoid
logits = model.predict(test_dataset).logits # Access the 'logits' attribute from the model output
predictions = tf.nn.sigmoid(logits).numpy().flatten() # Apply sigmoid and flatten
print("Predictions generated.")

# Create the submission DataFrame
submission_df = sample_submission_df.copy()

# Assign the predictions to the 'prediction' column
submission_df['prediction'] = predictions

# Display the first few rows of the submission file
print("\nSubmission file head:")
print(submission_df.head())

# Save the submission file
submission_file_name = 'submission_transformer.csv' # Changed filename
submission_df.to_csv(submission_file_name, index=False)

print(f"\nSubmission file '{submission_file_name}' created successfully!")
print("You can now submit this file to the Kaggle competition.")


Generating predictions on the test data...
Predictions generated.

Submission file head:
   row_id  rule_violation  prediction
0    2029             0.5    0.014177
1    2030             0.5    0.323833
2    2031             0.5    0.971374
3    2032             0.5    0.971239
4    2033             0.5    0.969470

Submission file 'submission_transformer.csv' created successfully!
You can now submit this file to the Kaggle competition.
