In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_metric
from sklearn.preprocessing import LabelEncoder

# Load data
labeled_data_path = '/kaggle/input/datathon-2024/checkworthiness_labeled.csv'
leaderboard_data_path = '/kaggle/input/datathon-2024/checkworthiness_leaderboard.csv'
labeled_data = pd.read_csv(labeled_data_path)
leaderboard_data = pd.read_csv(leaderboard_data_path)

# Preprocess and encode labels
def preprocess_text(text):
    return text.lower()  # Convert to lowercase

labeled_data['Text'] = labeled_data['Text'].apply(preprocess_text)
leaderboard_data['Text'] = leaderboard_data['Text'].apply(preprocess_text)

# Encode labels as integers
label_encoder = LabelEncoder()
labeled_data['labels'] = label_encoder.fit_transform(labeled_data['Category'])

# Load tokenizer and model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(label_encoder.classes_))

# Tokenization function that includes truncation and padding
def tokenize_function(examples):
    return tokenizer(examples['Text'], truncation=True, padding='max_length', max_length=128)

from sklearn.model_selection import train_test_split
from datasets import DatasetDict
from datasets import Dataset

# Convert to datasets and apply the tokenization
full_dataset = Dataset.from_pandas(labeled_data[['Text', 'labels']])
full_dataset = full_dataset.map(tokenize_function, batched=True)

# Splitting the dataset into training and evaluation datasets
train_test_split = full_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize the Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Tokenize the leaderboard dataset
leaderboard_dataset = Dataset.from_pandas(leaderboard_data)
leaderboard_dataset = leaderboard_dataset.map(tokenize_function, batched=True)

# Make predictions
predictions = trainer.predict(leaderboard_dataset)
preds = predictions.predictions.argmax(-1)

# Decode predictions
decoded_preds = label_encoder.inverse_transform(preds)

# Prepare the submission file
leaderboard_data['Category'] = decoded_preds
leaderboard_data.to_csv('submission.csv', index=False)
print("The model has been trained and the predictions are saved in 'submission.csv'.")


In [1]:
# Install required libraries
!pip install transformers datasets

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Load the dataset (Assume it's already uploaded to Kaggle input directory)
labeled_data = pd.read_csv('/kaggle/input/datathon-2024/checkworthiness_labeled.csv')
leaderboard_data = pd.read_csv('/kaggle/input/datathon-2024/checkworthiness_leaderboard.csv')

# Preprocessing and Tokenization setup
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples['Text'], truncation=True, padding='max_length', max_length=128)

# Encode labels
label_encoder = LabelEncoder()
labeled_data['labels'] = label_encoder.fit_transform(labeled_data['Category'])

# Dataset creation and split
full_dataset = Dataset.from_pandas(labeled_data[['Text', 'labels']])
full_dataset = full_dataset.map(tokenize_function, batched=True)
train_test_split = full_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Training
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=2
)

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_encoder.classes_))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)
trainer.train()


[0m

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.277,0.224114
2,0.1658,0.222762
3,0.0908,0.270117




NameError: name 'gc' is not defined

In [3]:

# Predicting and saving output
leaderboard_dataset = Dataset.from_pandas(leaderboard_data)
leaderboard_dataset = leaderboard_dataset.map(tokenize_function, batched=True)
predictions = trainer.predict(leaderboard_dataset)
preds = predictions.predictions.argmax(-1)
decoded_preds = label_encoder.inverse_transform(preds)
leaderboard_data['Category'] = decoded_preds
leaderboard_data.to_csv('submission.csv', index=False)


  0%|          | 0/2 [00:00<?, ?ba/s]