In [1]:
import torch

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Laptop GPU'

In [3]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
    set_seed,
    LlamaTokenizer,
    LlamaForSequenceClassification,
)
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging

In [4]:
import torch

data = pd.read_json('subtaskA_train_monolingual.jsonl', lines = True)
df = data[['text','label']]


In [5]:
df = df.reset_index(drop=True)

In [6]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# tokenizer.padding_side = "left"
# tokenizer.pad_token = tokenizer.eos_token

In [7]:

huggingdata = Dataset.from_pandas(df)

In [8]:
huggingdata 

Dataset({
    features: ['text', 'label'],
    num_rows: 119757
})

In [9]:
# df['tokenized'] = tokenizer(df['text'], truncation = True, padding = True)

In [10]:
# X = df['text'].to_list()

In [11]:
# X_new = tokenizer(X,truncation=True, padding=True)

In [12]:
df_train, df_test = train_test_split(df,test_size=None, shuffle = True)

In [13]:
from datasets import Dataset

# from pandas
train_ds = Dataset.from_pandas(df_train)
test_ds = Dataset.from_pandas(df_test)

In [14]:
def preprocess_function(examples, max_length=512):
    return tokenizer(examples["text"], truncation=True, max_length=max_length, padding=True)

train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/89817 [00:00<?, ? examples/s]

Map:   0%|          | 0/29940 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_val = accuracy_score(labels, predictions)
    roc_auc_val = roc_auc_score(labels, predictions)
    f1_score_val_macro = f1_score(labels, predictions, average = "macro")
    f1_score_val_micro = f1_score(labels, predictions, average = "micro")
    
    return {
        "accuracy": accuracy_val,
        "roc_auc": roc_auc_val,
        "f1_score_val_macro" : f1_score_val_macro,
        "f1_score_val_micro" : f1_score_val_micro
    }

In [17]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=6,  # batch size per device during training
    per_device_eval_batch_size=6,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    # num_train_epochs=3,
    # weight_decay=0.01,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2, device_map={"":0})

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_tokenized_ds,         # training dataset
    eval_dataset=test_tokenized_ds,            # evaluation dataset
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc,F1 Score Val Macro,F1 Score Val Micro
1,0.0042,0.176343,0.96002,0.961854,0.959974,0.96002
2,0.1167,0.127453,0.974282,0.97502,0.974213,0.974282
3,0.0,0.226924,0.969673,0.97105,0.96962,0.969673


TrainOutput(global_step=44910, training_loss=0.08855270806633665, metrics={'train_runtime': 9351.7543, 'train_samples_per_second': 28.813, 'train_steps_per_second': 4.802, 'total_flos': 3.5693473035257856e+16, 'train_loss': 0.08855270806633665, 'epoch': 3.0})

In [18]:
test_tokenized_ds

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 29940
})

In [19]:
dataTest = pd.read_json('subtaskA_monolingual.jsonl', lines = True)
df2 = dataTest[['text']]
df2 = df2.reset_index(drop=True)
df_test_ds = Dataset.from_pandas(df2)
df_test_tokenized_ds = df_test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/34272 [00:00<?, ? examples/s]

In [20]:
pred_output = trainer.predict(df_test_tokenized_ds)
logits = pred_output.predictions

In [21]:
prob_pred = softmax(logits, axis=-1)
preds = np.argmax(logits, axis=-1)

In [22]:
metric1 = evaluate.load("accuracy")

In [23]:
GoldDataset = pd.read_json('subtaskA_monolingual_gs.jsonl', lines = True)

In [24]:
GoldDataset

Unnamed: 0,text,label,id
0,"Today, many adults or teenage drivers are hook...",0,0
1,"The automobile, since its advent, has revoluti...",1,1
2,One policy that could potentially improve aca...,1,2
3,Title: Navigating the Road Ahead: The Case for...,1,3
4,Have you ever woken up in the morning and wish...,0,4
...,...,...,...
34267,There are many advantages of limiting car usag...,0,34267
34268,When discussing the merits of the electoral co...,1,34268
34269,In favor of student-designed summer assignment...,1,34269
34270,"No, FACE is not created by aliens. as a person...",0,34270


In [25]:
GoldDataset_labels = GoldDataset['label'].to_list()

In [27]:
metric_classification = evaluate.load("bstrai/classification_report")
results = metric_classification.compute(predictions=preds, references=GoldDataset_labels)
print(results)

{'0': {'precision': 0.8815458937198067, 'recall': 0.5607177974434612, 'f1-score': 0.6854481256103974, 'support': 16272.0}, '1': {'precision': 0.7011955522113535, 'recall': 0.9318888888888889, 'f1-score': 0.8002480797671867, 'support': 18000.0}, 'accuracy': 0.7556605975723623, 'macro avg': {'precision': 0.7913707229655802, 'recall': 0.746303343166175, 'f1-score': 0.7428481026887921, 'support': 34272.0}, 'weighted avg': {'precision': 0.7868240757006612, 'recall': 0.7556605975723623, 'f1-score': 0.7457422191801397, 'support': 34272.0}}


In [28]:
f1_metric = evaluate.load("f1")

In [29]:
results_f1_micro = f1_metric.compute(predictions=preds, references=GoldDataset_labels, average = "micro")

In [30]:
print(results_f1_micro)

{'f1': 0.7556605975723623}


In [31]:
results_f1_macro = f1_metric.compute(predictions=preds, references=GoldDataset_labels, average = "macro")
print(results_f1_macro)

{'f1': 0.7428481026887921}


In [32]:
accuracy = metric1.compute(predictions=preds, references=GoldDataset_labels)
print(accuracy)

{'accuracy': 0.7556605975723623}


In [33]:
GoldDataset_labels

[0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,


In [77]:
json_result_df = GoldDataset['id']

In [78]:
json_result_df = json_result_df.to_frame()

In [79]:
# for i in preds:
#     json_result_df['label']= i 

In [80]:
json_result_df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
34267,34267
34268,34268
34269,34269
34270,34270


In [81]:
pred_df = pd.DataFrame({'label':preds})

In [82]:
pred_df

Unnamed: 0,label
0,0
1,1
2,1
3,1
4,1
...,...
34267,1
34268,1
34269,1
34270,0


In [85]:
json_result_df_final = pd.concat([json_result_df, pred_df], axis = 1)

In [86]:
json_result_df_final

Unnamed: 0,id,label
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
34267,34267,1
34268,34268,1
34269,34269,1
34270,34270,0


In [88]:
pwd

'C:\\Users\\siddh'

In [89]:
json_result_df_final.to_json('json_predictions_distilbert', orient='records', lines=True)