In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import backend as K
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import train_test_split

import os
import sys
from tqdm import tqdm

In [2]:
def create_submission(predicted, path = "submission.csv"):
    folder_loc = '/'.join([i for i in path.split("/")][:-1])
    if not os.path.exists(folder_loc) and folder_loc != '':
        os.makedirs(folder_loc)
    df = pd.read_excel("Data/Submission_Format.xlsx")
    df["label"] = predicted
    df.to_csv(path, index=False)

In [3]:
# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [4]:
# input Data
df_handling = pd.read_csv("../Temp/Handling/nodrop_v2_translated_lemarized_stopwords.csv").iloc[:,1:]
df_nonhandling = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

# Tensorflow Bi-LSTM

In [5]:
# Was run on
print(f'PY version   : {sys.version}\nHardware     : {tf.config.list_physical_devices()[-1]}')

PY version   : 3.9.18 (main, Sep 11 2023, 14:09:26) [MSC v.1916 64 bit (AMD64)]
Hardware     : PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Handling

In [15]:
label_encoder = LabelEncoder()
df_handling['label'] = label_encoder.fit_transform(df_handling['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [17]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_handling["text"], df_handling["label"], test_size=0.2, random_state=42)

In [18]:
#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

In [19]:
def get_ds(padsequences, labels):
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    ds = ds.cache()
    ds = ds.batch(32)
    return ds.prefetch(tf.data.AUTOTUNE)

train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [20]:
def get_model():
    model = tf.keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),
        
        keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
        
        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),
        
        keras.layers.Dense(8, activation='softmax')
    ])
    
    model.compile(optimizer = tf.keras.optimizers.Adam(0.001), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])
    return model

In [22]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14ff1194760>

In [23]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.9383464754103183

## Train Test split then smote

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

In [10]:
df = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [11]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

In [19]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(training_pad_sequences, train_labels)

In [16]:
def get_ds(padsequences, labels):
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    ds = ds.cache()
    ds = ds.batch(32)
    return ds.prefetch(tf.data.AUTOTUNE)

def get_model():
    model = tf.keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        keras.layers.BatchNormalization(),
        
        keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
        
        keras.layers.GlobalMaxPool1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.4),
        
        keras.layers.Dense(8, activation='softmax')
    ])
    
    model.compile(optimizer = tf.keras.optimizers.Adam(0.001), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])
    return model

In [None]:
train_ds=get_ds(X_resampled, y_resampled)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [22]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21045bc46a0>

In [None]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)

## Non Handling

### Translated

In [17]:
df = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [18]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_nonhandling["text"], df_nonhandling["label"], test_size=0.2, random_state=42)

In [19]:
training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [20]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x218c5891280>

In [21]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.47518928361282176

### Non Translated

In [24]:
df = pd.read_csv("../Temp/cleaned_datav2_lemarized_stopwords.csv").iloc[:,1:]

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [25]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_nonhandling["text"], df_nonhandling["label"], test_size=0.2, random_state=42)

In [26]:
training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

train_ds=get_ds(training_pad_sequences, train_labels)
val_ds=get_ds(validation_pad_sequences, val_labels)

In [27]:
model=get_model()
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x218ba0ad280>

In [28]:
predicted_class_labels = model.predict(validation_pad_sequences)
predicted_class_labels = np.argmax(predicted_class_labels, axis=1)
balanced_accuracy_score(val_labels, predicted_class_labels)



0.4528778951353417

# Catboost [Not an option]

In [5]:
import catboost as cb

## Handling

In [9]:
label_encoder = LabelEncoder()
df_handling['label'] = label_encoder.fit_transform(df_handling['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [28]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_handling["text"], df_handling["label"], test_size=0.2, random_state=42)

#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

train_dataset = cb.Pool(training_pad_sequences, train_labels) 
test_dataset = cb.Pool(validation_pad_sequences, val_labels)

catboost = cb.CatBoostClassifier(iterations=20000)

catboost.fit(train_dataset, eval_set=(test_dataset))


Learning rate set to 0.047523
0:	learn: 2.0437493	test: 2.0465837	best: 2.0465837 (0)	total: 97.9ms	remaining: 16m 18s
1:	learn: 2.0089308	test: 2.0136039	best: 2.0136039 (1)	total: 174ms	remaining: 14m 30s
2:	learn: 1.9776443	test: 1.9832424	best: 1.9832424 (2)	total: 254ms	remaining: 14m 7s
3:	learn: 1.9488999	test: 1.9556746	best: 1.9556746 (3)	total: 337ms	remaining: 14m 2s
4:	learn: 1.9175939	test: 1.9254749	best: 1.9254749 (4)	total: 421ms	remaining: 14m 2s
5:	learn: 1.8929647	test: 1.9022251	best: 1.9022251 (5)	total: 498ms	remaining: 13m 49s
6:	learn: 1.8677464	test: 1.8777962	best: 1.8777962 (6)	total: 587ms	remaining: 13m 58s
7:	learn: 1.8446114	test: 1.8552403	best: 1.8552403 (7)	total: 679ms	remaining: 14m 8s
8:	learn: 1.8230372	test: 1.8344736	best: 1.8344736 (8)	total: 771ms	remaining: 14m 16s
9:	learn: 1.8006289	test: 1.8125929	best: 1.8125929 (9)	total: 850ms	remaining: 14m 9s
10:	learn: 1.7814575	test: 1.7939192	best: 1.7939192 (10)	total: 925ms	remaining: 13m 59s
11:	

<catboost.core.CatBoostClassifier at 0x113ccca3090>

In [29]:
Train_Predicted = catboost.predict(training_pad_sequences)
Val_Predicted = catboost.predict(validation_pad_sequences)
balanced_accuracy_score(val_labels, Val_Predicted)

## Non Handling

In [None]:
label_encoder = LabelEncoder()
df_nonhandling['label'] = label_encoder.fit_transform(df_nonhandling['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

In [32]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df_nonhandling["text"], df_nonhandling["label"], test_size=0.2, random_state=42)

#the token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def get_paddedsequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_pad_sequences=get_paddedsequences(train_sentences)
validation_pad_sequences=get_paddedsequences(val_sentences)

train_dataset = cb.Pool(training_pad_sequences, train_labels) 
test_dataset = cb.Pool(validation_pad_sequences, val_labels)

catboost = cb.CatBoostClassifier(iterations=20000)

catboost.fit(train_dataset, eval_set=(test_dataset))


Learning rate set to 0.035347
0:	learn: 2.0074050	test: 2.0052243	best: 2.0052243 (0)	total: 53.6ms	remaining: 17m 51s
1:	learn: 1.9470791	test: 1.9436295	best: 1.9436295 (1)	total: 93.8ms	remaining: 15m 37s
2:	learn: 1.8959495	test: 1.8906992	best: 1.8906992 (2)	total: 134ms	remaining: 14m 51s
3:	learn: 1.8511321	test: 1.8444023	best: 1.8444023 (3)	total: 163ms	remaining: 13m 35s
4:	learn: 1.8113111	test: 1.8030895	best: 1.8030895 (4)	total: 198ms	remaining: 13m 13s
5:	learn: 1.7763056	test: 1.7668854	best: 1.7668854 (5)	total: 235ms	remaining: 13m 4s
6:	learn: 1.7438004	test: 1.7341297	best: 1.7341297 (6)	total: 287ms	remaining: 13m 40s
7:	learn: 1.7146536	test: 1.7039238	best: 1.7039238 (7)	total: 332ms	remaining: 13m 49s
8:	learn: 1.6886856	test: 1.6769786	best: 1.6769786 (8)	total: 376ms	remaining: 13m 56s
9:	learn: 1.6650796	test: 1.6527500	best: 1.6527500 (9)	total: 412ms	remaining: 13m 44s
10:	learn: 1.6439595	test: 1.6308022	best: 1.6308022 (10)	total: 445ms	remaining: 13m 28s

<catboost.core.CatBoostClassifier at 0x113a4d2de50>

In [33]:
Train_Predicted = catboost.predict(training_pad_sequences)
Val_Predicted = catboost.predict(validation_pad_sequences)
balanced_accuracy_score(val_labels, Val_Predicted)

0.3519828087983589

Conclusion: Catboost offers lower accuracy score than BI-LSTM methods

# Indobert

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

## Non Handling

In [15]:
df = pd.read_csv("../Temp/cleaned_datav2_translated_lemarized_stopwords.csv")

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Print label mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

{'Demografi': 0, 'Ekonomi': 1, 'Geografi': 2, 'Ideologi': 3, 'Pertahanan dan Keamanan': 4, 'Politik': 5, 'Sosial Budaya': 6, 'Sumber Daya Alam': 7}


In [7]:
# Load IndoBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=len(label_mapping))

# Check if GPU is available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move model to the same device
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [49]:
# Create a dataset class with correct label type
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
# Tokenize the input texts
train_inputs = tokenizer(train_sentences.to_list(), padding=True, truncation=True, max_length=512, return_tensors='pt')
train_labels_ten = torch.tensor(train_labels.values)

val_inputs = tokenizer(val_sentences.to_list(), padding=True, truncation=True, max_length=512, return_tensors='pt')
val_labels_ten = torch.tensor(val_labels.values)

train_dataset = SimpleDataset(train_inputs, train_labels_ten)
val_dataset = SimpleDataset(val_inputs, val_labels_ten)

In [48]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size for training
    per_device_eval_batch_size=2,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    evaluation_strategy="epoch",     # Evaluation strategy to run validation at each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
)

# Train the model
trainer.train()

  0%|          | 0/9980 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.134, 'grad_norm': 290.0127258300781, 'learning_rate': 5e-05, 'epoch': 0.25}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4359, 'grad_norm': 17.255048751831055, 'learning_rate': 4.7362869198312235e-05, 'epoch': 0.5}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4706, 'grad_norm': 14.588626861572266, 'learning_rate': 4.4725738396624474e-05, 'epoch': 0.75}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.5628, 'grad_norm': 14.014142036437988, 'learning_rate': 4.208860759493671e-05, 'epoch': 1.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4528, 'grad_norm': 12.215002059936523, 'learning_rate': 3.945147679324895e-05, 'epoch': 1.25}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.3297, 'grad_norm': 17.00127410888672, 'learning_rate': 3.6814345991561184e-05, 'epoch': 1.5}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.5392, 'grad_norm': 3.1900532245635986, 'learning_rate': 3.4177215189873416e-05, 'epoch': 1.75}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4667, 'grad_norm': 14.986444473266602, 'learning_rate': 3.1540084388185655e-05, 'epoch': 2.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.5591, 'grad_norm': 1.548494815826416, 'learning_rate': 2.8902953586497894e-05, 'epoch': 2.25}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4518, 'grad_norm': 26.494274139404297, 'learning_rate': 2.626582278481013e-05, 'epoch': 2.51}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.5292, 'grad_norm': 12.50636100769043, 'learning_rate': 2.3628691983122365e-05, 'epoch': 2.76}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.5749, 'grad_norm': 15.781134605407715, 'learning_rate': 2.09915611814346e-05, 'epoch': 3.01}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'loss': 1.4519, 'grad_norm': 19.882415771484375, 'learning_rate': 1.8354430379746836e-05, 'epoch': 3.26}


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })

In [None]:
# Tokenize input text
text = val_sentences.to_list()
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Move input tensors to the same device as the model
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Perform inference
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)

# Get predictions
predictions = torch.argmax(outputs.logits, dim=1)

predicted_label = label_encoder.inverse_transform(predictions.cpu())[0]
predicted_label

In [46]:
pred_list = []
for text in tqdm(val_sentences.to_list()):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    # Perform inference
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predictions
    predictions = torch.argmax(outputs.logits, dim=1)
    pred_list.append(predictions.cpu().numpy()[0])

100%|██████████| 999/999 [00:15<00:00, 63.47it/s]


In [47]:
balanced_accuracy_score(val_labels, pred_list)

0.40903402471393574