In [None]:
!pip install transformers datasets torch scikit-learn

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
file_path = '/content/drive/MyDrive/large_image_descriptions.csv'
df = pd.read_csv(file_path)
print(df.head())

                                         description        label
0    Spilled drinks and food are seen at platform 1.  Cleanliness
1            Loose bolts and railings at platform 3.      Defects
2         Unsanitary conditions in the waiting area.  Cleanliness
3    Service interruptions caused by signal failure.      Service
4  The train is delayed due to mechanical breakdown.      Service


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [7]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
train_texts, val_texts, train_labels, val_labels = train_test_split(df['description'], df['label'], test_size=0.2, random_state=42)


In [8]:
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 800
Validation samples: 200


In [None]:
#tokenization
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_and_encode(texts, tokenizer, max_length=128):
  return tokenizer(
      texts.tolist(),
      max_length= max_length,
      padding='max_length',
      trucation=True,
      return_tensors='pt'
 )

train_encodings = tokenize_and_encode(train_texts, tokenizer)
val_encodings = tokenize_and_encode(val_texts, tokenizer)

In [14]:
import torch

class TextDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = torch.tensor(labels.tolist())
  def __getitem__(self,idx):
    item ={key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = self.labels[idx]
    return item
  def __len__(self):
    return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset (val_encodings, val_labels)

In [15]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

#training

training_args = TrainingArguments(
    output_dir ='./results',
    num_train_epochs =3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

#initializing trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
trainer.train()


  item ={key:torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss


Step,Training Loss


TrainOutput(global_step=300, training_loss=0.3311644490559896, metrics={'train_runtime': 3545.1094, 'train_samples_per_second': 0.677, 'train_steps_per_second': 0.085, 'total_flos': 157868050636800.0, 'train_loss': 0.3311644490559896, 'epoch': 3.0})

In [17]:
eval_results = trainer.evaluate()
print(eval_results)

  item ={key:torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.0013233438367024064, 'eval_runtime': 120.6013, 'eval_samples_per_second': 1.658, 'eval_steps_per_second': 0.207, 'epoch': 3.0}


In [18]:

model.save_pretrained('/content/drive/MyDrive/bert-text-classification-model')
tokenizer.save_pretrained('/content/drive/MyDrive/bert-text-classification-tokenizer')


('/content/drive/MyDrive/bert-text-classification-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/bert-text-classification-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/bert-text-classification-tokenizer/vocab.txt',
 '/content/drive/MyDrive/bert-text-classification-tokenizer/added_tokens.json')