In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
# Training corpus used by the rest of the notebook (the extended dataset).
# Alternative, currently disabled training file:
#   /kaggle/input/data-fr-bert/cleaned_train_data_2.csv
train_data_huge = pd.read_csv('/kaggle/input/duze-dane/extended_dataset_1.csv')

# Separate test file, scored only in the final evaluation cell.
test_data = pd.read_csv('/kaggle/input/data-fr-bert/test_data_1.csv')

In [3]:
# Sanity check: the label column should already be integer-typed.
print(train_data_huge.dtypes['label'])


int64


In [4]:
def load_data(data_file):
    """Extract the tweet texts and their labels as two parallel lists.

    Args:
        data_file: Either an already-loaded ``pandas.DataFrame`` or anything
            ``pd.read_csv`` accepts (a path or a file-like object). The data
            must provide a ``tweet`` column and a ``label`` column.

    Returns:
        tuple[list, list]: ``(texts, labels)`` with one entry per row, in
        row order.

    Raises:
        KeyError: If the ``tweet`` or ``label`` column is missing.
    """
    # Work on the argument rather than on a notebook-level global, so the
    # helper returns the data the caller actually asked for.
    if isinstance(data_file, pd.DataFrame):
        df = data_file
    else:
        df = pd.read_csv(data_file)
    texts = df['tweet'].tolist()
    labels = df['label'].tolist()
    return texts, labels

# Parallel lists of tweets and labels; these feed the train/validation split.
texts, labels = load_data(data_file=train_data_huge)

In [5]:
# Second call on the same frame, yielding a fresh pair of lists.
texts_1, labels_1 = load_data(data_file=train_data_huge)

In [6]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict holding flattened (1-D) ``input_ids`` and
    ``attention_mask`` tensors plus a ``label`` tensor. The tokenizer is asked
    to pad and truncate to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Keep references to the samples and the tokenization settings.

        Args:
            texts: Indexable collection of input texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as
                ``tokenizer(text, return_tensors='pt', max_length=...,
                padding='max_length', truncation=True)``.
            max_length: Length forwarded to the tokenizer.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is collapsed to 1-D so that default batching
        # stacks samples along a new leading dimension.
        item = {
            field: tokenized[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label)
        return item

In [7]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and build the classification head.

        Args:
            bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head consumes the encoder's pooled output, hence hidden_size inputs.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        return self.fc(self.dropout(pooled))

In [8]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss over the batches processed, or ``nan``
        when ``data_loader`` yields no batches.
    """
    model.train()
    # Built once per epoch instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # The scheduler advances per batch, so its total step count must be
        # (number of batches) * (number of epochs).
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else float('nan')

In [9]:
def evaluate(model, data_loader, device):
    """Score the model on every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batch dicts with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # Index of the largest logit per row is the predicted class.
            predicted += torch.argmax(logits, dim=1).cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [10]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return ``1`` or ``0``.

    Args:
        text: Input text handed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Pad/truncate length forwarded to the tokenizer.

    Returns:
        int: ``1`` when the highest-scoring class index is 1, otherwise ``0``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted_class = torch.argmax(logits, dim=1)
    # Any class index other than 1 is reported as 0.
    return int(predicted_class.item() == 1)


In [11]:
 # Hyperparameters shared by the cells below.
 # NOTE(review): every line in this cell starts with a stray space. The cell
 # evidently ran in the notebook, but the same lines in a plain .py export
 # would raise IndentationError — consider removing the leading space.
 bert_model_name = 'bert-base-uncased'  # checkpoint name given to BertTokenizer / BertModel.from_pretrained
 num_classes = 2  # number of logits produced by the classifier's final linear layer
 max_length = 128  # pad/truncate length the datasets forward to the tokenizer
 batch_size = 16  # batch size of both the training and the validation DataLoader
 num_epochs = 4  # passes over the training set; also sizes the LR schedule
 learning_rate = 2e-5  # learning rate handed to the optimizer

In [12]:
# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# A single tokenizer instance is shared by both splits.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training split, loaded with shuffling enabled.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation split, iterated without shuffling.
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier, then move its parameters to the chosen device.
model = BERTClassifier(bert_model_name=bert_model_name, num_classes=num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [15]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# by that library in favour of torch.optim.AdamW. eps and weight_decay are
# pinned to the deprecated class's defaults (1e-6 and 0.0), because PyTorch's
# own defaults (1e-8 and 0.01) would otherwise change the training setup.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# The scheduler is stepped once per batch, so the schedule has to span every
# batch of every epoch.
total_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [16]:
# Fine-tune for num_epochs passes, scoring the validation split after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    accuracy, report = evaluate(model=model, data_loader=val_dataloader, device=device)
    # Accuracy line first, then the per-class report on the following lines.
    print(f"Validation Accuracy: {accuracy:.4f}", report, sep="\n")

Epoch 1/4
Validation Accuracy: 0.9822
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     14058
           1       0.99      0.98      0.98     14978

    accuracy                           0.98     29036
   macro avg       0.98      0.98      0.98     29036
weighted avg       0.98      0.98      0.98     29036

Epoch 2/4
Validation Accuracy: 0.9872
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     14058
           1       0.99      0.98      0.99     14978

    accuracy                           0.99     29036
   macro avg       0.99      0.99      0.99     29036
weighted avg       0.99      0.99      0.99     29036

Epoch 3/4
Validation Accuracy: 0.9892
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14058
           1       0.99      0.99      0.99     14978

    accuracy                           0.99     29036
   macro avg  

In [17]:
# Persist only the learned parameters (the state dict), not the module object.
torch.save(obj=model.state_dict(), f="bert_classifier.pth")

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Classify the test tweets one at a time (one forward pass per tweet).
predictions = [
    predict_sentiment(tweet, model, tokenizer, device)
    for tweet in test_data['tweet']
]

# Compare the predictions against the labels shipped with the test file.
accuracy = accuracy_score(test_data['label'], predictions)
conf_matrix = confusion_matrix(test_data['label'], predictions)
class_report = classification_report(test_data['label'], predictions)

# NOTE(review): the recorded outputs show roughly 0.99 validation accuracy
# during training but roughly 0.71 on this test set. The cause is not visible
# here (possibly overlap between the train/validation rows, or a test set
# drawn from a different distribution) — TODO investigate.
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Accuracy: 0.7055555555555556
Confusion Matrix:
 [[86  4]
 [49 41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.96      0.76        90
           1       0.91      0.46      0.61        90

    accuracy                           0.71       180
   macro avg       0.77      0.71      0.69       180
weighted avg       0.77      0.71      0.69       180



In [19]:
# Display the test frame for a visual check of its tweets and labels.
test_data

Unnamed: 0,tweet,label
0,world health organization confirms vaccination...,1
1,covid spread mainly respiratory droplet cough ...,1
2,health expert recommend frequent handwashing p...,1
3,true covid vaccine tested large clinical trial...,1
4,proper ventilation help reduce spread covid in...,1
...,...,...
175,new covid variant discovered infect exposure s...,0
176,researcher find covid computer hard drive rema...,0
177,covid engineered target specific ethnic group ...,0
178,scientist discover covid outer space confirmin...,0
