In [1]:
import os
import pandas as pd
import torch
import random
import numpy as np
import torch.nn as nn
import math
import argparse
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tqdm import tqdm
scores = []

In [26]:
class Create_Dataset(Dataset):
    def __init__(self, data, tokenizer, max_token_len: int = 256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.text = ""

    def __len__(self):
        return len(self.data)

    def getText(self):
        return self.text

    def __getitem__(self, index):
        item = self.data['encoded_label'].values
        title = self.data["text"].values
        text = str(title[index])
        self.text = text
        label = torch.tensor(item[index])
        encoding = self.tokenizer.encode_plus(text,
                                              add_special_tokens=True,
                                              return_tensors='pt',
                                              truncation=True,
                                              max_length=self.max_token_len,
                                              padding='max_length',
                                              return_attention_mask=True
                                              )
        return {'input_ids': encoding.input_ids.flatten(), 'attention_mask': encoding.attention_mask.flatten()}, label



In [3]:

class Training:
    def __init__(self, model_name, data_path, epoch, batch, max_len, lr,
                 weight_decay, warmup, seed, device,save_directory):
        self.model_name = model_name
        self.data_path = data_path
        self.epoch = epoch
        self.batch = batch
        self.max_len = max_len
        self.lr = lr
        self.weight_decay = weight_decay
        self.warmup = warmup
        self.seed = seed
        self.device = device
        self.save_directory = save_directory
        
        self.split_size=0.10
        self.labels=None

        self.set_seed()
        

    def set_seed(self):
        torch.manual_seed(self.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        random.seed(self.seed)
        np.random.seed(self.seed)

    def data_preparation(self,data_path,batch,test_size):
        df=pd.read_csv(data_path)
        labels=df["label"].unique()
        self.labels=labels
        self.label2id, self.id2label=create_label2id_id2label(labels)
        df['encoded_label'] = df['label'].apply(lambda x: self.label2id[x])
        print("Class create success")
        print(df.encoded_label.value_counts())
        if any(df.encoded_label.value_counts() <= 2):
            traindf, valdf = train_test_split(df, test_size=test_size, random_state=42)
        else:
            traindf, valdf = train_test_split(df, test_size=test_size, random_state=42,
                                              stratify=df.encoded_label)
        
        train_dataset = Create_Dataset(traindf, self.tokenizer)
        val_dataset = Create_Dataset(valdf, self.tokenizer)
    
        train_dataloader = DataLoader(train_dataset, batch_size=batch, num_workers=4, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch, num_workers=4, shuffle=False)
        print(f"train:{traindf.shape} val:{valdf.shape} ")

        return train_dataloader,val_dataloader
    

    def root_training(self):
        self.tokenizer=AutoTokenizer.from_pretrained(self.model_name)
        train_dataloader,val_dataloader=self.data_preparation(self.data_path,self.batch,self.split_size)
        n_labels = len(self.labels)
        model = BertForSequenceClassification.from_pretrained(self.model_name, num_labels=n_labels, id2label=self.id2label,
                                                              label2id=self.label2id)
        device = self.device
        num_epochs = self.epoch
        weight_decay = self.weight_decay
        warmup = self.warmup
        lr = self.lr
        best_val_loss = float('inf')
        patience = 2
        model.to(device)
        total_steps = len(train_dataloader) * num_epochs
        warmup_steps = math.floor(total_steps * warmup)
        warmup_steps = max(1, warmup_steps)

        print(device)
        # Model ve kayıp fonksiyonu
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=warmup_steps, T_mult=1)

        running_loss = 0.0
        model_name = "model"
        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            num_batches = len(train_dataloader)
            with tqdm(total=num_batches, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as pbar:
                for batch_idx, batch in enumerate(train_dataloader):
                    inputs, labels = batch
                    input_ids = inputs["input_ids"].to(device)
                    attention_mask = inputs["attention_mask"].to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad()
                    outputs = model(input_ids, attention_mask, labels=labels)
                    loss = criterion(outputs.logits, labels)
                    loss.backward()
                    optimizer.step()
                    scheduler.step(epoch + batch_idx / num_batches)
                    running_loss += loss.item()
                    # Ayrıntılı bilgi gösterme
                    pbar.set_postfix({"loss": loss.item(), "running_loss": running_loss / (batch_idx + 1)})
                    pbar.update(1)  # İlerleme çubuğunu güncelleme
            # Doğrulama
            model.eval()
            val_loss = 0.0
            for batch_idx, batch in enumerate(val_dataloader):
                inputs, labels = batch
                input_ids = inputs["input_ids"].to(device)
                attention_mask = inputs["attention_mask"].to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = model(input_ids, attention_mask, labels=labels)
                    loss = criterion(outputs.logits, labels)
                val_loss += loss.item()
                # Early Stopping kontrolü
            val_loss /= len(val_dataloader)
            name = "Root_Model.pth"
            model_dir = "models"
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            model_path = os.path.join(model_dir, "best_model.pth")
            # En iyi modeli kaydetme
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                model_save_path = os.path.join(self.save_directory, "best_model")
                model.save_pretrained(model_save_path)
                self.tokenizer.save_pretrained(model_save_path)
            print(f"Epoch [{epoch + 1}/{self.epoch}], Train Loss: {running_loss / num_batches:.4f}, Validation Loss: {val_loss:.4f}")



In [23]:
def data_preparation(data_path,batch,test_size,tokenizer):
    df=pd.read_csv(data_path)
    labels=df["label"].unique()
    label2id, id2label=create_label2id_id2label(labels)
    df['encoded_label'] = df['label'].apply(lambda x: label2id[x])
    print("Class create success")
    print(df.encoded_label.value_counts())
    
    if any(df.encoded_label.value_counts() <= 2):
        traindf, valdf = train_test_split(df, test_size=test_size, random_state=42)

    else:
        traindf, valdf = train_test_split(df, test_size=test_size, random_state=42,
                                          stratify=df.encoded_label)
    tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
    train_dataset = Create_Dataset(traindf, tokenizer)
    val_dataset = Create_Dataset(valdf, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch, num_workers=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch, num_workers=4, shuffle=False)
    print(f"train:{traindf.shape} val:{valdf.shape} ")
    
    return train_dataloader,val_dataloader


    
    

In [8]:
def create_label2id_id2label(labels):
    # Etiketleri alfabetik olarak sırala
    sorted_labels = sorted(labels)
    
    # label2id ve id2label sözlüklerini oluştur
    label2id = {label: idx for idx, label in enumerate(sorted_labels)}
    id2label = {idx: label for idx, label in enumerate(sorted_labels)}
    
    return label2id, id2label

In [9]:
model_name="dbmdz/bert-base-turkish-cased"
data_path="/kaggle/input/product-sentiment/df_all.csv"
epoch=5
batch=32
max_len=512
lr=3e-5
weight_decay=3e-4
warmup=0.2
seed=42
device="cuda"
save_directory="/kaggle/working/"
test_size=0.10


In [None]:
training = Training(
    model_name,
    data_path,
    epoch,
    batch,
    max_len,
    lr,
    weight_decay,
    warmup,
    seed,
    device,
    save_directory
)


In [None]:
training.root_training()

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Class create success
encoded_label
2    235949
1    153825
0     50905
Name: count, dtype: int64
train:(396611, 5) val:(44068, 5) 


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


Epoch 1/5: 100%|██████████| 12395/12395 [2:37:14<00:00,  1.31batch/s, loss=0.00547, running_loss=0.0962]


Epoch [1/5], Train Loss: 0.0962, Validation Loss: 0.0801


Epoch 2/5: 100%|██████████| 12395/12395 [2:37:23<00:00,  1.31batch/s, loss=0.0171, running_loss=0.0652]   


Epoch [2/5], Train Loss: 0.0652, Validation Loss: 0.0753


Epoch 3/5:  41%|████      | 5101/12395 [1:04:45<1:32:56,  1.31batch/s, loss=0.0536, running_loss=0.0448] 

In [12]:
train_dataloader,val_dataloader=data_preparation(data_path,batch,test_size,tokenizer)


Class create success
encoded_label
2    235949
1    153825
0     50905
Name: count, dtype: int64


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

NameError: name 'Create_Dataset' is not defined

In [2]:
from transformers import AutoModelForSequenceClassification,pipeline

2024-08-08 07:08:01.264264: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 07:08:01.264407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 07:08:01.402927: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Hugging Face token ile giriş yapın
login("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")


# Model ve tokenizer yolunu belirtin
model_path = "/kaggle/working/best_model"

# Modeli ve tokenizer'ı yükleyin
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Model hub'a push'lamak için repository ismi belirleyin
repo_name = "moarslan/bert-base-turkish-sentiment-analysis"

# Modeli ve tokenizer'ı push'layın
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/moarslan/bert-base-turkish-sentiment-analysis/commit/e71fba7bf502c319c04bc044e4484ea275579cf6', commit_message='Upload tokenizer', commit_description='', oid='e71fba7bf502c319c04bc044e4484ea275579cf6', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/best_model")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/best_model")

In [17]:
df=pd.read_csv("/kaggle/input/product-sentiment/df_all.csv")
labels=sorted(df["label"].unique())
print(labels)

['Negative', 'Notr', 'Positive']


In [30]:
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
    def eval_root():
        loaded_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/best_model")
        tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/best_model")
        df=pd.read_csv("/kaggle/input/product-sentiment/df_all.csv")
        labels=sorted(df["label"].unique())
        train_dataloader,val_dataloader=data_preparation("/kaggle/input/product-sentiment/df_all.csv",32,0.1,tokenizer)
        print(labels)
        device = "cuda"


        n_labels = len(labels)
        test_dataloader = val_dataloader
        
        true_labels = []
        predicted_labels = []
        loaded_model.to(device)
        loaded_model.eval()
        with tqdm(total=len(test_dataloader), unit="batch") as pbar:
            for batch in test_dataloader:
                inputs, labels = batch
                input_ids = inputs["input_ids"].to(device)
                attention_mask = inputs["attention_mask"].to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = loaded_model(input_ids, attention_mask, labels=labels)
                    logits = outputs.logits
                    batch_predictions = torch.argmax(logits, axis=1)
                    true_labels.extend(labels.tolist())
                    predicted_labels.extend(batch_predictions.tolist())
                pbar.update(1)
        f1 = f1_score(true_labels, predicted_labels, average='macro')
        print("F1 Skoru:", f1)

        recall = recall_score(true_labels, predicted_labels, average='macro')
        print("Recall Skoru:", recall)

        precision = precision_score(true_labels, predicted_labels, average='macro')
        print("Precision Skoru:", precision)

        accuracy = accuracy_score(true_labels, predicted_labels)
        print("Accuracy:", accuracy)
        score = {"f1": f1, "recall": recall, "precision": precision, "accuracy": accuracy}
        return score


In [None]:
sentiment_analysis_pipeline = pipeline("sentiment-analysis", model=models, tokenizer=tokenizers)


In [55]:
text= "Bu kitapçık temel bilgiler içeriyor."

In [56]:
sentiment_analysis_pipeline(text)

[{'label': 'Notr', 'score': 0.997557520866394}]

In [31]:
eval_root()

Class create success
encoded_label
2    235949
1    153825
0     50905
Name: count, dtype: int64
train:(396611, 5) val:(44068, 5) 
['Negative', 'Notr', 'Positive']


  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 1378/1378 [05:31<00:00,  4.16batch/s]


F1 Skoru: 0.9548370318776405
Recall Skoru: 0.9521375413838135
Precision Skoru: 0.9576279141052209
Accuracy: 0.9739947354089135


{'f1': 0.9548370318776405,
 'recall': 0.9521375413838135,
 'precision': 0.9576279141052209,
 'accuracy': 0.9739947354089135}