In [7]:
# Gerekli Kütüphaneler
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch import nn
from tqdm import tqdm

warnings.filterwarnings('ignore')

# GPU Kontrolü
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))  # GPU adlarını listeler
else:
    print("No GPUs found")

# Veriyi Yükleme
reviews = pd.read_csv("/kaggle/input/eticaret-urun-yorumlari/e-ticaret_urun_yorumlari.csv", on_bad_lines="skip", delimiter=";")
print(reviews.shape)
print(reviews.sample(5))

# Veriyi Temizleme
reviews.drop_duplicates(keep='first', inplace=True)
print(reviews.shape)
print(reviews['Durum'].value_counts(normalize=True, dropna=False))

# Özellikler ve Etiketler
X = reviews['Metin'].values
y = reviews['Durum'].values

# Tokenizer ve Modeli Yükleme
checkpoint_name = "balciberin/distilbert_turkish_sentiment_analysis2"  # DistilBERT modeli
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3, ignore_mismatched_sizes=True)

# Tokenizasyon
tokenized_inputs = tokenizer(X.tolist(),  
                             padding='max_length',  
                             truncation=True,  
                             max_length=65,  
                             return_tensors='pt')

input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

labels = torch.tensor(y, dtype=torch.long)

# Dataset ve DataLoader
dataset = TensorDataset(input_ids, attention_mask, labels)

split = 0.9
total_size = len(dataset)
train_size = int(total_size * split)
val_size = total_size - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)  # Validation için shuffle=False

# Optimizasyon ve Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

total_steps = num_epochs * len(train_dataloader)
warmup_percentage = 0.1
num_warmup_steps = int(total_steps * warmup_percentage)

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=num_warmup_steps, 
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

# Eğitim Fonksiyonu
def train_epoch(model, train_dataloader, optimizer, loss_fn, scheduler, device, progress_bar=None):
    model.train()
    total_loss = 0
    
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        if progress_bar:
            progress_bar.update(1)
    
    avg_loss = total_loss / len(train_dataloader)
    return avg_loss

# Değerlendirme Fonksiyonu
def eval_epoch(model, val_dataloader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += len(labels)

    avg_loss = total_loss / len(val_dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

# Eğitim ve Değerlendirme
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = 0
    with tqdm(total=len(train_dataloader), desc=f"Training Epoch {epoch + 1}") as pbar:
        train_loss = train_epoch(model, train_dataloader, optimizer, loss_fn, scheduler, device, pbar)

    val_loss, val_accuracy = eval_epoch(model, val_dataloader, loss_fn, device)

    print(f"Training loss: {train_loss:.4f}")
    print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_accuracy:.4f}")

# Modeli Kaydetme
model.save_pretrained("DistilBert_sentiment_Turkish_product_reviews")

# Modeli Yükleme
model_loaded = DistilBertForSequenceClassification.from_pretrained("DistilBert_sentiment_Turkish_product_reviews")
model_loaded.to(device)

# Yeni Tahminler
def make_new_prediction(raw_review, model):
    tokenized_review = tokenizer(
        raw_review,
        return_tensors='pt', 
        padding=True,
        truncation=True,
        max_length=65
    )
    
    input_ids = tokenized_review['input_ids'].to(device)
    attention_mask = tokenized_review['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    predicted_class = torch.argmax(logits, dim=1).item()

    class_labels = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}
    predicted_label = class_labels[predicted_class]

    print("Predicted class for the new review:", predicted_label)
    
    return predicted_label

# Örnek bir inceleme
new_review = "guzel bir urundu, begendim, fiyati da gayet makul."
make_new_prediction(new_review, model_loaded)


Tesla T4
Tesla T4
(15170, 2)
                                                   Metin  Durum
11002  çözemedik kötü geldi kullanışlı değil hiç ve s...      0
15088  2 kullanmada patladı içi yanmış kullanamadım bile      0
13491  O kadar kalitesiz ki kutuya bile koymaya gerek...      0
9412          teşekürler ürün bekledigim gibi geldi ????      1
6722   hem fiyatı hemde kalitesi süper oğlum bayıldı ...      1
(13569, 2)
Durum
0    0.477854
1    0.427003
2    0.095143
Name: proportion, dtype: float64


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at balciberin/distilbert_turkish_sentiment_analysis2 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 381/381 [01:01<00:00,  6.21it/s]


Training loss: 0.5847
Validation loss: 0.3325, Validation accuracy: 0.8757
Epoch 2/3


Training Epoch 2: 100%|██████████| 381/381 [01:04<00:00,  5.89it/s]


Training loss: 0.3135
Validation loss: 0.3120, Validation accuracy: 0.8847
Epoch 3/3


Training Epoch 3: 100%|██████████| 381/381 [01:04<00:00,  5.89it/s]


Training loss: 0.2488
Validation loss: 0.2771, Validation accuracy: 0.8996
Predicted class for the new review: Positive


'Positive'

In [8]:
new_review = "turkcellin bu hizmeti ne abi ya baya kötü"
make_new_prediction(new_review, model_loaded)

Predicted class for the new review: Negative


'Negative'