In [3]:
!pip install -q transformers datasets accelerate evaluate scikit-learn

from google.colab import drive
drive.mount('/content/drive')

import os, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import torch, evaluate, numpy as np
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from datasets import Dataset, DatasetDict


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# --- Data ---------------------------------------------------------------
# Path of the CSV (the category column must be in this file).
CSV_PATH = "/content/drive/MyDrive/yorumlar/isim_yorum_temiz_etiket.csv"

# Which column holds the text? Candidates are tried in this order
# (the cleaned comment if it exists, otherwise the raw comment).
TEXT_COL_CANDIDATES = [
    "temiz yorum",
    "temiz_yorum",
    "temizyorum",
    "yorum",
]
# Label column: the final category written in the previous step.
LABEL_COL = "kategori"

# --- Model and training hyper-parameters --------------------------------
MODEL_NAME = "dbmdz/bert-base-turkish-uncased"  # BERTurk
MAX_LEN, BATCH_SIZE, EPOCHS = 160, 16, 20
LR = 2e-5
SEED = 42

# --- Output -------------------------------------------------------------
OUTPUT_DIR = "/content/drive/MyDrive/yorumlar/berturk_kategori_modeli"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Reproducibility: seed NumPy and PyTorch (CPU and CUDA) -------------
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)


In [5]:
# Read the labelled comments from the configured CSV.
df = pd.read_csv(CSV_PATH, encoding="utf-8-sig")

# Text-column detection: map each normalised header (lower-cased, stripped)
# back to the header exactly as it appears in the file.
lower_map = dict(zip((c.lower().strip() for c in df.columns), df.columns))
def pick_text_col(columns=None, candidates=None):
    """Return the original name of the first candidate text column that exists.

    Args:
        columns: Optional iterable of column names to search. When omitted,
            the module-level ``lower_map`` (normalised name -> original name)
            is used, which is the behaviour of a bare ``pick_text_col()`` call.
        candidates: Optional iterable of normalised (lower-case, stripped)
            column names, tried in order. Defaults to ``TEXT_COL_CANDIDATES``.

    Returns:
        The matching column name, spelled exactly as in the data.

    Raises:
        ValueError: If none of the candidates is present.
    """
    if columns is None:
        normalised = lower_map
    else:
        normalised = {str(col).lower().strip(): col for col in columns}
    if candidates is None:
        candidates = TEXT_COL_CANDIDATES

    # First candidate wins, so the order of ``candidates`` is the priority order.
    for name in candidates:
        if name in normalised:
            return normalised[name]
    raise ValueError("Metin kolonu bulunamadı. CSV'de 'temiz yorum' ya da 'yorum' yok.")

TEXT_COL = pick_text_col()

# Row cleanup: keep only the text and label columns, drop missing values,
# trim surrounding whitespace, then discard rows left with an empty field.
df = df[[TEXT_COL, LABEL_COL]].dropna()
for column in (TEXT_COL, LABEL_COL):
    df[column] = df[column].astype(str).str.strip()
has_text = df[TEXT_COL] != ""
has_label = df[LABEL_COL] != ""
df = df[has_text & has_label].reset_index(drop=True)

print("Örnek sayısı:", len(df))
print("Sınıf dağılımı:", Counter(df[LABEL_COL]))


Örnek sayısı: 514
Sınıf dağılımı: Counter({'Teşekkür': 213, 'Yol/Kaldırım': 111, 'Diğer': 55, 'Aydınlatma': 28, 'Çöp/Temizlik': 28, 'İnternet/Elektrik': 26, 'Su/Altyapı': 16, 'Park/Oyun Alanı': 13, 'Hayvan/Sokak Hayvanları': 11, 'Toplu Taşıma': 10, 'İlaçlama/Haşere': 3})


In [6]:
# Stratified split: hold out 20% first, then halve it into validation and test.
# NOTE(review): the second stratified split needs at least two rows per class
# in temp_df, and very rare classes may be missing from validation/test
# altogether -- confirm this is acceptable for the reported metrics.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df[LABEL_COL]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=SEED, stratify=temp_df[LABEL_COL]
)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

# Label encoding: fit on the training split, then add an integer "label"
# column to every split. ``assign`` builds new frames instead of writing into
# the objects returned by train_test_split.
le = LabelEncoder()
le.fit(train_df[LABEL_COL])
train_df = train_df.assign(label=le.transform(train_df[LABEL_COL]))
val_df = val_df.assign(label=le.transform(val_df[LABEL_COL]))
test_df = test_df.assign(label=le.transform(test_df[LABEL_COL]))

id2label = {i: c for i, c in enumerate(le.classes_)}
label2id = {c: i for i, c in id2label.items()}
num_labels = len(id2label)
print("Sınıflar:", id2label)

# Convert to Hugging Face Datasets. preserve_index=False stops the shuffled
# pandas index from being stored as an extra "__index_level_0__" column.
train_ds = Dataset.from_pandas(train_df[[TEXT_COL, "label"]], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[[TEXT_COL, "label"]], preserve_index=False)
test_ds = Dataset.from_pandas(test_df[[TEXT_COL, "label"]], preserve_index=False)
ds = DatasetDict(train=train_ds, validation=val_ds, test=test_ds)


Train: 411 Val: 51 Test: 52
Sınıflar: {0: 'Aydınlatma', 1: 'Diğer', 2: 'Hayvan/Sokak Hayvanları', 3: 'Park/Oyun Alanı', 4: 'Su/Altyapı', 5: 'Teşekkür', 6: 'Toplu Taşıma', 7: 'Yol/Kaldırım', 8: 'Çöp/Temizlik', 9: 'İlaçlama/Haşere', 10: 'İnternet/Elektrik'}


In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(batch):
    """Tokenize the text column of a batch, truncating to at most MAX_LEN tokens."""
    texts = batch[TEXT_COL]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)


# Tokenize every split in batches and drop the raw text column afterwards.
# No padding is requested here; the collator below is what the Trainer gets
# for assembling batches.
encoded = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=[TEXT_COL],
)
collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/411 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [8]:
# Per-class weight for class i: N / (K * count_i), counted on the training split,
# so that less frequent classes receive a larger weight.
N, K = len(train_df), num_labels
train_counts = Counter(train_df["label"])
class_weight_values = [N / (K * train_counts[class_id]) for class_id in range(K)]
weights = torch.tensor(class_weight_values, dtype=torch.float)

from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the module-level ``weights`` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass on.
            inputs: Mapping of model inputs for the batch; must contain "labels".
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with the Trainer
                call signature; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model is not asked to compute its
        # own built-in loss, which would be discarded anyway. ``inputs`` itself
        # is left unmodified for the caller.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Take the device from the logits: unlike ``model.device`` this also
        # works when ``model`` is a wrapper without a ``device`` attribute.
        loss_fct = CrossEntropyLoss(weight=weights.to(logits.device))
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [9]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# our label set, and store the label names in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Metric objects used by compute_metrics.
metric_acc, metric_f1 = evaluate.load("accuracy"), evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted/macro F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over the last axis of the logits.

    Returns:
        Dict with the keys "accuracy", "f1_weighted" and "f1_macro".
    """
    scores, references = eval_pred
    predictions = np.argmax(scores, axis=-1)

    results = {
        "accuracy": metric_acc.compute(
            predictions=predictions, references=references
        )["accuracy"]
    }
    for average in ("weighted", "macro"):
        f1_result = metric_f1.compute(
            predictions=predictions, references=references, average=average
        )
        results[f"f1_{average}"] = f1_result["f1"]
    return results


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
args = TrainingArguments(
    # Where checkpoints go, plus run-level settings.
    output_dir=OUTPUT_DIR,
    seed=SEED,
    report_to="none",
    logging_steps=50,
    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints and
    # reload the best one (highest weighted F1) when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
)

# Early stopping is not enabled. To turn it on, import EarlyStoppingCallback
# from transformers and pass
# callbacks=[EarlyStoppingCallback(early_stopping_patience=2)].
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and avoids the deprecation warning.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,2.28599,0.27451,0.224361,0.215758
2,2.254600,1.58421,0.666667,0.631181,0.755729
3,2.254600,0.923334,0.901961,0.902952,0.925472
4,1.183800,0.455811,0.960784,0.961631,0.976366
5,1.183800,0.189095,1.0,1.0,1.0
6,0.356700,0.072204,1.0,1.0,1.0
7,0.356700,0.035636,1.0,1.0,1.0
8,0.079900,0.021759,1.0,1.0,1.0
9,0.079900,0.016043,1.0,1.0,1.0
10,0.031900,0.013138,1.0,1.0,1.0


TrainOutput(global_step=520, training_loss=0.3833321354137017, metrics={'train_runtime': 434.0138, 'train_samples_per_second': 18.939, 'train_steps_per_second': 1.198, 'total_flos': 118963045441962.0, 'train_loss': 0.3833321354137017, 'epoch': 20.0})

In [11]:
# Final evaluation on the held-out test split.
test_metrics = trainer.evaluate(encoded["test"])
print("TEST:", test_metrics)

# Store the label mapping next to the model.
import json
mapping_path = os.path.join(OUTPUT_DIR, "label_mapping.json")
label_maps = {"id2label": id2label, "label2id": label2id}
with open(mapping_path, "w", encoding="utf-8") as out_file:
    json.dump(label_maps, out_file, ensure_ascii=False, indent=2)

# Save the model and the tokenizer into the same directory.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Model kaydedildi ->", OUTPUT_DIR)


TEST: {'eval_loss': 0.20986731350421906, 'eval_accuracy': 0.9615384615384616, 'eval_f1_weighted': 0.9622252747252747, 'eval_f1_macro': 0.9869047619047618, 'eval_runtime': 0.1454, 'eval_samples_per_second': 357.701, 'eval_steps_per_second': 27.515, 'epoch': 20.0}
Model kaydedildi -> /content/drive/MyDrive/yorumlar/berturk_kategori_modeli


In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def load_pipeline(model_dir=OUTPUT_DIR):
    """Load the tokenizer and the classifier stored in ``model_dir``.

    Returns:
        A ``(tokenizer, model)`` tuple; ``eval()`` has been called on the model.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_dir)
    classifier.eval()
    return loaded_tokenizer, classifier


# Module-level handles used by predict().
tok, mdl = load_pipeline()

def predict(texts):
    """Predict the category name for one text or for a list of texts.

    Uses the module-level ``tok`` / ``mdl`` pair and ``MAX_LEN``.

    Args:
        texts: A single string or an iterable of strings.

    Returns:
        A list of category names, one per input text (empty for empty input).
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify.
        return []

    inputs = tok(texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    # Put the input tensors on the same device as the model.
    inputs = {name: tensor.to(mdl.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = mdl(**inputs).logits
    preds = torch.argmax(logits, dim=-1).tolist()

    # Read the label names from the loaded model's config rather than from a
    # notebook global, so prediction also works in a session that did not run
    # the training cells.
    names = mdl.config.id2label
    return [names[p] for p in preds]

# Example: classify a few sample comments.
sample_texts = [
    "Mahallemizde sokak lambaları yanmıyor, çok karanlık.",
    "Ellerinize sağlık, park harika olmuş!",
    "Kaldırım çökmüş ve çukur var.",
    "Sokak köpekleri saldırganlaştı, sabah koşamıyoruz.",
]
print(predict(sample_texts))


['Aydınlatma', 'Teşekkür', 'Yol/Kaldırım', 'Hayvan/Sokak Hayvanları']
