In [1]:
pip install xgboost




In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# 1. Veriyi yükle
df = pd.read_csv("all_texts.csv")  # Kendi CSV dosyanı buraya koy
texts = df["text"].tolist()
labels = df["author"].astype("category")
y = labels.cat.codes  # Sayısal etiketler
label_map = dict(enumerate(labels.cat.categories))  # Etiket eşlemesi

# 2. Eğitim-test ayırımı
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# 3. BERT Tokenizer ve Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()

# 4. BERT ile metinleri vektöre çevir
def get_bert_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] vektörü
        embeddings.append(cls_embedding)
    return embeddings

X_train_vec = get_bert_embeddings(X_train_texts)
X_test_vec = get_bert_embeddings(X_test_texts)

# 5. Modelleri tanımla
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=300),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# 6. Model eğitimi ve değerlendirme
results = []

for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1
    })

# 7. Sonuçları tablo olarak göster
results_df = pd.DataFrame(results)
print(results_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 960/960 [29:57<00:00,  1.87s/it]
100%|██████████| 240/240 [07:26<00:00,  1.86s/it]
Parameters: { "use_label_encoder" } are not used.



           Model  Accuracy  Precision    Recall  F1-Score
0  Random Forest  0.508333   0.546224  0.519348  0.496556
1            SVM  0.220833   0.254114  0.265014  0.198102
2        XGBoost  0.458333   0.481367  0.477428  0.456192
3    Naive Bayes  0.504167   0.533043  0.502009  0.497400
4            MLP  0.604167   0.628874  0.626324  0.613567
5  Decision Tree  0.237500   0.263950  0.264953  0.243131
