In [None]:
!pip install transformers datasets scikit-learn



In [None]:
# Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

# 1. Load the IMDB dataset.
DATASET_ID = "imdb"
dataset = load_dataset(DATASET_ID)

# 2. Data preparation: create the tokenizer used for tokenization below.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded up to that same
    length (``padding="max_length"``).

    Args:
        examples: Mapping that provides a ``"text"`` entry; it is passed
            straight to the tokenizer.

    Returns:
        The result of the tokenizer call.
    """
    encode_kwargs = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **encode_kwargs)


# Reduce the number of examples used for training and testing.
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
small_train = shuffled_train.select(range(2000))  # 2000 training examples
small_test = shuffled_test.select(range(500))     # 500 test examples

# Tokenize the reduced splits (train first, then test).
tokenized_train, tokenized_test = (
    subset.map(tokenize_function, batched=True)
    for subset in (small_train, small_test)
)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Data preparation: shape the tokenized splits into what the model consumes.
def to_model_inputs(split):
    """Return ``split`` without raw text, with ``label`` renamed to ``labels``, in torch format.

    The column-name checks make this step idempotent: re-running the cell does
    not fail on a column that was already removed or renamed by a previous run.

    Args:
        split: A tokenized dataset split (``tokenized_train`` / ``tokenized_test``).

    Returns:
        The prepared split; the input object is not modified in place.
    """
    if "text" in split.column_names:
        split = split.remove_columns(["text"])
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    # with_format returns a formatted dataset rather than mutating the existing one.
    return split.with_format("torch")


tokenized_train = to_model_inputs(tokenized_train)
tokenized_test = to_model_inputs(tokenized_test)

# Load the pretrained BERT model with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),  # weighted F1 score
    }
# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the old spelling is deprecated and rejected by newer releases.
    eval_strategy="epoch",
    # Checkpoints are saved on the same schedule as evaluation, which
    # load_best_model_at_end relies on.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
)
# Define the Trainer.
# NOTE(review): eval_dataset is the test subset, so with load_best_model_at_end
# the checkpoint is selected on the same data the final score is reported on;
# consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer.__init__ (requires transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Fine-tune the model.
trainer.train()

# Evaluate the model on the eval dataset and report the two tracked metrics.
results = trainer.evaluate()
eval_accuracy = results["eval_accuracy"]
eval_f1 = results["eval_f1"]
print(f"Accuracy: {eval_accuracy}")
print(f"F1 Score: {eval_f1}")


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1436,0.374732,0.904,0.903908
2,0.1227,0.332385,0.914,0.913997


Accuracy: 0.914
F1 Score: 0.913996902996571
