In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/liar-dataset/test.tsv
/kaggle/input/liar-dataset/README
/kaggle/input/liar-dataset/train.tsv
/kaggle/input/liar-dataset/valid.tsv


In [2]:
import pandas as pd

# 讀取 Kaggle 數據集
train_path = "/kaggle/input/liar-dataset/train.tsv"
valid_path = "/kaggle/input/liar-dataset/valid.tsv"
test_path  = "/kaggle/input/liar-dataset/test.tsv"

columns = ["id", "label", "statement", "subject", "speaker", "job_title", "state_info",
           "party_affiliation", "barely_true_counts", "false_counts", 
           "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"]

train_df = pd.read_csv(train_path, delimiter='\t', header=None, names=columns)
valid_df = pd.read_csv(valid_path, delimiter='\t', header=None, names=columns)
test_df = pd.read_csv(test_path, delimiter='\t', header=None, names=columns)

# 查看數據
print(train_df.head())

           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker             job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import torch
import os


# 1️⃣ 加載 tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# 2️⃣ 數據預處理
def preprocess_data(df):
    return tokenizer(df["statement"], padding="max_length", truncation=True, max_length=128)

label_mapping = {"false": 0, "true": 1, "half-true": 2, "barely-true": 3, "mostly-true": 4, "pants-fire": 5}
train_df["label"] = train_df["label"].map(label_mapping)
valid_df["label"] = valid_df["label"].map(label_mapping)

train_dataset = Dataset.from_pandas(train_df[["statement", "label"]])
valid_dataset = Dataset.from_pandas(valid_df[["statement", "label"]])

train_dataset = train_dataset.map(preprocess_data, batched=True)
valid_dataset = valid_dataset.map(preprocess_data, batched=True)

# 3️⃣ 加載模型
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

# 4️⃣ 訓練參數
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",  # 停用 W&B
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)    

print("Output directory:", training_args.output_dir)
# 5️⃣ 訓練模型
from transformers import DataCollatorWithPadding

# 使用 DataCollatorWithPadding 來確保批次內的長度一致
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,  # 替代 tokenizer
)
trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Output directory: ./results




Epoch,Training Loss,Validation Loss
1,1.7288,1.703796
2,1.614,1.698482
3,1.4343,1.825044




TrainOutput(global_step=1920, training_loss=1.5212613423665364, metrics={'train_runtime': 319.9678, 'train_samples_per_second': 96.01, 'train_steps_per_second': 6.001, 'total_flos': 1017422193623040.0, 'train_loss': 1.5212613423665364, 'epoch': 3.0})

In [5]:
model.save_pretrained("/kaggle/working/distilbert-liar")
tokenizer.save_pretrained("/kaggle/working/distilbert-liar")

('/kaggle/working/distilbert-liar/tokenizer_config.json',
 '/kaggle/working/distilbert-liar/special_tokens_map.json',
 '/kaggle/working/distilbert-liar/vocab.txt',
 '/kaggle/working/distilbert-liar/added_tokens.json')