In [1]:
import os
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset

In [2]:
import torch
from collections import Counter
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn import metrics
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Capture the kernel's working directory; the dataset, model and prediction
# paths defined in the Settings cell are all joined onto this value.
curr_path = os.getcwd()
# Bare expression so the notebook echoes the resolved directory.
curr_path

'/home/anqizhao/Anqi/Bert Classifier'

# Settings

In [4]:
# Run configuration: which checkpoint to fine-tune, which label to predict,
# and where the input splits and output artifacts live.

# Name of the pretrained checkpoint passed to `from_pretrained`.
model_name = 'bert-base-chinese'
# Checkpoint directory handed to the Trainer as `output_dir`.
checkpoint_dir = 'checkpoints/BERT'

# All label columns that can be selected as the binary target.
label_cols = [
    '信息支持1-病情描述/分析/诊断',
    '信息支持2-科普读物/其他病人经历',
    '信息支持3-线上/门诊或住院流程/时间/费用',
    '信息支持4-治疗建议',
    '信息支持',
    '情感支持',
]
# Target for this run: change this single assignment to any entry of
# `label_cols` to train a classifier for another label.
label_col = '情感支持'
if label_col not in label_cols:
    # Fail here rather than later with a confusing "file not found" on the CSVs.
    raise ValueError(
        'label_col {!r} is not one of the known label columns: {}'.format(label_col, label_cols)
    )

# File names are keyed by the first five characters of the label, which is
# enough to tell the six labels apart ('信息支持1'..'信息支持4', '信息支持', '情感支持').
label_prefix = label_col[:5]

# `exist_ok=True` makes the cell safely re-runnable without a separate
# existence check (and without the check-then-create race).
dataset_folder = 'dataset_622'
os.makedirs(dataset_folder, exist_ok=True)

classifier_folder = 'Classifier'
os.makedirs(classifier_folder, exist_ok=True)

# Suffix that tags output artifacts with this run's settings.
parameter = '_622_5e-5'

# Input splits for the selected label.
train_path = os.path.join(curr_path, dataset_folder, label_prefix + '_train.csv')
val_path = os.path.join(curr_path, dataset_folder, label_prefix + '_val.csv')
test_path = os.path.join(curr_path, dataset_folder, label_prefix + '_test.csv')
# Output artifacts: the saved fine-tuned model and the test-set predictions.
model_path = os.path.join(curr_path, classifier_folder, label_prefix + parameter + '_model')
predict_path = os.path.join(curr_path, classifier_folder, label_prefix + parameter + '_predict.csv')

In [5]:
# Fix every source of randomness touched in this notebook so repeated runs
# are comparable. `seed` is also forwarded to TrainingArguments further down.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
# Trade cuDNN autotuning for deterministic kernel selection.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

# Train and Predict

In [6]:
# Fine-tune a two-class sequence classifier for `label_col`, save it, then
# score the test split and write the predictions to `predict_path`.


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of rows, with truncation enabled."""
    return tokenizer(examples['text'], truncation=True)


def _max_softmax_probability(logits):
    """Return, for each row of ``logits``, the largest softmax probability.

    The row maximum is subtracted before exponentiating. This leaves the
    softmax unchanged mathematically but keeps ``np.exp`` from overflowing
    to ``inf`` (and the ratio from becoming NaN) when a logit is large.
    """
    logits = np.asarray(logits, dtype=np.float64)
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return (exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)).max(axis=-1)


# ---- Data -------------------------------------------------------------
dataset = load_dataset('csv', data_files={'train': [train_path], 'val': [val_path], 'test': [test_path]})
print(dataset)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# dataset.map tokenizes in batches, which avoids exhausting RAM when the
# dataset is large.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Pads each batch when it is collated, so no padding is needed at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---- Model and training setup ------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=2,
    logging_steps=10,
    # Evaluation and saving share the same 'epoch' schedule.
    load_best_model_at_end=True,
    # NOTE(review): recent transformers releases appear to rename this
    # argument to `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    # weight_decay=0.01 was tried and is currently disabled.
    logging_dir='logs',
    save_total_limit=20,
    seed=seed,
    # Learning rates tried so far: 5e-5, 3e-5.
    learning_rate=5e-5,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ---- Train and persist -------------------------------------------------
trainer.train()
# Save the final model.
trainer.save_model(model_path)

# ---- Predict on the test split -----------------------------------------
print(label_col)
pred_dataset = tokenized_dataset["test"]
predictions = trainer.predict(pred_dataset)

# Name the two classes '0' and '1'. This runs after save_model above, so it
# only affects the in-memory config.
model.config.id2label[0] = '0'
model.config.id2label[1] = '1'

# Predicted class index per row, and the same prediction as a string label.
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Confidence of each prediction: the softmax probability of the winning class.
scores = _max_softmax_probability(predictions.predictions)

# Texts and gold labels are re-read from the test CSV.
df_test = pd.read_csv(test_path)
pred_texts = df_test['text'].astype('str').tolist()
original_label = df_test['label'].tolist()

# Building the frame from a dict makes pandas raise if the CSV rows and the
# predictions ever differ in length, instead of silently truncating to the
# shorter one as zip() would.
df = pd.DataFrame({
    'text': pred_texts,
    'pred': preds,
    'original_label': original_label,
    'score': scores,
})
df.to_csv(predict_path, index=False)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 17028
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2886
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2886
    })
})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0004,0.057909
2,0.0002,0.033651




情感支持




# Evaluation

In [7]:
# Report test-set metrics for the current label from the saved predictions.
separator = '---------------------------------------'
print(separator)
print(label_col)

# Read the predictions back from disk rather than reusing the in-memory frame.
df = pd.read_csv(predict_path)
y_ture, y_pred = df['original_label'], df['pred']

# Every metric is computed over the same two classes.
binary_labels = {'labels': [0, 1]}
acc = metrics.accuracy_score(y_ture, y_pred)
f1 = metrics.f1_score(y_ture, y_pred, **binary_labels)
recall = metrics.recall_score(y_ture, y_pred, **binary_labels)
precision = metrics.precision_score(y_ture, y_pred, **binary_labels)
matrix = metrics.confusion_matrix(y_ture, y_pred, **binary_labels)
report = metrics.classification_report(y_ture, y_pred, **binary_labels)

# Scalar metrics, one per line.
for metric_name, metric_value in (('acc', acc), ('recall', recall), ('precision', precision), ('f1', f1)):
    print(metric_name + ' is: ' + str(metric_value))

# Confusion matrix labelled for reading: rows from class_names1, columns from class_names2.
class_names1 = ['True Negative', 'True Positive']
class_names2 = ['Pred Negative', 'Pred Positive']
df_cm = pd.DataFrame(matrix, index=class_names1, columns=class_names2)
print(df_cm)
print(report)
print(separator)

---------------------------------------
情感支持
acc is: 0.9948024948024948
recall is: 0.8181818181818182
precision is: 0.8372093023255814
f1 is: 0.8275862068965518
               Pred Negative  Pred Positive
True Negative           2835              7
True Positive              8             36
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2842
           1       0.84      0.82      0.83        44

    accuracy                           0.99      2886
   macro avg       0.92      0.91      0.91      2886
weighted avg       0.99      0.99      0.99      2886

---------------------------------------
