In [1]:
import os
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset

In [2]:
import torch
from collections import Counter
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn import metrics
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Load the sentence-level labeled dataset.
# `keys` holds the values that are matched against the `key` column of the sheet when
# per-group statistics are computed (presumably hospital department names).
keys = ['皮肤科', '胸外科', '妇科', '神经外科', '泌尿外科']

# Build the path with os.path.join rather than a "..\data\..." literal: backslash
# separators only resolve on Windows, and "\d" / "\l" are invalid escape sequences
# that recent Python versions warn about.
df1 = pd.read_excel(os.path.join("..", "data", "label_final_sentence_level.xlsx"))
df1

In [4]:
# Class-count table: one row per `key` group (plus an 'All' row), one column per label.
label_cols = [
    '信息支持1-病情描述/分析/诊断',
    '信息支持2-科普读物/其他病人经历',
    '信息支持3-线上/门诊或住院流程/时间/费用',
    '信息支持4-治疗建议',
    '信息支持',
    '情感支持',
]

# Each group to summarise: every key in `keys`, then the whole frame under the name 'All'.
groups = [(key, df1[df1['key'] == key]) for key in keys] + [('All', df1)]

stats = []
for col in label_cols:
    # Short label name: the part before the '-' description.
    short_name = col.split('-')[0]
    # Fix the class order once per label. value_counts() sorts by frequency, so without
    # this the position of a count would depend on which class happens to be larger in a
    # given group, and a class absent from a group would silently shorten the array.
    classes = sorted(df1[col].dropna().unique())
    for name, subset in groups:
        counts = subset[col].value_counts().reindex(classes, fill_value=0)
        stats.append([name, short_name, counts.values])

# Rows -> group name (column 0), columns -> short label name (column 1),
# cells -> array of counts ordered by ascending class value.
stats = pd.pivot(pd.DataFrame(stats), index=0, columns=1, values=2)
stats

1,信息支持,信息支持1,信息支持2,信息支持3,信息支持4,情感支持
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
All,"[9235, 5193]","[10469, 3959]","[14293, 135]","[13821, 607]","[8439, 5989]","[14196, 232]"
妇科,"[2168, 1109]","[2301, 976]","[3264, 13]","[3093, 184]","[2015, 1262]","[3210, 67]"
泌尿外科,"[1517, 894]","[1830, 581]","[2373, 38]","[2250, 161]","[1414, 997]","[2375, 36]"
皮肤科,"[3033, 1421]","[3299, 1155]","[4416, 38]","[4347, 107]","[2230, 2224]","[4397, 57]"
神经外科,"[1377, 917]","[1613, 681]","[2281, 13]","[2198, 96]","[1468, 826]","[2265, 29]"
胸外科,"[1140, 852]","[1426, 566]","[1959, 33]","[1933, 59]","[1312, 680]","[1949, 43]"


In [6]:
# Label columns for which datasets are built below: the four information-support
# sub-types, the overall information-support flag, and emotional support.
label_cols = [
    '信息支持1-病情描述/分析/诊断',
    '信息支持2-科普读物/其他病人经历',
    '信息支持3-线上/门诊或住院流程/时间/费用',
    '信息支持4-治疗建议',
    '信息支持',
    '情感支持',
]

In [7]:
# Build per-label train/val/test CSVs (60/20/20 split; only the training split is
# oversampled). No model is trained in this cell -- it only writes the datasets.
text_col = '医生回复'
output_dir = 'dataset_622'
# DataFrame.to_csv does not create missing parent directories; create the folder up
# front so a fresh checkout / Restart & Run All does not fail on the first write.
os.makedirs(output_dir, exist_ok=True)

for label_col in label_cols:
    print(label_col)
    # File prefix is the short label name before the '-' description. For the current
    # labels this equals label_col[:5], but it does not depend on the name length.
    file_prefix = label_col.split('-')[0]
    train_path = os.path.join(output_dir, file_prefix + '_train.csv')
    val_path = os.path.join(output_dir, file_prefix + '_val.csv')
    test_path = os.path.join(output_dir, file_prefix + '_test.csv')

    data = df1
    columns = [text_col, label_col]
    print(columns)
    # Project onto the two needed columns and move to the generic text/label schema
    # in a single non-inplace rename.
    df = data[columns].rename(columns={text_col: 'text', label_col: 'label'})

    # 60% train / 40% hold-out, then the hold-out is halved into validation and test.
    # NOTE(review): the splits are not stratified; for heavily imbalanced labels the
    # val/test sets may contain very few positives. Passing stratify=df['label'] would
    # address that but changes the saved splits -- confirm before enabling.
    df_train, df_holdout = train_test_split(df, test_size=0.40, random_state=42)
    df_val, df_test = train_test_split(df_holdout, test_size=0.50, random_state=42)
    df_test = df_test.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    # Sanity check: the three splits must partition the data exactly.
    assert len(df_train) + len(df_val) + len(df_test) == len(df)

    # Validation and test sets are saved as-is (no resampling).
    df_val.to_csv(val_path, index=False)
    df_test.to_csv(test_path, index=False)

    # Oversample the minority class of the training split only, then save it.
    label = 'label'
    df_train = df_train.reset_index(drop=True)
    X = df_train.drop(columns=[label])
    y = df_train[label]
    oversample = RandomOverSampler(random_state=42, sampling_strategy='minority')
    X_over, y_over = oversample.fit_resample(X, y)
    # Print the class counts after resampling.
    print(Counter(y_over))
    df_train_over = pd.DataFrame(X_over, columns=X.columns)
    df_train_over[label] = y_over
    df_train_over.to_csv(train_path, index=False)

信息支持1-病情描述/分析/诊断
['医生回复', '信息支持1-病情描述/分析/诊断']
Counter({0: 8405, 1: 8405})
信息支持2-科普读物/其他病人经历
['医生回复', '信息支持2-科普读物/其他病人经历']
Counter({0: 11431, 1: 11431})
信息支持3-线上/门诊或住院流程/时间/费用
['医生回复', '信息支持3-线上/门诊或住院流程/时间/费用']
Counter({0: 11061, 1: 11061})
信息支持4-治疗建议
['医生回复', '信息支持4-治疗建议']
Counter({0: 6770, 1: 6770})
信息支持
['医生回复', '信息支持']
Counter({0: 7348, 1: 7348})
情感支持
['医生回复', '情感支持']
Counter({0: 11359, 1: 11359})
