In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### LIME

In [2]:
# Load the tokenizer and the 2-label sequence classifier from the same local
# checkpoint directory, then move the model to the GPU.
checkpoint_dir = '/home/ubuntu/Otree_Project/models/online_shopping_dataset_60model'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=2)
model = model.to("cuda")
# Prediction function: returns class probabilities
def get_prediction(text, batch_size=32):
    """Return softmax class probabilities for one text or a list of texts.

    Args:
        text: A single string or a sequence of strings. LIME passes the whole
            list of perturbed samples in one call.
        batch_size: Maximum number of texts sent through the model per forward
            pass. Bounds peak GPU memory no matter how many samples the caller
            supplies.

    Returns:
        ``numpy.ndarray`` with one row per input text and one column per class.
    """
    texts = [text] if isinstance(text, str) else list(text)
    rows = []
    # Inference only: without no_grad() every forward pass keeps its
    # activations alive for a backward pass that never happens, which
    # needlessly inflates GPU memory.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = tokenizer(
                chunk, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(model.device)  # follow the model instead of hard-coding the device
            logits = model(**inputs)[0]
            rows.extend(logits.softmax(dim=-1).tolist())
    return np.array(rows)

In [3]:
# Read the spare dataset workbook and keep its 'text' column as a plain list,
# which is what the LIME cell indexes into.
df = pd.read_excel(
    '/home/ubuntu/Otree_Project/Co-Learning/co_learning/spare_dataset.xlsx'
)
text = df['text'].tolist()

In [4]:
import gc
import torch

# Collect unreachable Python objects first so any CUDA tensors they still hold
# are actually released; empty_cache() can only hand back blocks that no live
# tensor occupies.
gc.collect()
torch.cuda.empty_cache()

In [6]:
from collections import Counter
from lime.lime_text import LimeTextExplainer
import os

class_names = ['消极', '积极']  # label 0 = negative, label 1 = positive
# Seed the explainer so the random perturbations (and therefore the saved
# explanation) are repeatable between runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=1)
# Make sure the output directory exists before writing any HTML into it.
lime_output_dir = '/home/ubuntu/Otree_Project/lime_html_spare'
os.makedirs(lime_output_dir, exist_ok=True)
# NOTE(review): num_samples=10 is far below LIME's default of 5000, so the
# fitted local model is very coarse - presumably lowered because of GPU
# memory pressure; confirm before relying on these explanations.
# NOTE(review): the default split_expression (r'\W+') tokenises on non-word
# characters; if the texts are unsegmented Chinese, consider char_level=True
# or a custom split_expression - TODO confirm.
# Dataset rows to explain
for i in range(56, 57):
    exp = explainer.explain_instance(text[i], get_prediction, num_samples=10, labels=(1,))
    exp.save_to_file(
        os.path.join(lime_output_dir, 'lime_exp%d.html' % i),
        labels=(1,),
        predict_proba=False,
        show_predicted_value=False,
    )

RuntimeError: CUDA out of memory. Tried to allocate 640.00 MiB (GPU 0; 14.76 GiB total capacity; 13.64 GiB already allocated; 213.00 MiB free; 13.65 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

#### simulation 寻找70%模型

In [2]:
# Load the sentiment CSV, shuffle every row with a fixed seed and renumber the
# index, then hold out 20% of the rows as the test split (also seeded).
origin_df = (
    pd.read_csv('/home/ubuntu/Otree_Project/dataset/online_shopping_sentiment.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
train_df, test_df = train_test_split(origin_df, random_state=1, test_size=0.2)

In [3]:
# Load the tokenizer and the 2-label sequence classifier from the same local
# checkpoint directory, then move the model to the GPU.
checkpoint_dir = '/home/ubuntu/Otree_Project/models/online_shopping_dataset_60model'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=2)
model = model.to("cuda")

In [5]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, cohen_kappa_score
# Evaluation metrics passed to the Trainer
def compute_metrics(pred):
    """Score a prediction object with accuracy and Cohen's kappa.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'kappa'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = {}
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['kappa'] = cohen_kappa_score(y_true, y_pred)
    return scores
# Hyper-parameters for the short fine-tuning runs: a single epoch, batches of
# 12 for both training and evaluation, and an evaluation pass after each epoch.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # schedule
    num_train_epochs=1,
    evaluation_strategy="epoch",
    # batching
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    # optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    # Left disabled by the author:
    # load_best_model_at_end=True,
    # save_strategy="epoch"
)
class My_Dataset(Dataset):
    """Map-style dataset that shuffles a dataframe and tokenizes its texts up front.

    Args:
        dataframe: Frame with a ``'text'`` column and a ``'label'`` column.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, truncation=True, padding=True, max_length=...)``
            whose result supports ``.items()`` and maps each field name to a
            per-row sequence.
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.
            The default ``None`` keeps the original unseeded behaviour; pass
            an int to get the same row order on every run.
        max_length: Truncation length handed to the tokenizer (default 256).
    """

    def __init__(self, dataframe, tokenizer, random_state=None, max_length=256):
        # Shuffle all rows, then renumber so positional access lines up with
        # the tokenizer output and the label list.
        df = dataframe.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
        self.x = tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=max_length)
        self.y = df['label'].tolist()

    def __getitem__(self, index):
        """Return the tokenizer fields of row ``index`` as tensors, plus ``'label'``."""
        item = {name: torch.tensor(values[index]) for name, values in self.x.items()}
        # The label is wrapped in a one-element list, so it is a shape-(1,) tensor.
        item['label'] = torch.tensor([self.y[index]])
        return item

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.y)

In [9]:
# Rows 200-350 have already been used and raised performance to 69;
# trying rows 100-200 / 400-500 brought little further improvement.

test_dataset = My_Dataset(test_df, tokenizer)
for i in range(1, 2):
    # Build the training set from the currently selected slice of train_df.
    train_dataset = My_Dataset(train_df.iloc[100:200], tokenizer)
    # The Trainer receives the already-loaded model, so training updates it
    # in place; evaluation uses the held-out test dataset.
    trainer = Trainer(
        args=training_args,
        model=model,
        compute_metrics=compute_metrics,
        eval_dataset=test_dataset,
        train_dataset=train_dataset,
    )
    trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 9


Epoch,Training Loss,Validation Loss,Accuracy,Kappa
1,No log,0.496885,0.85,0.693878


***** Running Evaluation *****
  Num examples = 20
  Batch size = 12


Training completed. Do not forget to share your model on huggingface.co/models =)


