In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### LIME

In [None]:
# Load the tokenizer and the 2-label sequence-classification model stored under the
# "online_shopping_dataset_60model" checkpoint directory.
# The directory is named once so the tokenizer and the model always come from the same place.
model_dir = '/home/ubuntu/Otree_Project/models/online_shopping_dataset_60model'
# Use the GPU when one is present, otherwise fall back to CPU instead of crashing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2).to(device)
# Prediction function: outputs class probabilities (one row per input text).
def get_prediction(text, batch_size=32):
    """Return class probabilities for one text or a batch of texts.

    This function is passed to LIME's ``explain_instance`` as the classifier
    function, which is called with a list of perturbed strings and must return
    a 2-D array of per-class probabilities. Returning a single ``int`` (the
    arg-max over the whole flattened batch) breaks that contract, so the full
    probability matrix is returned instead.

    Args:
        text: A single string or a sequence of strings.
        batch_size: Maximum number of texts sent through the model per forward
            pass, so a large number of LIME samples does not exhaust memory.

    Returns:
        numpy.ndarray of shape ``(n_texts, n_classes)``; each row is the
        softmax of the model's logits for the corresponding text.
    """
    # Accept a bare string as a batch of one; coerce items (e.g. numpy.str_) to str.
    texts = [text] if isinstance(text, str) else [str(t) for t in text]
    if not texts:
        return np.empty((0, model.config.num_labels))

    # Send the inputs to whatever device the model lives on rather than assuming CUDA.
    device = next(model.parameters()).device
    chunks = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)
            logits = model(**inputs)[0]
            chunks.append(logits.softmax(dim=1).cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# Read the sentiment-analysis spreadsheet and keep the first 100 entries of the
# '标题/微博内容' (title / Weibo content) column as a plain Python list.
df = pd.read_excel(
    '/home/ubuntu/Otree_Project/Co-Learning/co_learning/情感分析_20220915.xlsx'
)
text = df['标题/微博内容'].head(100).tolist()

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before the LIME cell runs.
# `torch` is already imported in the notebook's first cell, so it is not re-imported here.
torch.cuda.empty_cache()

In [None]:
from collections import Counter
from lime.lime_text import LimeTextExplainer
# Display names LIME uses for class indices 0 and 1: '消极' (negative), '积极' (positive).
class_names = ['消极', '积极']
explainer = LimeTextExplainer(class_names=class_names)
# NOTE(review): the explainer is built with its default tokenisation; for unsegmented
# Chinese text that may treat long runs as a single "word" — consider char_level=True
# or pre-segmenting the text. TODO confirm against the LIME documentation.

# Dataset: explain samples 96..99 of `text` for label 1 and write one HTML report each.
html_path = '/home/ubuntu/Otree_Project/lime_html/lime_exp%d.html'
for i in range(96, 100):
    exp = explainer.explain_instance(
        text[i],
        get_prediction,
        num_samples=40,
        labels=(1,),
    )
    exp.save_to_file(
        html_path % i,
        labels=(1,),
        predict_proba=False,
        show_predicted_value=False,
    )

#### Simulation: 寻找 70% 模型 (finding the 70% model)

In [15]:
# Load the online-shopping sentiment CSV, shuffle it with a fixed seed, and hold out
# 20% of the shuffled rows as the test split (also seeded, so the split is repeatable).
origin_df = (
    pd.read_csv('/home/ubuntu/Otree_Project/dataset/online_shopping_sentiment.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
train_df, test_df = train_test_split(
    origin_df,
    test_size=0.2,
    random_state=1,
)

In [20]:
# Load the tokenizer and the 2-label sequence-classification model stored under the
# "online_shopping_dataset_70model" checkpoint directory (replaces the previously loaded pair).
# The directory is named once so the tokenizer and the model always come from the same place.
model_dir = '/home/ubuntu/Otree_Project/models/online_shopping_dataset_70model'
# Use the GPU when one is present, otherwise fall back to CPU instead of crashing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2).to(device)

Didn't find file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/added_tokens.json. We won't load it.
loading file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/vocab.txt
loading file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/tokenizer.json
loading file None
loading file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/special_tokens_map.json
loading file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/tokenizer_config.json
loading configuration file /home/ubuntu/Otree_Project/models/online_shopping_dataset_70model/config.json
Model config BertConfig {
  "_name_or_path": "/home/ubuntu/Otree_Project/models/online_shopping_dataset_70model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidde

In [21]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, cohen_kappa_score
# Build the evaluation metrics reported by the Trainer.
def compute_metrics(pred):
    """Compute accuracy and Cohen's kappa for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with keys ``'accuracy'`` and ``'kappa'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'kappa': cohen_kappa_score(y_true, y_hat),
    }
training_args = TrainingArguments(
    # Output directory and directory for storing logs.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation settings: total epochs, learning rate, strength of weight decay.
    num_train_epochs=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    # Batch size per device during training and during evaluation.
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    # Left disabled by the author: load_best_model_at_end=True, save_strategy="epoch".
)
class My_Dataset(Dataset):
    """Map-style dataset that shuffles and tokenizes a text/label DataFrame up front.

    The frame must provide a ``'text'`` column (passed to the tokenizer) and a
    ``'label'`` column. All rows are tokenized once in ``__init__``; items are
    converted to tensors lazily in ``__getitem__``.
    """

    def __init__(self, dataframe, tokenizer, random_state=None):
        """Shuffle ``dataframe`` and tokenize its ``'text'`` column.

        Args:
            dataframe: pandas DataFrame with ``'text'`` and ``'label'`` columns.
            tokenizer: Callable invoked as
                ``tokenizer(list_of_texts, truncation=True, padding=True, max_length=256)``
                returning a mapping from field name to a per-sample sequence.
            random_state: Optional seed forwarded to ``DataFrame.sample``. The
                default ``None`` keeps the original unseeded shuffle; pass an
                int to make the sample order reproducible across runs.
        """
        # Shuffle rows (texts and labels stay paired because the whole frame is sampled).
        df = dataframe.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
        self.x = tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=256)
        self.y = df['label'].tolist()

    def __getitem__(self, index):
        """Return the tokenizer fields for sample ``index`` as tensors, plus ``'label'``."""
        item = {k: torch.tensor(v[index]) for k, v in self.x.items()}
        # The label is kept as a 1-element tensor, as in the original implementation.
        item['label'] = torch.tensor([self.y[index]])
        return item

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.y)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
# Author's log (translated): slice 200-350 has already been used and raised performance
# to 69; trying 100-200 / 400-500 brought little improvement.
# (Presumably these are row ranges of train_df and 69 is accuracy in % — confirm.)
test_dataset = My_Dataset(test_df.head(1000), tokenizer)
for i in range(1, 2):
    # Fine-tune the current `model` on rows 400..499 of the training split.
    train_dataset = My_Dataset(train_df.iloc[400:500], tokenizer)
    trainer = Trainer(
        compute_metrics=compute_metrics,   # callback that computes the reported metrics
        eval_dataset=test_dataset,         # evaluation dataset
        train_dataset=train_dataset,       # training dataset
        args=training_args,                # training arguments defined earlier
        model=model,                       # the instantiated Transformers model to train
    )
    trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 2
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 18


Epoch,Training Loss,Validation Loss,Accuracy,Kappa
1,No log,0.594238,0.738,0.478071
2,No log,0.571155,0.743,0.484582


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 12
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 12


Training completed. Do not forget to share your model on huggingface.co/models =)




In [18]:
# Evaluate the existing `trainer` on its eval_dataset and display the metrics dict.
# This cell does not build a Trainer itself: it relies on the `trainer` object created in
# the fine-tuning cell (Trainer(model=model, args=training_args, train_dataset=train_dataset,
# eval_dataset=test_dataset, compute_metrics=compute_metrics)).
# NOTE(review): this cell's execution count (18) is lower than the fine-tuning cell's (22),
# so the metrics shown below were presumably produced before that training run — re-run the
# notebook top to bottom to confirm the numbers.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 12


{'eval_loss': 0.6268118619918823,
 'eval_accuracy': 0.689,
 'eval_kappa': 0.37859654731457804,
 'eval_runtime': 47.4063,
 'eval_samples_per_second': 21.094,
 'eval_steps_per_second': 1.772}