# 本文的研究目的是 <code>情感分析</code>，但是通过大语言模型 <code>bert</code> 来做
# 1.0 预训练与微调

In [1]:
###准备训练集与测试集


In [2]:
import os
import logging
import sys

sys.path.append('../bertTools')
from bertTools.utils import logger_init
from bertTools.model import BertConfig
from bertTools.model import BertForPretrainingModel
from bertTools.utils import LoadBertPretrainingDataset
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_polynomial_decay_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy
import torch
import time


class ModelConfig:
    def __init__(self):
        # self.project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.project_dir=os.getcwd()

        # ========== wike2 数据集相关配置
        # self.dataset_dir = os.path.join(self.project_dir, 'data', 'WikiText')
        # self.pretrained_model_dir = os.path.join(self.project_dir, "bert_base_uncased_english")
        # self.train_file_path = os.path.join(self.dataset_dir, 'wiki.train.tokens')
        # self.val_file_path = os.path.join(self.dataset_dir, 'wiki.valid.tokens')
        # self.test_file_path = os.path.join(self.dataset_dir, 'wiki.test.tokens')
        # self.data_name = 'wiki2'

        # ========== songci 数据集相关配置
        self.dataset_dir = os.path.join(self.project_dir, 'bertTools', 'SongCi')
        self.pretrained_model_dir = os.path.join(self.project_dir, 'bertTools',"bert_google_1_L-12_H-768_A-12_cn")
        self.train_file_path = os.path.join(self.dataset_dir, 'songci.train.txt')
        self.val_file_path = os.path.join(self.dataset_dir, 'songci.valid.txt')
        self.test_file_path = os.path.join(self.dataset_dir, 'songci.test.txt')
        self.data_name = 'model'

        # 如果需要切换数据集，只需要更改上面的配置即可
        self.vocab_path = os.path.join(self.pretrained_model_dir, 'vocab.txt')
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.model_save_dir = os.path.join(self.project_dir,'bertTools','result', 'cache')
        self.logs_save_dir = os.path.join(self.project_dir, 'bertTools','result','logs')
        self.model_save_path = os.path.join(self.model_save_dir, 'modelResult',f'model_{self.data_name}.bin')
        self.writer = SummaryWriter(f"bertTools/result/{self.data_name}")
        self.is_sample_shuffle = True
        self.use_embedding_weight = True
        self.batch_size = 16
        self.max_sen_len = None  # 为None时则采用每个batch中最长的样本对该batch中的样本进行padding
        self.pad_index = 0
        self.random_state = 2022
        self.learning_rate = 4e-5
        self.weight_decay = 0.1
        self.masked_rate = 0.15
        self.masked_token_rate = 0.8
        self.masked_token_unchanged_rate = 0.5
        self.log_level = logging.DEBUG
        self.use_torch_multi_head = False  # False表示使用model/BasicBert/MyTransformer中的多头实现
        self.epochs = 200
        self.model_val_per_epoch = 1

        logger_init(log_file_name=self.data_name, log_level=self.log_level,
                    log_dir=self.logs_save_dir)
        if not os.path.exists(self.model_save_dir):
            os.makedirs(self.model_save_dir)
        bert_config_path = os.path.join(self.pretrained_model_dir, "config.json")
        bert_config = BertConfig.from_json_file(bert_config_path)
        for key, value in bert_config.__dict__.items():
            self.__dict__[key] = value
        # 将当前配置打印到日志文件中
        logging.info(" ### 将当前配置打印到日志文件中 ")
        for key, value in self.__dict__.items():
            logging.info(f"### {key} = {value}")


def train(config):
    model = BertForPretrainingModel(config,
                                    config.pretrained_model_dir)
    last_epoch = -1
    if os.path.exists(config.model_save_path):
        checkpoint = torch.load(config.model_save_path)
        last_epoch = checkpoint['last_epoch']
        loaded_paras = checkpoint['model_state_dict']
        model.load_state_dict(loaded_paras)
        logging.info("## 成功载入已有模型，进行追加训练......")
    model = model.to(config.device)
    model.train()
    bert_tokenize = BertTokenizer.from_pretrained(config.pretrained_model_dir).tokenize
    data_loader = LoadBertPretrainingDataset(vocab_path=config.vocab_path,
                                             tokenizer=bert_tokenize,
                                             batch_size=config.batch_size,
                                             max_sen_len=config.max_sen_len,
                                             max_position_embeddings=config.max_position_embeddings,
                                             pad_index=config.pad_index,
                                             is_sample_shuffle=config.is_sample_shuffle,
                                             random_state=config.random_state,
                                             data_name=config.data_name,
                                             masked_rate=config.masked_rate,
                                             masked_token_rate=config.masked_token_rate,
                                             masked_token_unchanged_rate=config.masked_token_unchanged_rate)
    train_iter, test_iter, val_iter = \
        data_loader.load_train_val_test_data(test_file_path=config.test_file_path,
                                             train_file_path=config.train_file_path,
                                             val_file_path=config.val_file_path)
    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
            "initial_lr": config.learning_rate

        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "initial_lr": config.learning_rate
        },
    ]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = AdamW(optimizer_grouped_parameters)
    scheduler = get_polynomial_decay_schedule_with_warmup(optimizer,
                                                          int(len(train_iter) * 0),
                                                          int(config.epochs * len(train_iter)),
                                                          last_epoch=last_epoch)
    max_acc = 0
    state_dict = None
    for epoch in range(config.epochs):
        losses = 0
        start_time = time.time()
        for idx, (b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label) in enumerate(train_iter):
            b_token_ids = b_token_ids.to(config.device)  # [src_len, batch_size]
            b_segs = b_segs.to(config.device)
            b_mask = b_mask.to(config.device)
            b_mlm_label = b_mlm_label.to(config.device)
            b_nsp_label = b_nsp_label.to(config.device)
            loss, mlm_logits, nsp_logits = model(input_ids=b_token_ids,
                                                 attention_mask=b_mask,
                                                 token_type_ids=b_segs,
                                                 masked_lm_labels=b_mlm_label,
                                                 next_sentence_labels=b_nsp_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            losses += loss.item()
            mlm_acc, _, _, nsp_acc, _, _ = accuracy(mlm_logits, nsp_logits, b_mlm_label,
                                                    b_nsp_label, data_loader.PAD_IDX)
            if idx % 20 == 0:
                logging.info(f"Epoch: [{epoch + 1}/{config.epochs}], Batch[{idx}/{len(train_iter)}], "
                             f"Train loss :{loss.item():.3f}, Train mlm acc: {mlm_acc:.3f},"
                             f"nsp acc: {nsp_acc:.3f}")
                config.writer.add_scalar('Training/Loss', loss.item(), scheduler.last_epoch)
                config.writer.add_scalar('Training/Learning Rate', scheduler.get_last_lr()[0], scheduler.last_epoch)
                config.writer.add_scalars(main_tag='Training/Accuracy',
                                          tag_scalar_dict={'NSP': nsp_acc,
                                                           'MLM': mlm_acc},
                                          global_step=scheduler.last_epoch)
        end_time = time.time()
        train_loss = losses / len(train_iter)
        logging.info(f"Epoch: [{epoch + 1}/{config.epochs}], Train loss: "
                     f"{train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")
        if (epoch + 1) % config.model_val_per_epoch == 0:
            mlm_acc, nsp_acc = evaluate(config, val_iter, model, data_loader.PAD_IDX)
            logging.info(f" ### MLM Accuracy on val: {round(mlm_acc, 4)}, "
                         f"NSP Accuracy on val: {round(nsp_acc, 4)}")
            config.writer.add_scalars(main_tag='Testing/Accuracy',
                                      tag_scalar_dict={'NSP': nsp_acc,
                                                       'MLM': mlm_acc},
                                      global_step=scheduler.last_epoch)
            # mlm_acc, nsp_acc = evaluate(config, train_iter, model, data_loader.PAD_IDX)
            if mlm_acc > max_acc:
                max_acc = mlm_acc
                state_dict = deepcopy(model.state_dict())
            torch.save({'last_epoch': scheduler.last_epoch,
                        'model_state_dict': state_dict},
                       config.model_save_path)


def accuracy(mlm_logits, nsp_logits, mlm_labels, nsp_label, PAD_IDX):
    """
    :param mlm_logits:  [src_len,batch_size,src_vocab_size]
    :param mlm_labels:  [src_len,batch_size]
    :param nsp_logits:  [batch_size,2]
    :param nsp_label:  [batch_size]
    :param PAD_IDX:
    :return:
    """
    mlm_pred = mlm_logits.transpose(0, 1).argmax(axis=2).reshape(-1)
    # 将 [src_len,batch_size,src_vocab_size] 转成 [batch_size, src_len,src_vocab_size]
    mlm_true = mlm_labels.transpose(0, 1).reshape(-1)
    # 将 [src_len,batch_size] 转成 [batch_size， src_len]
    mlm_acc = mlm_pred.eq(mlm_true)  # 计算预测值与正确值比较的情况，得到预测正确的个数（此时还包括有mask位置）
    mask = torch.logical_not(mlm_true.eq(PAD_IDX))  # 找到真实标签中，mask位置的信息。 mask位置为FALSE，非mask位置为TRUE
    mlm_acc = mlm_acc.logical_and(mask)  # 去掉mlm_acc中mask的部分
    mlm_correct = mlm_acc.sum().item()
    mlm_total = mask.sum().item()
    mlm_acc = float(mlm_correct) / mlm_total

    nsp_correct = (nsp_logits.argmax(1) == nsp_label).float().sum()
    nsp_total = len(nsp_label)
    nsp_acc = float(nsp_correct) / nsp_total
    return [mlm_acc, mlm_correct, mlm_total, nsp_acc, nsp_correct, nsp_total]


def evaluate(config, data_iter, model, PAD_IDX):
    model.eval()
    mlm_corrects, mlm_totals, nsp_corrects, nsp_totals = 0, 0, 0, 0
    with torch.no_grad():
        for idx, (b_token_ids, b_segs, b_mask, b_mlm_label, b_nsp_label) in enumerate(data_iter):
            b_token_ids = b_token_ids.to(config.device)  # [src_len, batch_size]
            b_segs = b_segs.to(config.device)
            b_mask = b_mask.to(config.device)
            b_mlm_label = b_mlm_label.to(config.device)
            b_nsp_label = b_nsp_label.to(config.device)
            mlm_logits, nsp_logits = model(input_ids=b_token_ids,
                                           attention_mask=b_mask,
                                           token_type_ids=b_segs)
            result = accuracy(mlm_logits, nsp_logits, b_mlm_label, b_nsp_label, PAD_IDX)
            _, mlm_cor, mlm_tot, _, nsp_cor, nsp_tot = result
            mlm_corrects += mlm_cor
            mlm_totals += mlm_tot
            nsp_corrects += nsp_cor
            nsp_totals += nsp_tot
    model.train()
    return [float(mlm_corrects) / mlm_totals, float(nsp_corrects) / nsp_totals]


def inference(config, sentences=None, masked=False, language='en', random_state=None):
    """
    :param config:
    :param sentences:
    :param masked: 推理时的句子是否Mask
    :param language: 语种
    :param random_state:  控制mask字符时的随机状态
    :return:
    """
    bert_tokenize = BertTokenizer.from_pretrained(config.pretrained_model_dir).tokenize
    data_loader = LoadBertPretrainingDataset(vocab_path=config.vocab_path,
                                             tokenizer=bert_tokenize,
                                             pad_index=config.pad_index,
                                             random_state=config.random_state,
                                             masked_rate=0.15)  # 15% Mask掉
    token_ids, pred_idx, mask = data_loader.make_inference_samples(sentences,
                                                                   masked=masked,
                                                                   language=language,
                                                                   random_state=random_state)
    model = BertForPretrainingModel(config,
                                    config.pretrained_model_dir)
    if os.path.exists(config.model_save_path):
        checkpoint = torch.load(config.model_save_path)
        loaded_paras = checkpoint['model_state_dict']
        model.load_state_dict(loaded_paras)
        logging.info("## 成功载入已有模型进行推理......")
    else:
        raise ValueError(f"模型 {config.model_save_path} 不存在！")
    model = model.to(config.device)
    model.eval()
    with torch.no_grad():
        token_ids = token_ids.to(config.device)  # [src_len, batch_size]
        mask = mask.to(config.device)
        mlm_logits, _ = model(input_ids=token_ids,
                              attention_mask=mask)
    pretty_print(token_ids, mlm_logits, pred_idx,
                 data_loader.vocab.itos, sentences, language)


def pretty_print(token_ids, logits, pred_idx, itos, sentences, language):
    """
    格式化输出结果
    :param token_ids:   [src_len, batch_size]
    :param logits:  [src_len, batch_size, vocab_size]
    :param pred_idx:   二维列表，每个内层列表记录了原始句子中被mask的位置
    :param itos:
    :param sentences: 原始句子
    :return:
    """
    token_ids = token_ids.transpose(0, 1)  # [batch_size,src_len]
    logits = logits.transpose(0, 1)  # [batch_size, src_len,vocab_size]
    y_pred = logits.argmax(axis=2)  # [batch_size, src_len]
    sep = " " if language == 'en' else ""
    for token_id, sentence, y, y_idx in zip(token_ids, sentences, y_pred, pred_idx):
        sen = [itos[id] for id in token_id]
        sen_mask = sep.join(sen).replace(" ##", "").replace("[PAD]", "").replace(" ,", ",")
        sen_mask = sen_mask.replace(" .", ".").replace("[SEP]", "").replace("[CLS]", "").lstrip()
        logging.info(f" ### 原始: {sentence}")
        logging.info(f"  ## 掩盖: {sen_mask}")
        for idx in y_idx:
            sen[idx] = itos[y[idx]].replace("##", "")
        sen = sep.join(sen).replace("[PAD]", "").replace(" ,", ",")
        sen = sen.replace(" .", ".").replace("[SEP]", "").replace("[CLS]", "").lstrip()
        logging.info(f"  ## 预测: {sen}")
        logging.info("===============")




2023-07-16 15:48:46.873146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
config = ModelConfig()
train(config)
# sentences_1 = ["I no longer love her, true, but perhaps I love her.",
#                "Love is so short and oblivion so long."]
# sentences_2 = ["十年生死两茫茫。不思量。自难忘。千里孤坟，无处话凄凉。",
#                "红酥手。黄藤酒。满园春色宫墙柳。"]
# inference(config, sentences_2, masked=False, language='zh',random_state=2022)

[2023-07-16 15:48:49] - INFO: 成功导入BERT配置文件 /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/bert_google_1_L-12_H-768_A-12_cn/config.json
[2023-07-16 15:48:49] - INFO:  ### 将当前配置打印到日志文件中 
[2023-07-16 15:48:49] - INFO: ### project_dir = /Users/czc/PycharmProjects/stockMarketAnalysis
[2023-07-16 15:48:49] - INFO: ### dataset_dir = /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/SongCi
[2023-07-16 15:48:49] - INFO: ### pretrained_model_dir = /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/bert_google_1_L-12_H-768_A-12_cn
[2023-07-16 15:48:49] - INFO: ### train_file_path = /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/SongCi/songci.train.txt
[2023-07-16 15:48:49] - INFO: ### val_file_path = /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/SongCi/songci.valid.txt
[2023-07-16 15:48:49] - INFO: ### test_file_path = /Users/czc/PycharmProjects/stockMarketAnalysis/bertTools/SongCi/songci.test.txt
[2023-07-16 15:48:49] - INFO: ### data_name = model
[202

ValueError: 数据 model 不存在对应的格式化函数，请参考函数 read_wiki(filepath) 实现对应的格式化函数！

# 1.1 分类测试

In [None]:
# ### todo bert 情绪分析，从网上复制的，看看情况怎么样。 这一段的代码更多的是测试，需要提供文本预训练与最终分类训练结果,目前还没测试，后续准备好训练与预训练就开始

# ### https://github.com/rsanshierli/EasyBert/tree/master/Sentiment
# import re
# import torch
# import torch.nn as nn
# import numpy as np
# from pytorch_pretrained import BertModel, BertTokenizer
#
#
# class Config(object):
#
#     """配置参数"""
#     def __init__(self):
#         self.model_name = 'bert'
#         self.class_list = ['中性', '积极', '消极']          # 类别名单
#         self.save_path = './Sentiment/saved_dict/bert.ckpt'        # 模型训练结果
#         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备
#
#         self.require_improvement = 1000                                 # 若超过1000batch效果还没提升，则提前结束训练
#         self.num_classes = len(self.class_list)                         # 类别数
#         self.num_epochs = 3                                             # epoch数
#         self.batch_size = 128                                           # mini-batch大小
#         self.pad_size = 32                                              # 每句话处理成的长度(短填长切)
#         self.learning_rate = 5e-5                                       # 学习率
#         self.bert_path = './bert_pretrain'
#         self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
#         self.hidden_size = 768
#
#
# class Model(nn.Module):
#
#     def __init__(self, config):
#         super(Model, self).__init__()
#         self.bert = BertModel.from_pretrained(config.bert_path)
#         for param in self.bert.parameters():
#             param.requires_grad = True
#         self.fc = nn.Linear(config.hidden_size, config.num_classes)
#
#     def forward(self, x):
#         context = x[0]  # 输入的句子
#         mask = x[2]  # 对padding部分进行mask，和句子一个size，padding部分用0表示，如：[1, 1, 1, 1, 0, 0]
#         _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
#         out = self.fc(pooled)
#         return out
#
#
# PAD, CLS = '[PAD]', '[CLS]'  # padding符号, bert中综合信息符号
#
# def clean(text):
#     # text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:| |$)", " ", text)  # 去除正文中的@和回复/转发中的用户名
#     # text = re.sub(r"\[\S+\]", "", text)  # 去除表情符号
#     # text = re.sub(r"#\S+#", "", text)  # 保留话题内容
#     URL_REGEX = re.compile(
#         r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
#         re.IGNORECASE)
#     text = re.sub(URL_REGEX, "", text)  # 去除网址
#     text = text.replace("转发微博", "")  # 去除无意义的词语
#     text = re.sub(r"\s+", " ", text)  # 合并正文中过多的空格
#     return text.strip()
#
# def load_dataset(data, config):
#     pad_size = config.pad_size
#     contents = []
#     for line in data:
#         lin = clean(line)
#         token = config.tokenizer.tokenize(lin)      # 分词
#         token = [CLS] + token                           # 句首加入CLS
#         seq_len = len(token)
#         mask = []
#         token_ids = config.tokenizer.convert_tokens_to_ids(token)
#
#         if pad_size:
#             if len(token) < pad_size:
#                 mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
#                 token_ids += ([0] * (pad_size - len(token)))
#             else:
#                 mask = [1] * pad_size
#                 token_ids = token_ids[:pad_size]
#                 seq_len = pad_size
#         contents.append((token_ids, int(0), seq_len, mask))
#     return contents
#
# class DatasetIterater(object):
#     def __init__(self, batches, batch_size, device):
#         self.batch_size = batch_size
#         self.batches = batches     # data
#         self.n_batches = len(batches) // batch_size
#         self.residue = False  # 记录batch数量是否为整数
#         if len(batches) % self.n_batches != 0:
#             self.residue = True
#         self.index = 0
#         self.device = device
#
#     def _to_tensor(self, datas):
#         x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
#         y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
#
#         # pad前的长度(超过pad_size的设为pad_size)
#         seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
#         mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
#         return (x, seq_len, mask), y
#
#     def __next__(self):     # 返回下一个迭代器对象，必须控制结束条件
#         if self.residue and self.index == self.n_batches:
#             batches = self.batches[self.index * self.batch_size: len(self.batches)]
#             self.index += 1
#             batches = self._to_tensor(batches)
#             return batches
#
#         elif self.index >= self.n_batches:
#             self.index = 0
#             raise StopIteration
#         else:
#             batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
#             self.index += 1
#             batches = self._to_tensor(batches)
#             return batches
#
#     def __iter__(self):     # 返回一个特殊的迭代器对象，这个迭代器对象实现了 __next__() 方法并通过 StopIteration 异常标识迭代的完成。
#         return self
#
#     def __len__(self):
#         if self.residue:
#             return self.n_batches + 1
#         else:
#             return self.n_batches
#
#
# def build_iterator(dataset, config):
#     iter = DatasetIterater(dataset, 1, config.device)
#     return iter
#
#
# def match_label(pred, config):
#     label_list = config.class_list
#     return label_list[pred]
#
#
# def final_predict(config, model, data_iter):
#     map_location = lambda storage, loc: storage
#     model.load_state_dict(torch.load(config.save_path, map_location=map_location))
#     model.eval()
#     predict_all = np.array([])
#     with torch.no_grad():
#         for texts, _ in data_iter:
#             outputs = model(texts)
#             pred = torch.max(outputs.data, 1)[1].cpu().numpy()
#             pred_label = [match_label(i, config) for i in pred]
#             predict_all = np.append(predict_all, pred_label)
#
#     return predict_all
#
# def main(text):
#     config = Config()
#     model = Model(config).to(config.device)
#     test_data = load_dataset(text, config)
#     test_iter = build_iterator(test_data, config)
#     result = final_predict(config, model, test_iter)
#     for i, j in enumerate(result):
#         print('text:{}'.format(text[i]))
#         print('label:{}'.format(j))
#
#
# if __name__ == '__main__':
#
#     test = ['#你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概是想告诉我2020要量力而行）然鹅这并不影响后续计划一出门立马生龙活虎新年和新??更配哦??看了误杀吃了大餐就让新的一年一直这样美滋滋下去吧??',
#             '大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情绪一上来，容易对孩子说出自己都想不到的话来……2020年，真的要学会控制情绪，管理好家人健康。这是今年最大的目标。?',
#             '还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?',
#             '我太难了别人怎么发烧都没事就我一检查甲型流感?',
#             '果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设导师看到这条微博并给我放一天假呢?']
#     main(test)