<a href="https://colab.research.google.com/github/Swinden/Study/blob/main/demo2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.1 MB/s[0m eta [36m0:00:0

In [None]:
#导入前置依赖
import os
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
# 用于加载bert模型的分词器
from transformers import AutoTokenizer
# 用于加载bert模型
from transformers import BertModel
from pathlib import Path

In [None]:
batch_size = 16
# 文本的最大长度
text_max_length = 300
# 总训练的epochs数，我只是随便定义了个数
epochs = 20
# 学习率
lr = 5e-6
# 取多少训练集的数据作为验证集
validation_ratio = 0.11
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 每多少步，打印一次loss
log_per_step = 50

# 数据集所在位置
dataset_dir = Path("/content/drive/MyDrive/Colab Notebooks/data1")
os.makedirs(dataset_dir) if not os.path.exists(dataset_dir) else ''

# 模型存储路径
model_dir = Path("/content/drive/MyDrive/Colab Notebooks/model/bert_checkpoints")
# 如果模型目录不存在，则创建一个
os.makedirs(model_dir) if not os.path.exists(model_dir) else ''

print("Device:", device)

Device: cuda


In [None]:
# 读取数据集，进行数据处理

pd_train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data1/train.csv')
pd_train_data['title'] = pd_train_data['title'].fillna('')
pd_train_data['abstract'] = pd_train_data['abstract'].fillna('')

test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data2/testB.csv')
test_data['title'] = test_data['title'].fillna('')
test_data['abstract'] = test_data['abstract'].fillna('')
pd_train_data['text'] = pd_train_data['title'].fillna('') + ' ' +  pd_train_data['author'].fillna('') + ' ' + pd_train_data['abstract'].fillna('')+ ' ' + pd_train_data['Keywords'].fillna('')
test_data['text'] = test_data['title'].fillna('') + ' ' +  test_data['author'].fillna('') + ' ' + test_data['abstract'].fillna('')+ ' ' + pd_train_data['Keywords'].fillna('')


In [None]:
# 从训练集中随机采样测试集
validation_data = pd_train_data.sample(frac=validation_ratio)
train_data = pd_train_data[~pd_train_data.index.isin(validation_data.index)]

In [None]:
# 构建Dataset
class MyDataset(Dataset):

    def __init__(self, mode='train'):
        super(MyDataset, self).__init__()
        self.mode = mode
        # 拿到对应的数据
        if mode == 'train':
            self.dataset = train_data
        elif mode == 'validation':
            self.dataset = validation_data
        elif mode == 'test':
            # 如果是测试模式，则返回内容和uuid。拿uuid做target主要是方便后面写入结果。
            self.dataset = test_data
        else:
            raise Exception("Unknown mode {}".format(mode))

    def __getitem__(self, index):
        # 取第index条
        data = self.dataset.iloc[index]
        # 取其内容
        text = data['text']
        # 根据状态返回内容
        if self.mode == 'test':
            # 如果是test，将uuid做为target
            label = data['uuid']
        else:
            label = data['label']
        # 返回内容和label
        return text, label

    def __len__(self):
        return len(self.dataset)


In [None]:
train_dataset = MyDataset('train')
validation_dataset = MyDataset('validation')

In [None]:
train_dataset.__getitem__(0)

('Accessible Visual Artworks for Blind and Visually Impaired People: Comparing a Multimodal Approach with Tactile Graphics Quero, Luis Cavazos; Bartolome, Jorge Iranzo; Cho, Jundong Despite the use of tactile graphics and audio guides, blind and visually impaired people still face challenges to experience and understand visual artworks independently at art exhibitions. Art museums and other art places are increasingly exploring the use of interactive guides to make their collections more accessible. In this work, we describe our approach to an interactive multimodal guide prototype that uses audio and tactile modalities to improve the autonomous access to information and experience of visual artworks. The prototype is composed of a touch-sensitive 2.5D artwork relief model that can be freely explored by touch. Users can access localized verbal descriptions and audio by performing touch gestures on the surface while listening to themed background music along. We present the design requi

In [None]:
#获取Bert预训练模型
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
#接着构造我们的Dataloader。
#我们需要定义一下collate_fn，在其中完成对句子进行编码、填充、组装batch等动作：
def collate_fn(batch):
    """
    将一个batch的文本句子转成tensor，并组成batch。
    :param batch: 一个batch的句子，例如: [('推文', target), ('推文', target), ...]
    :return: 处理后的结果，例如：
             src: {'input_ids': tensor([[ 101, ..., 102, 0, 0, ...], ...]), 'attention_mask': tensor([[1, ..., 1, 0, ...], ...])}
             target：[1, 1, 0, ...]
    """
    text, label = zip(*batch)
    text, label = list(text), list(label)

    # src是要送给bert的，所以不需要特殊处理，直接用tokenizer的结果即可
    # padding='max_length' 不够长度的进行填充
    # truncation=True 长度过长的进行裁剪
    src = tokenizer(text, padding='max_length', max_length=text_max_length, return_tensors='pt', truncation=True)

    return src, torch.LongTensor(label)

In [None]:
#接着构造我们的Dataloader。
#我们需要定义一下collate_fn，在其中完成对句子进行编码、填充、组装batch等动作：
def collate_fn(batch):
    """
    将一个batch的文本句子转成tensor，并组成batch。
    :param batch: 一个batch的句子，例如: [('推文', target), ('推文', target), ...]
    :return: 处理后的结果，例如：
             src: {'input_ids': tensor([[ 101, ..., 102, 0, 0, ...], ...]), 'attention_mask': tensor([[1, ..., 1, 0, ...], ...])}
             target：[1, 1, 0, ...]
    """
    text, label = zip(*batch)
    text, label = list(text), list(label)

    # src是要送给bert的，所以不需要特殊处理，直接用tokenizer的结果即可
    # padding='max_length' 不够长度的进行填充
    # truncation=True 长度过长的进行裁剪
    src = tokenizer(text, padding='max_length', max_length=text_max_length, return_tensors='pt', truncation=True)

    return src, torch.LongTensor(label)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
inputs, targets = next(iter(train_loader))
print("inputs:", inputs)
print("targets:", targets)

inputs: {'input_ids': tensor([[  101, 26242,  7722,  ...,  1043,  1011,   102],
        [  101,  1996,  2373,  ...,     0,     0,     0],
        [  101,  2079, 18923,  ...,  1006,  1038,   102],
        ...,
        [  101,  6490,  3120,  ...,     0,     0,     0],
        [  101, 10723, 18440,  ...,  2475,  1998,   102],
        [  101,  8382, 11326,  ...,  1016, 12978,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}
targets: tensor([1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1])


In [None]:
#定义预测模型，该模型由bert模型加上最后的预测层组成
class MyModel(nn.Module):

    def __init__(self):
        super(MyModel, self).__init__()

        # 加载bert模型
        self.bert = BertModel.from_pretrained('bert-base-uncased', mirror='tuna')

        # 最后的预测层
        self.predictor = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, src):
        """
        :param src: 分词后的推文数据
        """

        # 将src直接序列解包传入bert，因为bert和tokenizer是一套的，所以可以这么做。
        # 得到encoder的输出，用最前面[CLS]的输出作为最终线性层的输入
        outputs = self.bert(**src).last_hidden_state[:, 0, :]

        # 使用线性层来做最终的预测
        return self.predictor(outputs)


In [None]:
model = MyModel()
model = model.to(device)

In [None]:
#定义出损失函数和优化器。这里使用Binary Cross Entropy：
criteria = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# 由于inputs是字典类型的，定义一个辅助函数帮助to(device)
def to_device(dict_tensors):
    result_tensors = {}
    for key, value in dict_tensors.items():
        result_tensors[key] = value.to(device)
    return result_tensors

In [None]:
#定义一个验证方法，获取到验证集的精准率和loss
def validate():
    model.eval()
    total_loss = 0.
    total_correct = 0
    for inputs, targets in validation_loader:
        inputs, targets = to_device(inputs), targets.to(device)
        outputs = model(inputs)
        loss = criteria(outputs.view(-1), targets.float())
        total_loss += float(loss)

        correct_num = (((outputs >= 0.5).float() * 1).flatten() == targets).sum()
        total_correct += correct_num

    return total_correct / len(validation_dataset), total_loss / len(validation_dataset)

In [None]:
# 首先将模型调成训练模式
model.train()

# 清空一下cuda缓存
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 定义几个变量，帮助打印loss
total_loss = 0.
# 记录步数
step = 0

# 记录在验证集上最好的准确率
best_accuracy = 0

# 开始训练
for epoch in range(epochs):
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        # 从batch中拿到训练数据
        inputs, targets = to_device(inputs), targets.to(device)
        # 传入模型进行前向传递
        outputs = model(inputs)
        # 计算损失
        loss = criteria(outputs.view(-1), targets.float())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += float(loss)
        step += 1

        if step % log_per_step == 0:
            print("Epoch {}/{}, Step: {}/{}, total loss:{:.4f}".format(epoch+1, epochs, i, len(train_loader), total_loss))
            total_loss = 0

        del inputs, targets

    # 一个epoch后，使用过验证集进行验证
    accuracy, validation_loss = validate()
    print("Epoch {}, accuracy: {:.4f}, validation loss: {:.4f}".format(epoch+1, accuracy, validation_loss))
    # torch.save(model, model_dir / f"model_{epoch}.pt")

    # 保存最好的模型
    if accuracy > best_accuracy:
        torch.save(model, model_dir / f"model_best.pt")
        best_accuracy = accuracy

Epoch 1/20, Step: 49/334, total loss:28.7035
Epoch 1/20, Step: 99/334, total loss:17.0512
Epoch 1/20, Step: 149/334, total loss:9.9987
Epoch 1/20, Step: 199/334, total loss:7.4437
Epoch 1/20, Step: 249/334, total loss:5.9816
Epoch 1/20, Step: 299/334, total loss:4.4325
Epoch 1, accuracy: 0.9758, validation loss: 0.0047
Epoch 2/20, Step: 15/334, total loss:3.9038
Epoch 2/20, Step: 65/334, total loss:2.7194
Epoch 2/20, Step: 115/334, total loss:4.1560
Epoch 2/20, Step: 165/334, total loss:2.8484
Epoch 2/20, Step: 215/334, total loss:2.6513
Epoch 2/20, Step: 265/334, total loss:1.5685
Epoch 2/20, Step: 315/334, total loss:2.0519
Epoch 2, accuracy: 0.9924, validation loss: 0.0021
Epoch 3/20, Step: 31/334, total loss:1.6185
Epoch 3/20, Step: 81/334, total loss:1.7487
Epoch 3/20, Step: 131/334, total loss:1.7981
Epoch 3/20, Step: 181/334, total loss:2.0413
Epoch 3/20, Step: 231/334, total loss:1.4561
Epoch 3/20, Step: 281/334, total loss:1.5774
Epoch 3/20, Step: 331/334, total loss:3.1136
Ep

In [None]:
#加载最好的模型，然后进行测试集的预测
model = torch.load(model_dir / f"model_best.pt")
model = model.eval()

In [None]:
test_dataset = MyDataset('test')
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
results = []
for inputs, ids in test_loader:
    outputs = model(inputs.to(device))
    outputs = (outputs >= 0.5).int().flatten().tolist()
    ids = ids.tolist()
    results = results + [(id, result) for result, id in zip(outputs, ids)]

In [None]:
test_label = [pair[1] for pair in results]
test_data['label'] = test_label
test_data[['uuid', 'label']].to_csv('/content/drive/MyDrive/Colab Notebooks/submit_task1.csv', index=None)

In [None]:
%pip install sentence-transformers




In [None]:
# 导入pandas用于读取表格数据
import pandas as pd

# 导入BOW（词袋模型），可以选择将CountVectorizer替换为TfidfVectorizer（TF-IDF（词频-逆文档频率）），注意上下文要同时修改，亲测后者效果更佳
from sklearn.feature_extraction.text import TfidfVectorizer
# 导入Bert模型
from sentence_transformers import SentenceTransformer

# 导入计算相似度前置库，为了计算候选者和文档之间的相似度，我们将使用向量之间的余弦相似度，因为它在高维度下表现得相当好。
from sklearn.metrics.pairwise import cosine_similarity

# 过滤警告消息
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


In [None]:
# # 读取数据集
# test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data2/testB.csv')

test = test_data
test['title'] = test['title'].fillna('')
test['abstract'] = test['abstract'].fillna('')

test['text'] = test['title'].fillna('') + ' ' +test['abstract'].fillna('')

In [None]:
test_data

Unnamed: 0,uuid,title,author,abstract,text,label
0,0,Tobacco Consumption and High-Sensitivity Cardi...,"Julia Brox Skranes,Magnus Nakrem Lyngbakken,Kr...",Background Cardiac troponins represent a sensi...,Tobacco Consumption and High-Sensitivity Cardi...,1
1,1,Approaching towards sustainable supply chain u...,"Mohammad Reza Seddigh,Sajjad Shokouhyar,Fateme...",These two main objectives of this study are to...,Approaching towards sustainable supply chain u...,1
2,2,Does globalization matter for ecological footp...,"Kirikkaleli, Dervis; Adebayo, Tomiwa Sunday; K...",The main aim of this paper is to explore the r...,Does globalization matter for ecological footp...,1
3,3,Myths and Misconceptions About University Stud...,"Megan Paull,Kirsten Holmes,Maryam Omari,Debbie...",This paper examines myths and misconceptions a...,Myths and Misconceptions About University Stud...,1
4,4,Antioxidant Status of Rat Liver Mitochondria u...,"S I Khizrieva,R A Khalilov,A M Dzhafarova,V R ...",For evaluation of the contribution of the anti...,Antioxidant Status of Rat Liver Mitochondria u...,1
...,...,...,...,...,...,...
1995,1995,The treatment of veterinary antibiotics in swi...,"Qian, Mengcheng; Yang, Linyan; Chen, Xingkui; ...",Elevated concentrations and potential toxiciti...,The treatment of veterinary antibiotics in swi...,1
1996,1996,Socio-political efficacy explains increase in ...,"Taciano L Milfont,Danny Osborne,Chris G Sibley...",The ongoing COVID-19 pandemic claimed millions...,Socio-political efficacy explains increase in ...,1
1997,1997,Investigation of early puberty prevalence and ...,"Esin Gizem Olgun,Sirmen Kizilcan Cetin,Zeynep ...",We aimed to determine the prevalence of early ...,Investigation of early puberty prevalence and ...,1
1998,1998,From 3D printing to 3D bioprinting: the materi...,"Nihal Engin Vrana,Sharda Gupta,Kunal Mitra,Alb...",The application of 3D printing technologies fi...,From 3D printing to 3D bioprinting: the materi...,1


In [None]:
# 定义停用词，去掉出现较多，但对文章不关键的词语
stops_ =[i.strip() for i in open(r'/content/drive/MyDrive/Colab Notebooks/stop.txt',encoding='utf-8').readlines()]
stops_

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'t",
 "'ve",
 'ZT',
 'ZZ',
 'a',
 "a's",
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks',
 'associated',
 'at',
 'auth',
 'available',
 'away',
 'awfully',
 'b',
 'back',
 'backed',
 'backing',
 'backs',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming

In [None]:

import nltk

# Set the NLTK data path
nltk.data.path.append('/content/sample_data/nltk_data/corpora/stopwords')

import nltk
from nltk.corpus import stopwords

# 下载停用词数据（如果你还没有下载过）
nltk.download('all')
# 定义停用词，去掉出现较多，但对文章不关键的词语
stops__ =  set(stopwords.words('english'))


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [None]:
stops = list(stops__)+ stops_
len(set(stops))

950

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# 引入分词器
from nltk import word_tokenize, ngrams
# 定义方法按照词频筛选关键词

def extract_keywords_by_freq(title, abstract):
    ngrams_count = list(ngrams(word_tokenize(title.lower()), 2)) + list(ngrams(word_tokenize(abstract.lower()), 2))
    ngrams_count = pd.DataFrame(ngrams_count)
    ngrams_count = ngrams_count[~ngrams_count[0].isin(stops)]
    ngrams_count = ngrams_count[~ngrams_count[1].isin(stops)]
    ngrams_count = ngrams_count[ngrams_count[0].apply(len) > 3]
    ngrams_count = ngrams_count[ngrams_count[1].apply(len) > 3]
    # Assuming ngrams_count is your DataFrame with two columns: 0 and 1
    ngrams_count['phrase'] = ngrams_count.loc[:, 0] + ' ' + ngrams_count.loc[:, 1]

    ngrams_count = ngrams_count['phrase'].value_counts()
    ngrams_count = ngrams_count[ngrams_count > 1]
    return list(ngrams_count.index)[:5]

## 对测试集提取关键词

test_words = []
for row in test.iterrows():
    # 读取第每一行数据的标题与摘要并提取关键词
    prediction_keywords = extract_keywords_by_freq(row[1].title, row[1].abstract)
    # 利用文章标题进一步提取关键词
    prediction_keywords = [x.title() for x in prediction_keywords]
    # 如果未能提取到关键词
    if len(prediction_keywords) == 0:
        prediction_keywords = ['A', 'B']
    test_words.append('; '.join(prediction_keywords))

# test['Keywords'] = test_words
# test[['uuid', 'Keywords', 'label']].to_csv('submit_task2.csv', index=None)

In [None]:
test['Keywords'] = test_words
test[['uuid', 'Keywords', 'label']].to_csv('/content/drive/MyDrive/Colab Notebooks/submit_task2.csv', index=None)

In [None]:
!git clone https://github.com/KMnO4-zx/huanhuan-chat.git

Cloning into 'huanhuan-chat'...
remote: Enumerating objects: 1778, done.[K
remote: Counting objects: 100% (241/241), done.[K
remote: Compressing objects: 100% (165/165), done.[K
remote: Total 1778 (delta 86), reused 215 (delta 71), pack-reused 1537[K
Receiving objects: 100% (1778/1778), 176.34 MiB | 16.71 MiB/s, done.
Resolving deltas: 100% (91/91), done.
Updating files: 100% (1564/1564), done.


In [None]:
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data1/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data2/testB.csv')

In [None]:
res = []

for i in range(len(train_df)):
    paper_item = train_df.loc[i]
    tmp = {
    "instruction": "Please judge whether it is a medical field paper according to the given paper title and abstract, output 1 or 0, the following is the paper title, author and abstract -->",
    "input": f"title:{paper_item[1]},abstract:{paper_item[3]}",
    "output": str(paper_item[5])
  }
    res.append(tmp)

import json

with open('paper_label.json', mode='w', encoding='utf-8') as f:
    json.dump(res, f, ensure_ascii=False, indent=4)

In [None]:
!cd ./huanhuan-chat&&pip install -r requirements.txt

Collecting transformers>=4.27.4 (from -r requirements.txt (line 2))
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.10.0 (from -r requirements.txt (line 3))
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.19.0 (from -r requirements.txt (line 4))
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft>=0.3.0 (from -r requirements.txt (line 5))
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl

In [None]:
!git clone https://huggingface.co/THUDM/chatglm2-6b

Cloning into 'chatglm2-6b'...
remote: Enumerating objects: 167, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 167 (delta 41), reused 15 (delta 15), pack-reused 101[K
Receiving objects: 100% (167/167), 1.94 MiB | 24.18 MiB/s, done.
Resolving deltas: 100% (82/82), done.
Filtering content: 100% (8/8), 11.63 GiB | 38.02 MiB/s, done.


In [None]:
!cd ./huanhuan-chat&&sh xfg_train.sh

Traceback (most recent call last):
  File "/content/huanhuan-chat/src/train_bash.py", line 21, in <module>
    main()
  File "/content/huanhuan-chat/src/train_bash.py", line 6, in main
    model_args, data_args, training_args, finetuning_args, general_args = get_train_args()
  File "/content/huanhuan-chat/src/pet/core/parse.py", line 35, in get_train_args
    model_args, data_args, training_args, finetuning_args, general_args = parser.parse_args_into_dataclasses()
  File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses
    obj = dtype(**inputs)
  File "<string>", line 117, in __init__
  File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1376, in __post_init__
    raise ValueError(
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.
