In [1]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Log in to the Hugging Face Hub through the CLI.
# NOTE(review): presumably needed so the push_to_hub calls further down can
# upload the datasets - confirm.
!huggingface-cli login

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [None]:
# Mount Google Drive at /content/drive; the couplet text files read later in
# this notebook live under that mount.
from google.colab import drive
drive.mount('/content/drive')

In [5]:
# data downloaded from: https://github.com/wb14123/couplet-dataset

# Anchor the paths at the Drive mount point ('/content/drive') so the files
# resolve regardless of the notebook's current working directory.
DATA_DIR = '/content/drive/My Drive/couplets'

train_input = DATA_DIR + '/in.txt'
train_output = DATA_DIR + '/out.txt'
test_input = DATA_DIR + '/test_in.txt'
test_output = DATA_DIR + '/test_out.txt'

In [6]:
def read_file(file_path):
    """Read a UTF-8 text file into a one-column DataFrame, one row per line.

    Every line is decoded as UTF-8 and all whitespace in it (separators
    between tokens as well as the trailing newline) is removed, so each row
    holds the line's remaining characters joined together. Blank lines are
    kept as empty strings, so row ``i`` always corresponds to line ``i`` of
    the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with a single column labelled ``0``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    # Binary mode plus an explicit decode keeps the newline byte as the only
    # line terminator. The context manager closes the handle even when
    # decoding raises part-way through the file.
    with open(file_path, 'rb') as f:
        data = ["".join(raw_line.decode('utf-8').split()) for raw_line in f]

    return pd.DataFrame(data)

In [7]:
# Load the four text files (train/test x input/output) in one pass; each
# becomes a single-column DataFrame.
train_input_df, train_output_df, test_input_df, test_output_df = [
    read_file(path)
    for path in (train_input, train_output, test_input, test_output)
]

In [8]:
# Inputs and outputs are paired by row position. If the two files of a split
# had different line counts, pd.concat(axis=1) would pad the shorter one with
# NaN instead of failing, so check the pairing explicitly first.
assert len(train_input_df) == len(train_output_df), \
    "train input/output files have different line counts"
assert len(test_input_df) == len(test_output_df), \
    "test input/output files have different line counts"

# Place each input next to its output and give the columns readable names.
train_df = pd.concat([train_input_df, train_output_df], axis=1)
test_df = pd.concat([test_input_df, test_output_df], axis=1)
train_df.columns = ['input', 'output']
test_df.columns = ['input', 'output']

In [9]:
# Preview the first training rows to check that inputs and outputs pair up.
train_df.head()

Unnamed: 0,input,output
0,晚风摇树树还挺,晨露润花花更红
1,愿景天成无墨迹,万方乐奏有于阗
2,丹枫江冷人初去,绿柳堤新燕复来
3,忽忽几晨昏，离别间之，疾病间之，不及终年同静好,茕茕小儿女，孱羸若此，娇憨若此，更烦二老费精神
4,闲来野钓人稀处,兴起高歌酒醉中


In [10]:
# Preview the last test rows; the final index also shows the split's size.
test_df.tail()

Unnamed: 0,input,output
3995,入迷途，吞苦果，回头是岸,到此处，改前非，革面做人
3996,地近秦淮，看碧水蓝天，一行白鹭飞来何处,门临闹市，入红楼翠馆，四海旅人宾至如归
3997,水流知入海,树古自参天
3998,其巧在古倕以上,所居介帝君之间
3999,万众齐心，已膺全国文明市,千帆竞发，再鼓鹭江经济潮


In [11]:
# System prompt (Chinese). In English: "You are an AI assistant that writes
# couplets. The user gives the first line and you write the second line. The
# two lines must have the same number of characters, and their tonal pattern
# and imagery must correspond one to one."
COUPLET_SYSTEM_PROMPT = "你是一个写对联的AI助手，用户会给出上联，需要你来写出下联。请注意上联和下联的字数要一致，并且平仄和意境都要一一对应。"

# Layout of one training example: system prompt, user input and expected
# output wrapped in [INST] / <<SYS>> markers.
# NOTE(review): appears to follow the Llama 2 chat markup - confirm.
TEMPLATE = "<s>[INST] <<SYS>> {system_prompt} <</SYS>> {input} [/INST] {output} </s>"

In [12]:
def format_row(row):
    """Render one couplet pair as a single training string.

    Fills TEMPLATE with COUPLET_SYSTEM_PROMPT and the ``'input'`` and
    ``'output'`` entries of ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``'input'`` and
            ``'output'`` entries.

    Returns:
        The formatted string.
    """
    return TEMPLATE.format(
        system_prompt=COUPLET_SYSTEM_PROMPT,
        input=row['input'],
        output=row['output'],
    )

# Add the formatted training string as a 'text' column on both splits.
for split_df in (train_df, test_df):
    split_df['text'] = split_df.apply(format_row, axis=1)

In [14]:
# Look at the training frame again now that the 'text' column exists.
train_df.head()

Unnamed: 0,input,output,text
0,晚风摇树树还挺,晨露润花花更红,<s>[INST] <<SYS>> 你是一个写对联的AI助手，用户会给出上联，需要你来写出下...
1,愿景天成无墨迹,万方乐奏有于阗,<s>[INST] <<SYS>> 你是一个写对联的AI助手，用户会给出上联，需要你来写出下...
2,丹枫江冷人初去,绿柳堤新燕复来,<s>[INST] <<SYS>> 你是一个写对联的AI助手，用户会给出上联，需要你来写出下...
3,忽忽几晨昏，离别间之，疾病间之，不及终年同静好,茕茕小儿女，孱羸若此，娇憨若此，更烦二老费精神,<s>[INST] <<SYS>> 你是一个写对联的AI助手，用户会给出上联，需要你来写出下...
4,闲来野钓人稀处,兴起高歌酒醉中,<s>[INST] <<SYS>> 你是一个写对联的AI助手，用户会给出上联，需要你来写出下...


In [15]:
# Bundle both frames into one DatasetDict; the test frame is stored as the
# 'validation' split.
ds = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(test_df),
})

In [16]:
# Print the splits with their column names and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 770491
    })
    validation: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 4000
    })
})


In [17]:
# Upload the full dataset (train + validation) to the Hub repository
# "llama2-chinese-couplet-770k".
ds.push_to_hub("llama2-chinese-couplet-770k")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/771 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

## 1k sample

In [18]:
# Shuffle the training split with a fixed seed and keep the first 1000 rows.
# This sample carries a 'train' split only.
shuffled_train = ds['train'].shuffle(seed=42)
sample_ds = DatasetDict({'train': shuffled_train.select(range(1000))})

In [19]:
# Upload the 1000-row sample to the Hub repository "llama2-chinese-couplet-1k".
sample_ds.push_to_hub("llama2-chinese-couplet-1k")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/471 [00:00<?, ?B/s]

## 100k sample

In [20]:
# Fixed-seed shuffle of the training split, first 100000 rows kept; the
# validation split is carried over unchanged.
sample_ds = DatasetDict({
    'train': ds['train'].shuffle(seed=42).select(range(100000)),
    'validation': ds['validation'],
})

In [21]:
# Upload the 100000-row sample (plus validation) to the Hub repository
# "llama2-chinese-couplet-100k".
sample_ds.push_to_hub("llama2-chinese-couplet-100k")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/596 [00:00<?, ?B/s]