In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
import datasets
import json
import random


def format_review(df):
    """Turn raw tweet rows into instruction-style sentiment records.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column and an 'airline_sentiment' column.
        Any other columns are ignored. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` and the columns
        ['instruction', 'input', 'output', 'history'], where:
        * 'input' holds the original 'text' values,
        * 'output' holds the sentiment rewritten as a lettered choice
          ('negative' -> 'A. negative', 'neutral' -> 'B. neutral',
          'positive' -> 'C. positive'); other values are left as they are,
        * 'instruction' is the same fixed prompt on every row,
        * 'history' is an empty list on every row.
    """
    label_to_choice = {
        'negative': 'A. negative',
        'neutral': 'B. neutral',
        'positive': 'C. positive'
    }
    prompt = 'What is the sentiment of this tweet? Please choose an answer from {A. negative; B. neutral; C. positive}.'

    records = (
        df[['text', 'airline_sentiment']]
        .rename(columns={'text': 'input', 'airline_sentiment': 'output'})
        .assign(
            output=lambda frame: frame['output'].replace(label_to_choice),
            instruction=prompt,
            # Build one separate empty list per row, aligned on the frame's
            # own index so non-default indexes line up correctly.
            history=lambda frame: pd.Series(
                [[] for _ in range(len(frame))], index=frame.index
            ),
        )
    )
    return records[['instruction', 'input', 'output', 'history']]

### Load review data


In [43]:
# Cap on how many formatted review training rows are kept; the review cell
# below slices the formatted training frame with this value.
max_sample = 1000

In [44]:
# Read the raw tweet CSV. Rows the parser cannot handle are skipped
# (on_bad_lines='skip'), so the loaded frame may have fewer rows than the file.
csv_file_path = '../data/flight_review/Tweets.csv'
df = pd.read_csv(
    csv_file_path,
    quotechar='"',
    escapechar='\\',
    engine='python',
    on_bad_lines='skip',
)

# 80/20 train/test split with a fixed seed so the partition is repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Only the training portion is capped at `max_sample` rows; the test portion
# is kept in full.
train_df = format_review(train_df).iloc[:max_sample]
test_df = format_review(test_df)

train_json_path = '../data/flight_review/train.json'
test_json_path = '../data/flight_review/test.json'

# Write each split as one indented JSON array of records, keeping non-ASCII
# characters unescaped.
for split_frame, split_path in ((train_df, train_json_path), (test_df, test_json_path)):
    split_frame.to_json(split_path, orient='records', force_ascii=False, lines=False, indent=2)

### Load insurance Q&A data


In [45]:
# Load the 'Ddream-ai/InsuranceCorpus' dataset through `datasets.load_dataset`
# (presumably the Hugging Face `datasets` library — confirm).
insurance_ds = datasets.load_dataset('Ddream-ai/InsuranceCorpus')
# Bare last expression so the notebook displays the loaded splits and features.
insurance_ds

DatasetDict({
    train: Dataset({
        features: ['咨询', '回复'],
        num_rows: 3599
    })
    validation: Dataset({
        features: ['咨询', '回复'],
        num_rows: 189
    })
})

In [46]:
# Number of rows taken from the insurance train split by the export loop below.
# NOTE: this rebinds `max_sample`, which was set to 1000 earlier for the review
# data; re-running the review cell after this one would slice with 3000 instead.
max_sample = 3000

In [47]:
# Convert the leading rows of the insurance train split into instruction-style
# records: the '咨询' field becomes the instruction, the '回复' field becomes
# the output, and 'input' / 'history' are left empty.
insurance_train = insurance_ds['train']

# Clamp the requested count to the split size so that a `max_sample` larger
# than the split does not raise IndexError part-way through the loop.
n_rows = min(max_sample, len(insurance_train))

data_to_save = []
for i in range(n_rows):
    line = insurance_train[i]
    formatted_line = {
        "instruction": line['咨询'],
        "input": "",
        "output": line['回复'],
        "history": []
    }
    data_to_save.append(formatted_line)

# Save as a single indented JSON array; ensure_ascii=False keeps the Chinese
# text readable in the file instead of \u escapes.
insuranceQA = '../data/InsuranceCorpus.json'
with open(insuranceQA, 'w', encoding='utf-8') as f:
    json.dump(data_to_save, f, ensure_ascii=False, indent=2)

### Combine and shuffle the two datasets


In [48]:
# Reload both exported record lists from disk so this cell works from the
# saved files rather than from in-memory frames.
with open(train_json_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(insuranceQA, 'r', encoding='utf-8') as f:
    insuranceQA_data = json.load(f)

combined_data = train_data + insuranceQA_data

# Shuffle with a dedicated, seeded generator so that re-running the notebook
# produces the same record order. The seed matches the random_state used for
# the train/test split. A private Random instance leaves the global `random`
# state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(combined_data)
dataset = pd.DataFrame(combined_data)

# NOTE: lines=True writes one JSON object per line (JSON Lines), unlike the
# indented JSON arrays written for the individual datasets, even though the
# file name ends in .json.
output_file_path = '../data/QA_sentiment.json'
dataset.to_json(output_file_path, orient='records', force_ascii=False, lines=True)