In [3]:
import os
import numpy as np
from functools import partial
import pandas as pd
import random

In [7]:
"""
A ## year old ##(female/male), who has ## children and lives in ##. 
## She/he ## smokes/does not smoke, and has body mass index  ##. 
# How much does this person pay for medical insurance? 
"""
def data2text(row, label = True, feature_name = False, columns = None):
    """Render one table row as a "When we have x1=..., x2=..." prompt.

    Position 0 of ``row`` is always skipped, and when ``label`` is true the
    last position is skipped as well (the target is read separately from
    ``row['charges']``). Known categorical strings are replaced by fixed
    integer codes; every other value is used unchanged.

    Parameters
    ----------
    row : pandas.Series
        One record. Values are read by position; the target is read by the
        label ``'charges'``.
    label : bool
        If true, return a JSON line with both prompt and completion.
        If false, return only the prompt followed by ``###``.
    feature_name : bool
        If true, write ``<name>=<value>`` pairs; otherwise ``x<i>=<value>``
        with the value formatted to four decimals.
    columns : sequence of str, optional
        Names used when ``feature_name`` is true, indexed by the same
        position as ``row``. Defaults to the row's own labels.

    Returns
    -------
    str
        The prompt (``label=False``) or a one-line JSON object string.
    """
    # Integer codes for the categorical string values found in the table.
    category_codes = {
        'female': 1, 'male': -1,
        'yes': 1, 'no': -1,
        'southeast': 1, 'northwest': 2, 'southwest': 3, 'northeast': 4,
    }
    if feature_name and columns is None:
        # Fall back to the row's own labels so names and values stay aligned.
        columns = row.index
    prompt = "When we have "
    for i in range(1, len(row) - label):
        # .iloc makes the positional access explicit; plain row[i] on a
        # string-labelled Series relies on a deprecated fallback.
        value = row.iloc[i]
        if isinstance(value, str):
            row_i = category_codes.get(value, value)
        else:
            row_i = value
        if feature_name:
            prompt += "%s=%s, " % (columns[i], row_i)
        else:
            prompt += "x%d=%.4f, " % (i, row_i)
    prompt += "what should be the y value?"
    if not label:
        return "%s###" % prompt
    completion = "%.4f" % row['charges']
    return "{\"prompt\":\"%s###\", \"completion\":\"%s@@@\"}" % (prompt, completion)

def df2jsonl(df, filename, feature_name = False, columns = None):
    """Convert every row of ``df`` with ``data2text`` and write the result to ``filename``.

    Each row becomes one line; lines are joined with a newline and no
    trailing newline is written. ``feature_name`` and ``columns`` are
    forwarded unchanged to ``data2text``.
    """
    row_to_line = partial(data2text, feature_name = feature_name, columns = columns)
    lines = df.apply(func = row_to_line, axis = 1).tolist()
    target_path = os.path.join(filename)
    with open(target_path, 'w') as out_file:
        out_file.write('\n'.join(lines))

def data2text_feature_name(row, integer = False, label = True, shuffle = False):
    """Render one insurance record as an English sentence prompt.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'age'``, ``'sex'``, ``'bmi'``, ``'children'``,
        ``'smoker'`` and ``'region'``; ``'charges'`` is needed only when
        ``label`` is true.
    integer : bool
        Accepted for call compatibility; not used by this function.
    label : bool
        If true, return a JSON line with both prompt and completion.
        If false, return only the prompt followed by ``###``, which is the
        same convention ``data2text`` uses.
    shuffle : bool
        If true, use the sentence variant in which the values are placed in
        slots that do not match their meaning.

    Returns
    -------
    str
        The prompt (``label=False``) or a one-line JSON object string.
    """
    if shuffle:
        # NOTE(review): the value/slot mismatch below looks deliberate (this
        # is the "shuffled feature names" variant) - confirm before "fixing".
        opening = "%s-year-old %s, who has %.4f children, lives in the %d in the U.S." % (row['sex'],row['region'],row['bmi'],row['age'])
        closing = " and has body mass index %d. How much does this person pay for medical insurance?" % (row['children'])
    else:
        # age, gender, children, region in their natural slots
        opening = "%d-year-old %s, who has %d children, lives in the %s in the U.S." % (row['age'],row['sex'],row['children'],row['region'])
        closing = " and has body mass index %.4f. How much does this person pay for medical insurance?" % (row['bmi'])

    # The pronoun and smoking clause are identical in both variants.
    pronoun = " She" if row['sex'] == 'female' else " He"
    smoking = " does not smoke," if row['smoker'] == 'no' else " smokes,"
    prompt = "A " + opening + pronoun + smoking + closing

    if not label:
        # Previously `label` was ignored and the target was always required.
        return "%s###" % prompt
    completion = "%.4f" % row['charges']
    return "{\"prompt\":\"%s###\", \"completion\":\"%s@@@\"}" % (prompt, completion)

def df2jsonl_feature_name(df, filename, integer = False, shuffle = False):
    """Convert every row of ``df`` with ``data2text_feature_name`` and write the result to ``filename``.

    Each row becomes one line; lines are joined with a newline and no
    trailing newline is written. ``integer`` and ``shuffle`` are forwarded
    unchanged to ``data2text_feature_name``.
    """
    row_to_line = partial(data2text_feature_name, integer = integer, shuffle = shuffle)
    lines = df.apply(func = row_to_line, axis = 1).tolist()
    target_path = os.path.join(filename)
    with open(target_path, 'w') as out_file:
        out_file.write('\n'.join(lines))


In [8]:
# split the dataset - with feature names
# Fixed seed so that a fresh Restart & Run All reproduces the same split.
SPLIT_SEED = 123
data = pd.read_csv("./insurance.csv", sep=",")
n = len(data)
print("Number of total samples:",n)
# Seeded permutation of the row positions 0..n-1.
idx = np.random.default_rng(SPLIT_SEED).permutation(n)
num_training = int(.7*n)
print("Number of total training samples:",num_training)

# 70% train / 15% validation / 15% test
train_idx, valid_idx, test_idx = idx[:num_training], idx[num_training:int(.85*n)], idx[int(.85*n):]
# Nested training subsets: each one is a prefix of the full training split.
train_idx_20 = train_idx[:int(0.2*num_training)]
print("Number of 20% training samples:",len(train_idx_20))
train_idx_40 = train_idx[:int(0.4*num_training)]
print("Number of 40% training samples:",len(train_idx_40))
train_idx_60 = train_idx[:int(0.6*num_training)]
print("Number of 60%  training samples:",len(train_idx_60))
train_idx_80 = train_idx[:int(0.8*num_training)]
print("Number of 80%  training samples:",len(train_idx_80))

print("Number of validation samples:",len(valid_idx))
print("Number of testing samples:",len(test_idx))

split_positions = {
    "train_full": train_idx,
    "train_20": train_idx_20,
    "train_40": train_idx_40,
    "train_60": train_idx_60,
    "train_80": train_idx_80,
    "test": test_idx,
    "valid": valid_idx,
}
for split_name, positions in split_positions.items():
    # idx holds row positions, so select with .iloc. to_csv is left at its
    # defaults on purpose: the saved index becomes the first CSV column,
    # and data2text skips position 0 of every row.
    data.iloc[positions].to_csv("./insurance_%s.csv" % split_name, sep=",")

Number of total samples: 1338
Number of total training samples: 936
Number of 20% training samples: 187
Number of 40% training samples: 374
Number of 60%  training samples: 561
Number of 80%  training samples: 748
Number of validation samples: 201
Number of testing samples: 201


In [9]:
# prompts with feature names
# One sentence-style JSONL file per training subset, then test and validation.
for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    df2jsonl_feature_name(train_data, "insurance_fn_%s_train.jsonl" % split_tag)

test_data = pd.read_csv("./insurance_test.csv", sep=",")
df2jsonl_feature_name(test_data, "insurance_fn_test.jsonl")

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
df2jsonl_feature_name(valid_data, "insurance_fn_valid.jsonl")

In [8]:
# prompts with shuffled feature names
# Same files as the sentence-style prompts, but written with shuffle=True.
for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    df2jsonl_feature_name(train_data, "insurance_rd_%s_train.jsonl" % split_tag, shuffle=True)

test_data = pd.read_csv("./insurance_test.csv", sep=",")
df2jsonl_feature_name(test_data, "insurance_rd_test.jsonl", shuffle=True)

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
df2jsonl_feature_name(valid_data, "insurance_rd_valid.jsonl", shuffle=True)

In [10]:
# prompts with = type of feature names
prefix = '=fn='
# data2text looks names up by the same position it reads values from, and the
# split CSVs were saved with their index, so each frame read back here has one
# extra leading column. Passing the frame's own columns keeps every name
# aligned with its value. The columns of the original `data` frame are one
# entry shorter and shifted every name by one position.
for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    df2jsonl(train_data, "insurance%s%s_train.jsonl" % (prefix, split_tag), feature_name = True, columns = train_data.columns)

test_data = pd.read_csv("./insurance_test.csv", sep=",")
df2jsonl(test_data, "insurance%stest.jsonl" % prefix, feature_name = True, columns = test_data.columns)

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
df2jsonl(valid_data, "insurance%svalid.jsonl" % prefix, feature_name = True, columns = valid_data.columns)

In [11]:
# prompts with = random feature names
prefix = '=rd='
# Shuffle only the feature names; the target name stays last.
shuffled_features = data.columns.tolist()[:-1]
random.seed(123)
random.shuffle(shuffled_features)
# data2text reads names by row position and skips position 0, which in the
# split CSVs is the saved index column. A placeholder in slot 0 keeps all
# shuffled names in use and stops 'charges' from labelling a feature.
random_columns = ['Unnamed: 0'] + shuffled_features + ['charges']

for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    df2jsonl(train_data, "insurance%s%s_train.jsonl" % (prefix, split_tag), feature_name = True, columns = random_columns)

test_data = pd.read_csv("./insurance_test.csv", sep=",")
df2jsonl(test_data, "insurance%stest.jsonl" % prefix, feature_name = True, columns = random_columns)

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
df2jsonl(valid_data, "insurance%svalid.jsonl" % prefix, feature_name = True, columns = random_columns)

In [12]:
# prompts without feature names
# Positional "x1=..., x2=..." prompts for every training subset, then test and validation.
for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    df2jsonl(train_data, "insurance_%s_train.jsonl" % split_tag)

test_data = pd.read_csv("./insurance_test.csv", sep=",")
df2jsonl(test_data, "insurance_test.jsonl")

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
df2jsonl(valid_data, "insurance_valid.jsonl")

In [13]:
# convert to numerical data
# Integer codes for each categorical column.
CATEGORY_CODES = {
    'sex': {'female': 1, 'male': -1},
    'smoker': {'yes': 1, 'no': -1},
    'region': {'southeast': 1, 'northwest': 2, 'southwest': 3, 'northeast': 4},
}


def encode_categoricals(frame):
    """Return a copy of ``frame`` with sex/smoker/region strings replaced by integer codes.

    Values without an entry in ``CATEGORY_CODES`` are left unchanged. The
    input frame is not modified.
    """
    encoded = frame.copy()
    for column, codes in CATEGORY_CODES.items():
        # Bind `codes` as a default so each lambda uses its own column's table.
        encoded[column] = frame[column].map(lambda value, codes=codes: codes.get(value, value))
    return encoded


for split_tag in ("full", "20", "40", "60", "80"):
    train_data = pd.read_csv("./insurance_train_%s.csv" % split_tag, sep=",")
    train_data_num = encode_categoricals(train_data)
    train_data_num.to_csv("./insurance_train_%s_num.csv" % split_tag, sep=",")

# The test and validation files are encoded from their OWN frames. Previously
# both sections re-encoded the last training subset and saved it under the
# test/valid file names.
test_data = pd.read_csv("./insurance_test.csv", sep=",")
test_data_num = encode_categoricals(test_data)
test_data_num.to_csv("./insurance_test_num.csv", sep=",")

valid_data = pd.read_csv("./insurance_valid.csv", sep=",")
valid_data_num = encode_categoricals(valid_data)
valid_data_num.to_csv("./insurance_valid_num.csv", sep=",")

In [14]:
# Encode the categorical columns of the test split as integers and save the result.
test_data = pd.read_csv("./insurance_test.csv", sep=",")
test_data_num = test_data.copy()
# First pass on 'sex': anything other than 'female' becomes the string 'False';
# the 'male' rows are then overwritten by the table below.
test_data_num['sex'] = test_data['sex'].apply(lambda x: 1 if x == 'female' else 'False')
# (column, original value, integer code); masks are always taken from the
# untouched test_data frame.
recodings = (
    ('sex', 'female', 1),
    ('sex', 'male', -1),
    ('smoker', 'yes', 1),
    ('smoker', 'no', -1),
    ('region', 'southeast', 1),
    ('region', 'northwest', 2),
    ('region', 'southwest', 3),
    ('region', 'northeast', 4),
)
for column, raw_value, code in recodings:
    test_data_num.loc[test_data[column] == raw_value, column] = code
test_data_num.to_csv("./insurance_test_num.csv", sep=",")