In [1]:
import os
import numpy as np
from functools import partial
import pandas as pd
import random

In [2]:
def data2text(row, integer = False, label = True):
    prompt = "When we have " 
    for i in range(1,len(row)-label):
        if integer:
            prompt += "x%d=%d, " % (i, row[i])
        else:
            prompt += "x%d=%.4f, " % (i, row[i]) 
#     print(prompt)
    prompt += "what should be the y value?"
    if not label:
        return "%s###" % prompt
    else:
        if integer:
            completion = "%d" % row['PE']
        else:
            completion = "%.4f" % row['PE']
        return "{\"prompt\":\"%s###\", \"completion\":\"%s@@@\"}" % (prompt, completion)

def df2jsonl(df, filename, integer = False):
    jsonl = '\n'.join(df.apply(func = partial(data2text, integer = integer), axis = 1).tolist())
    with open(os.path.join(filename), 'w') as f:
        f.write(jsonl)

In [3]:
"""
When the air temperature is ## celsius degree, 
ambient pressure is ## millibar,
relative humidity is ##%,
and exhaust vacuum is ## cm Hg,
what's the net hourly electrical energy output?

The net hourly electrical energy output is ##
"""
def data2text_feature_name(row, integer = False, label = True):
    prompt = "When the air temperature is %.4f celsius degree, " % row['AT']
    prompt += "ambient pressure is %.4f millibar, " % row['AP']
    prompt += "relative humidity is %.4f percent, " % row['RH']
    prompt += "and exhaust vacuum is %.4f cm Hg, " % row['V']
    prompt += "what's the net hourly electrical energy output in MW?"
    
    completion = "%.4f" % row['PE']
    return "{\"prompt\":\"%s###\", \"completion\":\"%s@@@\"}" % (prompt, completion)

def df2jsonl_feature_name(df, filename, integer = False):
    jsonl = '\n'.join(df.apply(func = partial(data2text_feature_name, integer = integer), axis = 1).tolist())
    with open(os.path.join(filename), 'w') as f:
        f.write(jsonl)

In [4]:
# split the dataset - with feature names
data = pd.read_csv("./ccpp_all.csv", sep=",")
n = len(data)
print("Number of total samples:",n)
idx = np.arange(n)
random.shuffle(idx)
num_training = int(.7*n)
print("Number of total training samples:",num_training)

train_idx, valid_idx, test_idx = idx[:int(.7*n)], idx[int(.7*n):int(.85*n)], idx[int(.85*n):]
train_idx_20 = train_idx[:int(0.2*num_training)]
print("Number of 20% training samples:",len(train_idx_20))
train_idx_40 = train_idx[:int(0.4*num_training)]
print("Number of 40% training samples:",len(train_idx_40))
train_idx_60 = train_idx[:int(0.6*num_training)]
print("Number of 60%  training samples:",len(train_idx_60))
train_idx_80 = train_idx[:int(0.6*num_training)]
print("Number of 80%  training samples:",len(train_idx_80))

print("Number of validation samples:",len(valid_idx))
print("Number of testing samples:",len(test_idx))

data.loc[train_idx].to_csv("./ccpp_train_full.csv", sep=",")
data.loc[train_idx_20].to_csv("./ccpp_train_20.csv", sep=",")
data.loc[train_idx_40].to_csv("./ccpp_train_40.csv", sep=",")
data.loc[train_idx_60].to_csv("./ccpp_train_60.csv", sep=",")
data.loc[train_idx_80].to_csv("./ccpp_train_80.csv", sep=",")
data.loc[test_idx].to_csv("./ccpp_test.csv", sep=",")
data.loc[valid_idx].to_csv("./ccpp_valid.csv", sep=",")

Number of total samples: 9568
Number of total training samples: 6697
Number of 20% training samples: 1339
Number of 40% training samples: 2678
Number of 60%  training samples: 4018
Number of 80%  training samples: 4018
Number of validation samples: 1435
Number of testing samples: 1436


In [5]:
# prompts with feature names
train_data = pd.read_csv("./ccpp_train_full.csv", sep=",")
df2jsonl_feature_name(train_data,"ccpp_fn_full_train.jsonl")

train_data = pd.read_csv("./ccpp_train_20.csv", sep=",")
df2jsonl_feature_name(train_data,"ccpp_fn_20_train.jsonl")

train_data = pd.read_csv("./ccpp_train_40.csv", sep=",")
df2jsonl_feature_name(train_data,"ccpp_fn_40_train.jsonl")

train_data = pd.read_csv("./ccpp_train_60.csv", sep=",")
df2jsonl_feature_name(train_data,"ccpp_fn_60_train.jsonl")

train_data = pd.read_csv("./ccpp_train_80.csv", sep=",")
df2jsonl_feature_name(train_data,"ccpp_fn_80_train.jsonl")

test_data = pd.read_csv("./ccpp_test.csv", sep=",")
df2jsonl_feature_name(test_data,"ccpp_fn_test.jsonl")

valid_data = pd.read_csv("./ccpp_valid.csv", sep=",")
df2jsonl_feature_name(valid_data,"ccpp_fn_valid.jsonl")

In [6]:
# prompts without feature names
train_data = pd.read_csv("./ccpp_train_full.csv", sep=",")
df2jsonl(train_data,"ccpp_full_train.jsonl")

train_data = pd.read_csv("./ccpp_train_20.csv", sep=",")
df2jsonl(train_data,"ccpp_20_train.jsonl")

train_data = pd.read_csv("./ccpp_train_40.csv", sep=",")
df2jsonl(train_data,"ccpp_40_train.jsonl")

train_data = pd.read_csv("./ccpp_train_60.csv", sep=",")
df2jsonl(train_data,"ccpp_60_train.jsonl")

train_data = pd.read_csv("./ccpp_train_80.csv", sep=",")
df2jsonl(train_data,"ccpp_80_train.jsonl")

test_data = pd.read_csv("./ccpp_test.csv", sep=",")
df2jsonl(test_data,"ccpp_test.jsonl")

valid_data = pd.read_csv("./ccpp_valid.csv", sep=",")
df2jsonl(valid_data,"ccpp_valid.jsonl")

In [7]:
# convert to numerical data
# Nothing to convert

In [8]:
# train_data = pd.read_csv("./ccpp_train.csv", sep=",")
# df2jsonl(train_data,"ccpp_train.jsonl")

# test_data = pd.read_csv("./ccpp_test.csv", sep=",")
# df2jsonl(test_data,"ccpp_test.jsonl")

# valid_data = pd.read_csv("./ccpp_valid.csv", sep=",")
# df2jsonl(valid_data,"ccpp_valid.jsonl")