In [22]:
import os
from datasets import Dataset

# Configuration shared by the cells below: where the text files live, the
# task instruction attached to every example, and the prompt layout.
data_folder = "./13F 3.0/13F/all"

# One instruction string is reused for all examples; the column headers are
# listed comma-separated with no spaces.
instruction = (
    "Extract the data from the information table with these as headers:"
    "FileNumber,SecurityDescription,Class,CUSIP,Shares,MarketValue(in$1000),"
    "InvestmentDiscretion,VotingAuthority(Sole),VotingAuthority(Shared),VotingAuthority(None)"
)

# Alpaca-style template with three positional slots, filled in order with
# (instruction, input, response) via str.format.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence marker appended to every formatted training text.
# NOTE(review): this is a hard-coded placeholder; presumably it should be the
# tokenizer's own token (tokenizer.eos_token) once a tokenizer is loaded - confirm.
EOS_TOKEN = "|EOS|"
# Function to read the content of a text file

In [27]:
def read_text_file(file_path, encoding="utf-8", errors="replace"):
    """Return the contents of a text file with surrounding whitespace removed.

    The file is decoded with an explicit encoding so the resulting text does
    not depend on the platform's default locale encoding (which differs
    between, for example, Windows and Linux).

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.
        errors: Decoding error policy passed to ``open``. The default
            ``"replace"`` substitutes U+FFFD for undecodable bytes instead of
            raising ``UnicodeDecodeError``; pass ``"strict"`` to fail loudly.

    Returns:
        The decoded file content with leading and trailing whitespace stripped.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding=encoding, errors=errors) as file:
        return file.read().strip()

# Function to find the corresponding output file for an input file
def find_output_file(input_file, folder_path):
    """Locate the expected-output file that accompanies an input file.

    The companion is named after the input file's stem with ``_out.txt``
    appended (``one.txt`` -> ``one_out.txt``) and is looked up in
    ``folder_path``.

    Args:
        input_file: File name of the input document.
        folder_path: Directory in which to look for the companion file.

    Returns:
        The joined path of the companion file if it exists as a regular
        file, otherwise ``None``.
    """
    stem, _extension = os.path.splitext(input_file)
    candidate = os.path.join(folder_path, "{}_out.txt".format(stem))
    if not os.path.isfile(candidate):
        return None
    return candidate

# Function to load and format the dataset
def load_and_format_dataset(folder_path, instruction):
    """Build a Dataset from paired ``<name>.txt`` / ``<name>_out.txt`` files.

    Directory entries are visited in sorted order. Every ``.txt`` file that
    is not itself an ``_out.txt`` file is treated as an input; its path and
    the path of its companion output (or ``None``) are printed. Inputs
    without a companion output file are skipped.

    Args:
        folder_path: Directory containing the input and output text files.
        instruction: Instruction string repeated for every example.

    Returns:
        A ``Dataset`` with the columns ``instruction``, ``input`` and
        ``output``, one row per matched input/output pair.
    """
    pairs = []

    for entry in sorted(os.listdir(folder_path)):
        is_input = entry.endswith('.txt') and not entry.endswith('_out.txt')
        if not is_input:
            continue

        source_path = os.path.join(folder_path, entry)
        target_path = find_output_file(entry, folder_path)
        print(source_path)
        print(target_path)

        # Skip inputs that have no companion output (or are not regular files).
        if not target_path or not os.path.isfile(source_path):
            continue

        pairs.append((read_text_file(source_path), read_text_file(target_path)))

    return Dataset.from_dict({
        'instruction': [instruction] * len(pairs),
        'input': [source_text for source_text, _ in pairs],
        'output': [target_text for _, target_text in pairs]
    })

# Define the formatting function
def formatting_prompts_func(examples):
    """Render a batch of examples into full prompt strings.

    Each (instruction, input, output) triple is substituted into the
    module-level ``alpaca_prompt`` template and ``EOS_TOKEN`` is appended.

    Args:
        examples: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"``, each holding a sequence of strings of equal length
            (the shape ``Dataset.map`` passes when ``batched=True``).

    Returns:
        A dict with a single key ``"text"`` holding the rendered strings,
        one per example.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in columns
        ]
    }

# Build the raw dataset (instruction / input / output columns) from the
# paired text files found in data_folder.
dataset = load_and_format_dataset(data_folder, instruction)

# Add a "text" column holding the fully rendered prompt for each example.
# batched=True hands formatting_prompts_func whole columns as lists.
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Print the first formatted example to verify the result
# (raises if no input/output pair was found and the dataset is empty).
print(formatted_dataset[0])

./13F 3.0/13F/all\one.txt
./13F 3.0/13F/all\one_out.txt
./13F 3.0/13F/all\two.txt
./13F 3.0/13F/all\two_out.txt


Map: 100%|██████████| 2/2 [00:00<00:00, 266.36 examples/s]

{'instruction': 'Extract the data from the information table with these as headers:FileNumber,SecurityDescription,Class,CUSIP,Shares,MarketValue(in$1000),InvestmentDiscretion,VotingAuthority(Sole),VotingAuthority(Shared),VotingAuthority(None)', 'input': '<DOCUMENT>\n<TYPE>13F-HR\n<SEQUENCE>1\n<FILENAME>bkd2q09.txt\n<DESCRIPTION>BDK WEALTH ADVISORS LLC\n<TEXT>\n                                  UNITED STATES\n                       SECURITIES AND EXCHANGE COMMISSION\n                            Washington, D.C.  20549\n\n                                    Form 13F\n\n                              Form 13F COVER PAGE\n\nReport for the Calendar Year or Quarter Ended: June 30, 2009\n\nCheck here if Amendment [  ]; Amendment Number:\nThis Amendment (Check only one.): [  ] is a restatement.\n                                  [  ] adds new holdings entries.\n\nInstitutional Investment Manager Filing this Report:\n\nName:    BKD Wealth Advisors, LLC\nAddress: 1700 Lincoln Street, Suite 1450\n




In [28]:
# Inspect the first example of the raw (unformatted) dataset.
dataset[0]

{'instruction': 'Extract the data from the information table with these as headers:FileNumber,SecurityDescription,Class,CUSIP,Shares,MarketValue(in$1000),InvestmentDiscretion,VotingAuthority(Sole),VotingAuthority(Shared),VotingAuthority(None)',
 'input': '<DOCUMENT>\n<TYPE>13F-HR\n<SEQUENCE>1\n<FILENAME>bkd2q09.txt\n<DESCRIPTION>BDK WEALTH ADVISORS LLC\n<TEXT>\n                                  UNITED STATES\n                       SECURITIES AND EXCHANGE COMMISSION\n                            Washington, D.C.  20549\n\n                                    Form 13F\n\n                              Form 13F COVER PAGE\n\nReport for the Calendar Year or Quarter Ended: June 30, 2009\n\nCheck here if Amendment [  ]; Amendment Number:\nThis Amendment (Check only one.): [  ] is a restatement.\n                                  [  ] adds new holdings entries.\n\nInstitutional Investment Manager Filing this Report:\n\nName:    BKD Wealth Advisors, LLC\nAddress: 1700 Lincoln Street, Suite 1450\

In [29]:
# Inspect the second example of the raw (unformatted) dataset.
dataset[1]

{'instruction': 'Extract the data from the information table with these as headers:FileNumber,SecurityDescription,Class,CUSIP,Shares,MarketValue(in$1000),InvestmentDiscretion,VotingAuthority(Sole),VotingAuthority(Shared),VotingAuthority(None)',
 'input': 'f<DOCUMENT>\n<TYPE>13F-HR\n<SEQUENCE>1\n<FILENAME>bkd2q09.txt\n<DESCRIPTION>BDK WEALTH ADVISORS LLC\n<TEXT>\n                                  UNITED STATES\n                       SECURITIES AND EXCHANGE COMMISSION\n                            Washington, D.C.  20549\n\n                                    Form 13F\n\n                              Form 13F COVER PAGE\n\nReport for the Calendar Year or Quarter Ended: June 30, 2009\n\nCheck here if Amendment [  ]; Amendment Number:\nThis Amendment (Check only one.): [  ] is a restatement.\n                                  [  ] adds new holdings entries.\n\nInstitutional Investment Manager Filing this Report:\n\nName:    BKD Wealth Advisors, LLC\nAddress: 1700 Lincoln Street, Suite 1450

In [16]:
# Show the dataset summary (columns and row count), then the rendered
# prompt text of the first example.
print(formatted_dataset)
print(formatted_dataset[0]['text'])

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 2
})
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Extract the data from the information table with these as headers:FileNumber,SecurityDescription,Class,CUSIP,Shares,MarketValue(in$1000),InvestmentDiscretion,VotingAuthority(Sole),VotingAuthority(Shared),VotingAuthority(None)

### Input:
<DOCUMENT>
<TYPE>13F-HR
<SEQUENCE>1
<FILENAME>bkd2q09.txt
<DESCRIPTION>BDK WEALTH ADVISORS LLC
<TEXT>
                                  UNITED STATES
                       SECURITIES AND EXCHANGE COMMISSION
                            Washington, D.C.  20549

                                    Form 13F

                              Form 13F COVER PAGE

Report for the Calendar Year or Quarter Ended: June 30, 2009

Check here if Amendment [  ]; Amendment Number:
This Amendment (Check only one.