In [1]:
import csv
import json

# Container for the converted rows; "messages" collects one entry per CSV row.
data = {"messages": []}

# Each CSV row becomes a message whose role is taken from the "Title" column
# and whose content is taken from the "Content" column.
with open("Finance-Data.csv", newline="", encoding="utf-8") as source:
    data["messages"].extend(
        {"role": record["Title"], "content": record["Content"]}
        for record in csv.DictReader(source)
    )

# Persist everything as a single JSON document, indented by four spaces.
with open("output.json", "w", encoding="utf-8") as sink:
    sink.write(json.dumps(data, indent=4))

print("JSON file created successfully!")

JSON file created successfully!


In [9]:
import csv
import json

# Build one {"role", "content"} message per CSV row
# (role from the "Title" column, content from the "Content" column).
messages = []
with open("Finance-Data.csv", newline="", encoding="utf-8") as source:
    messages.extend(
        {"role": record["Title"], "content": record["Content"]}
        for record in csv.DictReader(source)
    )

# The leading rows form the validation set; everything after them is training data.
num_validation = 40
validation_messages, training_messages = (
    messages[:num_validation],
    messages[num_validation:],
)

# Write each split to its own JSON document (training first, then validation).
for target_path, subset in (
    ("training.json", training_messages),
    ("validation.json", validation_messages),
):
    with open(target_path, "w", encoding="utf-8") as sink:
        json.dump({"messages": subset}, sink, indent=4)

print("Training and Validation JSON files created successfully!")

Training and Validation JSON files created successfully!


In [2]:
import csv
import json

# Collect one {"role", "content"} message per CSV row
# (role from the "Title" column, content from the "Content" column).
messages = []
with open("Finance-Data.csv", newline="", encoding="utf-8") as source:
    for record in csv.DictReader(source):
        messages.append({"role": record["Title"], "content": record["Content"]})

# Rows before the cut-off index are validation data; the remainder is training data.
num_validation = 40
validation_messages = messages[:num_validation]
training_messages = messages[num_validation:]

# JSONL output: one serialized message object per line.
with open("training.jsonl", "w", encoding="utf-8") as train_out:
    train_out.writelines(json.dumps(entry) + "\n" for entry in training_messages)

with open("validation.jsonl", "w", encoding="utf-8") as valid_out:
    valid_out.writelines(json.dumps(entry) + "\n" for entry in validation_messages)

print("Training and Validation JSONL files created successfully!")


Training and Validation JSONL files created successfully!


In [3]:
import csv
import json

# Wrap every CSV row in its own conversation: a JSON object whose "messages"
# array holds exactly one message (role from "Title", content from "Content").
# Multi-turn conversations would instead need rows grouped by a conversation ID
# or a similar field.
conversations = []
with open("Finance-Data.csv", newline="", encoding="utf-8") as source:
    for record in csv.DictReader(source):
        single_message = {"role": record["Title"], "content": record["Content"]}
        conversations.append({"messages": [single_message]})

# The first 40 conversations are held out for validation; the rest are for training.
num_validation = 40
validation_conversations = conversations[:num_validation]
training_conversations = conversations[num_validation:]

# JSONL output: one serialized conversation per line.
with open("training.jsonl", "w", encoding="utf-8") as train_out:
    for entry in training_conversations:
        print(json.dumps(entry), file=train_out)

with open("validation.jsonl", "w", encoding="utf-8") as valid_out:
    for entry in validation_conversations:
        print(json.dumps(entry), file=valid_out)

print("Training and validation JSONL files created successfully!")


Training and validation JSONL files created successfully!


In [14]:
import csv
import json

# Allowed enum values for roles.
valid_roles = {"user", "system", "assistant"}

# File paths (update these if necessary)
csv_file_path = "Finance-Data.csv"
training_output_path = "training.jsonl"
validation_output_path = "validation.jsonl"

# Initialize list to hold conversation objects
conversations = []

# Read CSV file and build conversation objects
with open(csv_file_path, newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # csv.DictReader fills cells that are missing from a short row with
        # None, and dict.get() only falls back to its default when the key is
        # absent. Coerce None to "" so .strip() cannot raise AttributeError.
        role = (row.get("Title") or "").strip()
        # If the role is not valid, default to "assistant" (you can adjust this default as needed)
        if role not in valid_roles:
            role = "assistant"
        # Ensure Content is present
        content = (row.get("Content") or "").strip()
        if not content:
            continue  # Skip rows with empty (or missing) content
        # Build the conversation object with a "messages" array
        conversation = {
            "messages": [
                {
                    "role": role,
                    "content": content
                }
            ]
        }
        conversations.append(conversation)

# Define the number of validation instances (the first 40 kept rows will be validation;
# rows skipped above for empty content do not count towards the 40).
num_validation = 40
validation_conversations = conversations[:num_validation]
training_conversations = conversations[num_validation:]

# Write training JSONL file
with open(training_output_path, "w", encoding="utf-8") as train_file:
    for conv in training_conversations:
        # Check that the messages array has at least one message object
        if conv.get("messages") and isinstance(conv["messages"], list) and len(conv["messages"]) >= 1:
            train_file.write(json.dumps(conv) + "\n")

# Write validation JSONL file
with open(validation_output_path, "w", encoding="utf-8") as valid_file:
    for conv in validation_conversations:
        # Same structural check as for the training split
        if conv.get("messages") and isinstance(conv["messages"], list) and len(conv["messages"]) >= 1:
            valid_file.write(json.dumps(conv) + "\n")

print("Training and validation JSONL files created successfully!")


Training and validation JSONL files created successfully!


In [15]:
import json

# Sanity-check training.jsonl line by line: each line must parse as JSON,
# be a JSON object, and contain a "messages" key. Problems are reported
# with their 1-based line number; valid lines produce no output.
with open("training.jsonl", "r", encoding="utf-8") as jsonl_file:
    for line_no, raw_line in enumerate(jsonl_file, 1):
        try:
            data = json.loads(raw_line)
        except json.JSONDecodeError as parse_error:
            print(f"Line {line_no} JSON error: {parse_error}")
            continue

        if isinstance(data, dict):
            if "messages" not in data:
                print(f"Line {line_no} missing 'messages' key.")
        else:
            print(f"Line {line_no} is not a JSON object.")


In [17]:
import csv
import json

def _write_jsonl(rows, path):
    """Write one single-message conversation per row of ``rows`` to ``path`` as JSONL."""
    with open(path, "w", encoding="utf-8") as out_f:
        for row in rows:
            # The role is forced to "assistant"; Title and Content are merged
            # into the message content, separated by a blank line.
            messages = [
                {
                    "role": "assistant",
                    "content": f"{row['Title']}\n\n{row['Content']}"
                }
            ]
            # One JSON object per line; ensure_ascii=False keeps non-ASCII text unescaped.
            out_f.write(json.dumps({"messages": messages}, ensure_ascii=False) + "\n")


def csv_to_jsonl(csv_file, training_file, validation_file, num_validation=40):
    """Convert a CSV file into training and validation JSONL files.

    Every CSV row becomes one line of the form
    ``{"messages": [{"role": "assistant", "content": "<Title>\\n\\n<Content>"}]}``.

    Args:
        csv_file: Path of the input CSV; its header must provide the
            "Title" and "Content" columns.
        training_file: Path of the JSONL file that receives every row after
            the first ``num_validation`` rows.
        validation_file: Path of the JSONL file that receives the first
            ``num_validation`` rows.
        num_validation: Number of leading rows used for validation.

    Raises:
        OSError: If a file cannot be opened.
        KeyError: If a row lacks the "Title" or "Content" column.
    """
    # newline="" is required by the csv module so that line breaks embedded in
    # quoted fields reach the parser unchanged; without it "\r\n" inside a
    # field is silently rewritten to "\n".
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))

    # Split data: first num_validation rows go to validation, rest go to training.
    # Validation is written first, then training.
    _write_jsonl(rows[:num_validation], validation_file)
    _write_jsonl(rows[num_validation:], training_file)


if __name__ == "__main__":
    # Input and output locations for this run (adjust as needed).
    # NOTE(review): the other cells in this notebook read "Finance-Data.csv";
    # confirm that "data.csv" is the intended input here.
    input_csv, output_training_jsonl, output_validation_jsonl = (
        "data.csv",
        "training.jsonl",
        "validation.jsonl",
    )

    # Arguments in signature order: CSV path, training path, validation path,
    # and the number of rows to use for validation.
    csv_to_jsonl(input_csv, output_training_jsonl, output_validation_jsonl, 40)
