# Converting JSON data to CSV
## For training the linguistic model



In [None]:
import csv
import json

# Load JSON data. The file is decoded explicitly as UTF-8 so that non-ASCII
# characters are read the same way on every platform, matching the UTF-8
# encoding used for the CSV written below.
with open('test.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Define CSV file path
csv_file_path = 'data.csv'

# Define fieldnames for CSV header (this also fixes the column order)
fieldnames = ['recipe_id', 'context_modality', 'split', 'context_id', 'context_title', 'context_body', 'choice_list', 'answer', 'qid', 'question_modality', 'question', 'task']

# Open CSV file in write mode and write header
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate through each item in JSON data
    for item in data['data']:
        # Item-level fields are identical for every context of the item, so
        # they are built once here instead of once per written row.
        # List-valued fields are flattened into a single '|'-separated cell.
        item_fields = {
            'recipe_id': item['recipe_id'],
            'context_modality': '|'.join(item['context_modality']),
            'split': item['split'],
            # Convert each choice to string before joining
            'choice_list': '|'.join(str(choice) for choice in item['choice_list']),
            'answer': item['answer'],
            'qid': item['qid'],
            'question_modality': '|'.join(item['question_modality']),
            'question': '|'.join(item['question']),
            'task': item['task'],
        }

        # Write one CSV row per context entry of the item
        for context in item['context']:
            writer.writerow({
                **item_fields,
                'context_id': context['id'],
                # Get title or empty string if it doesn't exist
                'context_title': context.get('title', ''),
                'context_body': context['body'],
            })

print("CSV conversion completed!")


CSV conversion completed!


## Dropping Unnecessary Columns

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('data.csv')

# Keep only the textual_cloze rows and drop the columns that are not needed.
# drop() without inplace=True returns a new DataFrame instead of modifying the
# filtered slice, which avoids pandas' SettingWithCopyWarning.
textual_cloze_df = df[df['task'] == 'textual_cloze'].drop(
    columns=['context_modality', 'split', 'context_id', 'context_title', 'qid', 'question_modality', 'task', 'choice_list', 'answer', 'question']
)
# Print the head of the data
textual_cloze_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textual_cloze_df.drop(columns=['context_modality', 'split', 'context_id', 'context_title','qid','question_modality','task','choice_list','answer','question'], inplace=True)


Unnamed: 0,recipe_id,context_body
11770,simple-all-grain-electric-beer-brewery-biab,1) Kettle ($35-$155) At my local restaurant su...
11771,simple-all-grain-electric-beer-brewery-biab,Hole in electrical box Using the instructions ...
11772,simple-all-grain-electric-beer-brewery-biab,Now that the electrical box has a large hole i...
11773,simple-all-grain-electric-beer-brewery-biab,Decide where you want the hole in your kettle ...
11774,simple-all-grain-electric-beer-brewery-biab,"After your JB Weld has completely set, you are..."


In [None]:
# Show the last rows of the filtered data
textual_cloze_df.tail()

Unnamed: 0,recipe_id,context_body
18521,how-to-cook-the-french-mills-crepes-cake,1. Add 2 tablespoons of batter to the center ...
18522,how-to-cook-the-french-mills-crepes-cake,"1. Make the cream. Add the milk, eggs, sugar a..."
18523,how-to-cook-the-french-mills-crepes-cake,1. Put a crepe on a plate. Then add some cream...
18524,how-to-cook-the-french-mills-crepes-cake,1. Sprinkle a few green tea powders on the top...
18525,how-to-cook-the-french-mills-crepes-cake,Congregation! When the cream freezes...


## Combining rows that share the same recipe ID

In [None]:
# Replace missing bodies with '' so that '|'.join below only sees strings.
# assign() builds a new DataFrame rather than writing a column into the
# filtered slice, which avoids pandas' SettingWithCopyWarning.
textual_cloze_df = textual_cloze_df.assign(
    context_body=textual_cloze_df['context_body'].fillna('')
)

# Combine rows with the same 'recipe_id' into one '|'-separated string
combined_df = (
    textual_cloze_df
    .groupby('recipe_id')['context_body']
    .agg('|'.join)
    .reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textual_cloze_df['context_body'] = textual_cloze_df['context_body'].fillna('')


In [None]:
# Show the first rows of the combined data (one row per recipe_id)
combined_df.head()

Unnamed: 0,recipe_id,context_body
0,1-gallon-batch-of-cherry-mead-cherry-melomel,Not all of these materials will be needed init...
1,1-popcorn-machine,Take an empty soda can and remove the paint wi...
2,10-minute-toffee,Ingredients:\n1 cup raw pralines\n1 cup granul...
3,100-calorie-peanut-butter-chocolate-popsicles,Ingredients: 1 chocolate popsicle: - 2 tablesp...
4,1up-mushroom-mushroom-burger,"First, wash your hands!\nThen wash your produc..."


In [None]:
# Save the combined data to test.csv, leaving out the DataFrame index
combined_df.to_csv('test.csv', index=False)

# Converting CSV data to JSONL
## For fine-tuning the GPT-3.5 Turbo model

## Rename the columns so they can be read by GPT models

In [None]:
# Load the three splits (train.csv, val.csv, test.csv) in that order
train_df, val_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Give every split the same two column names, assigned by position
chat_columns = ['prompt', 'completion']
for split_frame in (train_df, val_df, test_df):
    split_frame.columns = chat_columns

In [None]:
# Keep only the first 100 rows of the validation data
val_df = val_df.head(100)

In [None]:
# Preview the first rows of the renamed validation data
val_df.head(11)

Unnamed: 0,prompt,completion
0,-homebrew-honey-beer-,Ingredients: - 2kg malt; - 1kg honey; - 100g t...
1,1-mango-jalapeno-jam,1 mangoe1-2 jalapenos1/8-1/4 cup sugar or hone...
2,10-easy-snacks-for-toddler-family-1-snack-for-...,Recipe #1 - Vegetable Shapes Patty ( cutlet )I...
3,100-percent-figs-and-berries-roll-ups,Place fruits into blender in the order of stra...
4,3-ingredient-no-bake-oreo-cheesecake,If you like to learn how to do things through ...
5,3-ingredient-pancake-balls,Non-Stick Cooking Spray 1 Egg 1 Banana|Liberal...
6,3-ingredients-rice-crispy,You will need:-about 1.5 cups of small marshma...
7,30-minute-dinner-rolls,Ingredients: (makes 12 rolls)1 cup warm water2...
8,314-pi-cake,- 2 large bowls - ...
9,3d-butterfly-mini-cupcakes,Here's what you will need...For the wings: C...


## Mapping CSV to JSONL using the chat conversation format

In [None]:
def _message_text(value):
    """Return a cell value as message text; None and NaN become ''."""
    # NaN is the only float that is not equal to itself. An empty CSV cell is
    # read by pandas as NaN, which json.dump would otherwise emit as the
    # invalid JSON token NaN.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


# Function to convert DataFrame row to JSON format
def row_to_jsonl(row):
    """Build one chat-format record from a row.

    Args:
        row: Mapping-like object (e.g. a pandas Series or a dict) with
            'prompt' and 'completion' entries.

    Returns:
        A dict {"messages": [...]} holding a user message with the prompt
        and an assistant message with the completion. Both contents are
        strings: non-string values are converted with str(), and missing
        values (None / NaN) become the empty string.
    """
    messages = [
        {"role": "user", "content": _message_text(row['prompt'])},
        {"role": "assistant", "content": _message_text(row['completion'])}
    ]
    return {"messages": messages}


# Write val_renamed.jsonl: one JSON-encoded chat record per line
with open('val_renamed.jsonl', 'w') as jsonl_out:
    for _, record in val_df.iterrows():
        jsonl_out.write(json.dumps(row_to_jsonl(record)) + '\n')

print("JSONL files saved successfully!")


JSONL files saved successfully!


In [None]:

# Function to read and print JSONL file
def print_jsonl_file(file_path):
    """Print every record of a JSONL file, one parsed object per line.

    Args:
        file_path: Path of the JSONL file to read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing on them in json.loads.
            if not line:
                continue
            print(json.loads(line))


# View val_renamed.jsonl: print a heading, then every record in the file
print("\nValidation data:")
print_jsonl_file('val_renamed.jsonl')




Validation data:
{'messages': [{'role': 'user', 'content': '-homebrew-honey-beer-'}, {'role': 'assistant', 'content': "Ingredients: - 2kg malt; - 1kg honey; - 100g to 135g dry (wild or purchased) hops, depending on how hoppy you want the beer; - sugar (for sparkling); - 7g yeast (1 sachet) or brewing yeast; - water. Tools (look at drawing): 1- 25liter bucket for brewing beer with a bubbler, 25 liter bucket wit a tap; 2- a big pan (at least 15 liter capacity), a small pan (at least 7 liter capacity); 3- a big stainer (fine) or https://www.instructables.com/id/Simple-Filter-for-Brewing/; 4- a food safe hose; 5- funnel; 6- teaspoon; 7- sodium bisulfite (for disinfect); 8- beer densimeter; 9- beer bottles (I used 75cl bottles).|Boil the hops in a big pan with 10/15 liter of water for 20 or 30 minutes, than allowed to cool down to 22° C, with a top on.|Heat up a little bit the honey and the malt (''bain marie'' or microwave), for an easier pouring and mixing. Slowly add them to 3 liter of 