This notebook parses the raw recipe dataset, converts the JSON data to CSV format so that it is easier to work with, and performs some basic data cleaning.

Step 1: Importing all necessary libraries.

In [4]:
import os
import pandas as pd
import json

Step 2: Importing the data and converting JSON to CSV.

In [8]:
# Directory that holds the extracted raw recipe JSON files.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
folder_path = 'C:/Users/ACER/Desktop/Recipe_Recommendation/recipes_raw_extracted/'
# Collect the names of all JSON files in the folder. os.listdir() returns
# entries in arbitrary order, so the names are sorted to keep the row order of
# the combined dataset reproducible across runs and machines.
json_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.json'))

print(json_files)

['recipes_raw_nosource_ar.json', 'recipes_raw_nosource_epi.json', 'recipes_raw_nosource_fn.json']


In [9]:
# Keys a recipe entry must contain to be kept. Defined once here instead of
# rebuilding the list for every recipe inside the loop.
required_keys = ('title', 'ingredients', 'instructions', 'picture_link')

# Accumulates one flat dict per kept recipe, across all input files.
all_recipes_list = []
for json_file in json_files:
    # Read JSON data from file. The encoding is passed explicitly because
    # open() otherwise uses the platform's locale encoding (e.g. a Windows
    # code page), which can fail on or garble non-ASCII text.
    # NOTE(review): assumes the files are UTF-8, the standard JSON encoding.
    file_path = os.path.join(folder_path, json_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Rows extracted from the current file only.
    recipes_list = []

    # Each top-level key is used as the recipe ID; its value is the recipe body.
    for recipe_id, recipe_data in json_data.items():
        # Check if all required keys are present in the current recipe
        if all(key in recipe_data for key in required_keys):
            recipe = {
                'ID': recipe_id,
                'Title': recipe_data['title'],
                # The ingredient entries are joined into one comma-separated string.
                'Ingredients': ', '.join(recipe_data['ingredients']),
                'Instructions': recipe_data['instructions'],
                'Picture_Link': recipe_data['picture_link']
            }
            recipes_list.append(recipe)
        else:
            # Entries lacking any required key are discarded at this stage.
            # Only key presence is checked, so values that are present but
            # null are still kept here.
            print(f"Skipping recipe with ID {recipe_id} in {json_file} due to missing keys.")

    # Extend the all_recipes_list with the recipes from the current file
    all_recipes_list.extend(recipes_list)

Skipping recipe with ID 94V2SSnZJgoSjVtl6U.eIFLpM.8eui6 in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID yza7eX.1Xf5l8UW1FzFzBo/4szXOXtu in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID 5WFnvUWMtWENr9atI0zwBsDXwegyH2W in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID gkJMVZbxTx/u6AmbO5qKV.B7yTCTINu in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID 07q2azUXkshnim3m1z6LB3ZIWKrXSFO in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID ENCqI.wgshWkzXOKVQp919oSY2alT6m in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID u/Y3LJG7sm8thimmJnADLC2Bb0pabYK in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID exQpEXHlppw1uL6wS.qTo0TpGhy6jVO in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe with ID .mLAwpDQJWDST1uXL9JrHSIW9cybClu in recipes_raw_nosource_ar.json due to missing keys.
Skipping recipe wit

In [10]:
# Build a single DataFrame from the collected recipe dicts. The columns are
# listed explicitly so the frame has the expected columns, in this order, even
# when all_recipes_list is empty.
df = pd.DataFrame(
    all_recipes_list,
    columns=['ID', 'Title', 'Ingredients', 'Instructions', 'Picture_Link'],
)

Inspecting the dataframe

In [11]:
# Preview the first two rows, print a divider, then report the frame's shape.
first_rows = df.head(2)
divider = '-' * 10
print(first_rows)
print(divider)
print('DF shape:', df.shape)


                                ID                              Title  \
0  rmK12Uau.ntP510KeImX506H6Mr6jTu  Slow Cooker Chicken and Dumplings   
1  5ZpZE8hSVdPk2ZXo1mZTyoPWJRSCPSm      Awesome Slow Cooker Pot Roast   

                                         Ingredients  \
0  4 skinless, boneless chicken breast halves ADV...   
1  2 (10.75 ounce) cans condensed cream of mushro...   

                                        Instructions  \
0  Place the chicken, butter, soup, and onion in ...   
1  In a slow cooker, mix cream of mushroom soup, ...   

                      Picture_Link  
0  55lznCYBbs2mT8BTx6BTkLhynGHzM.S  
1  QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe  
----------
DF shape: (124647, 5)


Checking Missing values

In [None]:
# Count the null entries in each column; the bare last expression is displayed
# as the cell's output.
null_counts_by_column = df.isnull().sum()
null_counts_by_column

ID                  0
Title              52
Ingredients         0
Instructions      174
Picture_Link    42054
dtype: int64

I won't be using the picture link column, so I'll drop it and then check for missing values again.

In [12]:
# Remove the picture-link column, then report the remaining null counts per
# column and the resulting frame shape.
df_cleaned = df.drop('Picture_Link', axis=1)
remaining_nulls = df_cleaned.isnull().sum()
print(remaining_nulls)
print('-' * 10)
print('DF shape:', df_cleaned.shape)


ID                0
Title            52
Ingredients       0
Instructions    174
dtype: int64
----------
DF shape: (124647, 4)


In [13]:
# Flag every row that has at least one null cell, then count the flagged rows.
rows_with_nulls = df_cleaned.isnull().any(axis=1)
total_missing_records = int(rows_with_nulls.sum())

print(f"Total number of records with missing values: {total_missing_records}")

Total number of records with missing values: 174


Since the dataset has more than 120,000 records and only 174 of them contain missing values, I've decided to drop every row that has a null value.

In [14]:
# Keep only the rows in which every column has a value, then show the new shape.
df_cleaned = df_cleaned.dropna(axis=0, how='any')
print(df_cleaned.shape)

(124473, 4)


Saving the cleaned dataset in a different file

In [19]:
# Make sure the Ingredients column holds plain strings. assign() returns a new
# frame instead of writing a column into the frame produced by dropna(), which
# avoids the SettingWithCopyWarning that such an in-place assignment can raise.
df_cleaned = df_cleaned.assign(Ingredients=df_cleaned['Ingredients'].astype(str))
# Write the cleaned data to CSV.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not rely on it. Confirm before changing.
df_cleaned.to_csv('C:/Users/ACER/Desktop/Recipe_Recommendation/jupyter notebooks/cleaned_data.csv')