Libraries used to manipulate the data

In [None]:
import os
import re
import zipfile
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import csv
import json


Unzip data

In [None]:
# Archive to unpack and the folder that receives its contents.
zip_file = "DatasetMasakanIndo.zip"
OutputAfter = "DatasetMasakanIndo"

# The destination must exist before extraction; an existing folder is fine.
os.makedirs(OutputAfter, exist_ok=True)

with zipfile.ZipFile(zip_file, mode='r') as archive:
    archive.extractall(path=OutputAfter)
    print(f"The file was successfully extracted to the folder: {OutputAfter}")


Preview part of the data and check how many rows it contains

In [None]:
# Load one of the extracted files as a sample.
df = pd.read_csv("DatasetMasakanIndo/dataset-ayam.csv")

# Show the last five rows (the default for tail()).
df.tail()

Since we couldn't find the data we were looking for, we decided to manually label it ourselves. 

Therefore, we reduce the data to at most 1,000 rows per file so that manual labelling stays feasible.

The next step removes rows with missing values, removes duplicate rows, truncates the data as mentioned above, and saves the result to a new folder.

In [None]:
DirectOriData = 'DatasetMasakanIndo'
OutputAfter = 'NewData'

# Create the destination folder on the first run.
if not os.path.exists(OutputAfter):
    os.makedirs(OutputAfter)

# Only the CSV files of the source folder are processed.
csv_names = [name for name in os.listdir(DirectOriData) if name.endswith('.csv')]

for file in csv_names:
    print(f"{file} file, in process")

    # Drop incomplete rows, then exact duplicates, then keep the first 1000 rows.
    df_cleaned = (
        pd.read_csv(os.path.join(DirectOriData, file))
        .dropna()
        .drop_duplicates()
        .head(1000)
    )

    # The cleaned file keeps its original name inside the new folder.
    df_cleaned.to_csv(os.path.join(OutputAfter, file), index=False)
    print(f"{file} file, successfully saved in {OutputAfter}/")


Next, a Category column is added to each file; its value is derived from the file name.

In [None]:
dataNew = 'NewData'
OutputAfter = 'NewData_Category'

# Category keywords, checked in this order against each file name.
KNOWN_CATEGORIES = ('ayam', 'ikan', 'kambing', 'sapi', 'tahu', 'telur', 'tempe', 'udang')

os.makedirs(OutputAfter, exist_ok=True)

for file in os.listdir(dataNew):
    if not file.endswith('.csv'):
        continue

    file_path = os.path.join(dataNew, file)
    print(f"{file} file, in process")

    # First keyword contained in the file name, or None when nothing matches.
    kategori = next((name for name in KNOWN_CATEGORIES if name in file), None)
    if kategori is None:
        # Without this guard an unmatched file would either raise NameError
        # (first file) or silently inherit the category of the previous file.
        print(f"{file} file, skipped: no known category in the file name")
        continue

    df = pd.read_csv(file_path)

    # Category becomes the first column of the file.
    df.insert(0, 'Category', kategori)

    output_path = os.path.join(OutputAfter, file)
    df.to_csv(output_path, index=False)
    print(f"{file} file, successfully saved in {OutputAfter}/")


Next, all the files are merged into a single file; an Id is assigned to each row in the following step.

In [None]:
dataNew = 'NewData_Category'
OutputAfter = 'NewData_Cate_IdNull.csv'

all_data = []

# os.listdir returns names in arbitrary order. Sorting makes the row order of
# the merged file reproducible, which matters because the Ids are later
# assigned from that row order.
for file in sorted(os.listdir(dataNew)):
    if file.endswith('.csv'):
        file_path = os.path.join(dataNew, file)
        print(f"{file} file merging process")

        df = pd.read_csv(file_path)
        all_data.append(df)

# pd.concat raises an opaque ValueError on an empty list; say what is wrong.
if not all_data:
    raise ValueError(f"No CSV files found in '{dataNew}'")

merged_data = pd.concat(all_data, ignore_index=True)

merged_data.to_csv(OutputAfter, index=False)

Next, the process performs
1. Id creation and assignment
2. Putting the Id column at the front

In [None]:
dataNew = 'NewData_Cate_IdNull.csv'

df = pd.read_csv(dataNew)

# Sequential, 1-based identifier: one per row, in file order.
df['Id'] = range(1, len(df) + 1)

# Re-select the columns so that Id comes first and the rest keep their order.
ordered_columns = ['Id']
ordered_columns.extend(name for name in df.columns if name != 'Id')
df = df[ordered_columns]

df.to_csv('NewData_Cate_Id.csv', index=False)

Next, the process performs
1. Deletion of the unused Loves column
2. Removing emojis from the Title, Ingredients, and Steps columns
3. Title cleaning, so that only word characters (letters, digits) and spaces remain
4. Capitalizing every word in the Title column
5. Discarding rows whose Ingredients or Steps contain a newline ("\n")

In [None]:
dataNew = "NewData_Cate_Id.csv"

# Load the data with Ids and discard the Loves column in one step.
df = pd.read_csv(dataNew).drop(columns=['Loves'])

def remove_emot(text):
    """Return ``text`` with every character outside the allowed set removed.

    The allowed set is word characters, whitespace and the punctuation listed
    in the pattern; any other character (an emoji, for example) is deleted.
    """
    disallowed = re.compile(r"[^\w\s.,'\"!?;:@#$%^&*()\-+=<>/\\|`~\[\]{}]")
    return disallowed.sub('', text)

def clean_title(title):
    """Strip every character that is neither a word character nor whitespace."""
    non_word = re.compile(r"[^\w\s]")
    return non_word.sub('', title)

def capitalize_title(title):
    """Capitalize each whitespace-separated word and join them with single spaces."""
    words = [word.capitalize() for word in title.split()]
    return ' '.join(words)

# Strip disallowed characters (emoji and similar) from the free-text columns.
for text_column in ('Title', 'Ingredients', 'Steps'):
    df[text_column] = df[text_column].astype(str).apply(remove_emot)

# Titles are additionally reduced to word characters and whitespace, then
# every word is capitalized.
df['Title'] = df['Title'].astype(str).apply(clean_title).apply(capitalize_title)

# Discard every row whose Ingredients or Steps contain a newline.
has_newline = (
    df['Ingredients'].str.contains(r'\n', na=False)
    | df['Steps'].str.contains(r'\n', na=False)
)
df = df[~has_newline]

df.to_csv("NewData_Cate_Id_Clean.csv", index=False)

Next, add Type and Temp(cold) columns

In [None]:
dataNew = 'NewData_Cate_Id_Clean.csv'
OutputAfter = 'NewData_Uncomplete.csv'

df = pd.read_csv(dataNew)

# Both columns start out empty; their values are assigned at a later stage.
df['Type'] = None
df['Temp(cold)'] = None

df.to_csv(OutputAfter, index=False)

print(f"CSV file with new columns successfully saved to {OutputAfter}")


Next, break down the data again according to Category

In [None]:
dataNew = 'NewData_Uncomplete.csv'
OutputAfter = 'NewData_Category_Id_Clean'

# Create the destination folder on the first run.
if not os.path.exists(OutputAfter):
    os.makedirs(OutputAfter)

df = pd.read_csv(dataNew)

# One output file per distinct Category value.
rows_by_category = df.groupby('Category')
for category, category_rows in rows_by_category:
    output_file = os.path.join(OutputAfter, f"{category}_data.csv")
    category_rows.to_csv(output_file, index=False)
    print(f"The data for category '{category}' has been saved to '{output_file}'")


In the dataset, there is a URL column that contains the recipe link, but only the path part of the link, not the base URL.

Therefore, we try to search for the base url.

Once obtained, the next process is to convert the recipe link into a recipe image link. In this case, we enter the link, and then search for the tag of the image element that we want to capture.

In [None]:
# Change `data` for each category (for example ayam => ikan) and run this code again.
# For a faster process, we recommend creating a new ipynb file per category and
# copying this code into it with a different `data` value.

data = "ayam"

# Input: the per-category file. Output: the same rows with image links.
OutputAfter = 'NewData_Category_Id_Url_Clean'
dataNew = f'NewData_Category_Id_Clean/{data}_data.csv'
OutputAfter_name = os.path.join(OutputAfter, f'{data}_data_url.csv')

if not os.path.exists(OutputAfter):
    os.makedirs(OutputAfter)

# Prefix that turns the relative recipe links of the dataset into full URLs.
base_url = "https://cookpad.com"

def fetch_image_url(relative_url, timeout=30):
    """Return the image URL found on a recipe page, or None.

    Parameters:
        relative_url: path that is appended to ``base_url`` to form the page URL.
        timeout: seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the whole run.

    Returns:
        The ``src`` attribute of the first ``<img>`` inside the first
        ``<div class="tofu_image">`` of the page. None when the input is not a
        string, the response status is not 200, the element is missing, or the
        request raises.
    """
    # A missing value (NaN read from the CSV) cannot be concatenated to the
    # base URL; treat it as "no image" instead of raising TypeError.
    if not isinstance(relative_url, str):
        return None

    full_url = base_url + relative_url
    try:
        response = requests.get(full_url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            image_div = soup.find('div', class_='tofu_image')
            if image_div:
                img_tag = image_div.find('img')
                if img_tag and 'src' in img_tag.attrs:
                    return img_tag['src']
        return None
    except Exception as e:
        # Best effort: report the failure and keep the scraping run going.
        print(f"Error fetching URL {full_url}: {e}")
        return None

df = pd.read_csv(dataNew)

if 'URL' not in df.columns:
    print("The 'URL' field was not found in the file.")
else:
    total_rows = len(df)
    updated_urls = []

    for position, relative_url in enumerate(df['URL'], start=1):
        print(f"Processing row {position}/{total_rows}: {relative_url}")
        updated_urls.append(fetch_image_url(relative_url))
        # Pause between requests.
        time.sleep(2)

    # The recipe links are replaced by the image links (None where not found).
    df['URL'] = updated_urls

    df.to_csv(OutputAfter_name, index=False)
    print(f"File successfully updated and saved to: {OutputAfter}")


Next, the process performs
1. Scraping the image from the link that has been obtained
2. Changing the URL field with the same value as the image file name
3. The whole process is made into a new file

In [None]:
# This code is quite time-consuming; we recommend running it per category in
# separate notebooks, as suggested for the previous code.

data = "ayam"
dataNew = f"NewData_Category_Id_Url_Clean/{data}_data_url.csv"
OutputAfter = f"DataPhoto/Photo_{data}"  # folder for the downloaded images
OutputAfter2 = "Data_Complete"           # folder for the updated CSV

df = pd.read_csv(dataNew)

# Create both output folders on the first run.
for folder in (OutputAfter, OutputAfter2):
    if not os.path.exists(folder):
        os.makedirs(folder)

def format_title(title):
    """Lower-case ``title`` and turn every space into an underscore."""
    lowered = title.lower()
    return "_".join(lowered.split(" "))

for index, row in df.iterrows():
    # Resolve the Id before the try block so the error message below can
    # always name the row it is reporting.
    id_ = str(row.get("Id"))
    url = row.get("URL")

    # Rows for which no image link was found have an empty URL (NaN after
    # read_csv); there is nothing to download, so skip the request.
    if not isinstance(url, str) or not url:
        print(f"No image URL for Id {id_}, skipped")
        df.at[index, "URL"] = np.nan
        continue

    try:
        title_ = format_title(row["Title"])

        Fname = f"{id_}_{title_}.jpg"
        file_path = os.path.join(OutputAfter, Fname)

        # The timeout keeps one stalled connection from blocking the run.
        # stream=True is dropped: .content reads the whole body anyway.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Image downloaded successfully: {Fname}")

            # The URL column now holds the local image file name.
            df.at[index, "URL"] = Fname
        else:
            print(f"Failed to download image from URL: {url} (Status code: {response.status_code})")

            df.at[index, "URL"] = np.nan
    except Exception as e:
        # Best effort: record the failure and continue with the next row.
        print(f"Error on Id {id_}: {e}")
        df.at[index, "URL"] = np.nan

    # Pause after every attempted request, including failed ones.
    time.sleep(2)

output_csv = os.path.join(OutputAfter2, f"{data}_data_complete.csv")
df.to_csv(output_csv, index=False)
print("Process completed.")


Downloaded photo files are zipped so that they can be uploaded to the database.

Finally, manually assign values to the Type and Temp(cold) columns, which are coded in EditData.py

After editing each category with maximum effort, proceed with the process of
1. Merging all categories
2. Deleting Missing Value data and Duplicate data again
3. Saving it into Final_Dataset

In [None]:
# NOTE(review): the download step writes its CSVs to 'Data_Complete/'; this
# sub-folder is presumably filled by the manual editing step (EditData.py).
# Confirm the path before running.
dataNew = 'Data_Complete/data_complete'
OutputAfter = 'Final_Dataset.csv'

all_data = []

# os.listdir returns names in arbitrary order. Sorting makes the row order of
# the final dataset (and which of several duplicate rows is kept)
# reproducible between runs.
for file in sorted(os.listdir(dataNew)):
    if file.endswith('.csv'):
        file_path = os.path.join(dataNew, file)
        print(f"{file} file merging process")

        df = pd.read_csv(file_path)
        all_data.append(df)

# pd.concat raises an opaque ValueError on an empty list; say what is wrong.
if not all_data:
    raise ValueError(f"No CSV files found in '{dataNew}'")

# Merge, then drop rows with any missing value, then drop exact duplicates.
merged_data = (
    pd.concat(all_data, ignore_index=True)
    .dropna()
    .drop_duplicates()
)

merged_data.to_csv(OutputAfter, index=False)
print(f"New data has been saved to: {OutputAfter}")
print(f"New data has been saved to: {OutputAfter}")


The dataset is changed slightly: the symbol ( ; ) in the Ingredients and Steps columns causes problems for the backend, so every ';' in the dataset is replaced with ':'.

In [None]:
dataNew = "Final_Dataset.csv"
OutputAfter = "Final_Dataset1.csv"


def _semicolon_to_colon(value):
    """Replace ';' with ':' in string cells; return any other value unchanged."""
    if isinstance(value, str):
        return value.replace(';', ':')
    return value


# Apply the replacement to every cell of the frame.
df = pd.read_csv(dataNew).map(_semicolon_to_colon)

df.to_csv(OutputAfter, index=False)
print(f"All ';' in the dataset has been replaced with ':'. Results are stored in: {OutputAfter}")


Next, convert the dataset, which is still in CSV form, into JSON form, to make it easier to upload data to the database.

In [None]:
dataNew = 'Final_Dataset1.csv'

OutputAfter = "Final_Dataset1.json"

# Columns copied into every JSON record, in output order.
RECORD_FIELDS = ("Id", "Category", "Title", "Ingredients", "Steps", "URL", "Type", "Temp(cold)")

json_data = []

with open(dataNew, mode='r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile, delimiter=',')

    # Printed once instead of once per row.
    print(f"Header: {reader.fieldnames}")

    for row in reader:
        # DictReader fills the missing cells of a short row with None, so
        # row.get(key, "") can still return None. `or ""` covers both the
        # absent column and the None cell before stripping.
        record = {field: (row.get(field) or "").strip() for field in RECORD_FIELDS}

        # A record whose every field is empty is not written to the output.
        if any(record.values()):
            json_data.append(record)
        else:
            print("This line is empty and not entered:", record)

# All values are written as strings; ensure_ascii=False keeps non-ASCII text
# readable in the output file.
with open(OutputAfter, mode='w', encoding='utf-8') as outfile:
    json.dump(json_data, outfile, indent=4, ensure_ascii=False)

print(f"The file has been successfully processed and saved as '{OutputAfter}'")
