In [22]:
import pandas as pd
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
import ast
from sklearn.preprocessing import MultiLabelBinarizer

In [16]:
api=KaggleApi()
api.authenticate()

api.dataset_download_file(
    'paultimothymooney/recipenlg',
    file_name='RecipeNLG_dataset.csv',
    path='../data'
)

original_path = '../data/RecipeNLG_dataset.csv'
zip_path = '../data/RecipeNLG_dataset.zip'
os.rename(original_path, zip_path)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('../data')
os.remove(zip_path)

input_path='../data/RecipeNLG_dataset.csv'
output_path='../data/train-data.csv'

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/recipenlg


In [48]:
print(f'Reading dataset from {input_path}')
data = pd.read_csv(input_path)
print(f'Dataset imported. Shape: {data.shape}')

Reading dataset from ../data/RecipeNLG_dataset.csv
Dataset imported. Shape: (2231142, 7)


In [49]:
print(f'Cleaning dataset...')
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
columns_to_drop = ["Unnamed: 0", "ingredients", "directions", "link", "source"]
data.drop(columns=columns_to_drop, inplace=True)
data.rename(columns={"NER": "ingredients"}, inplace=True)
print(f'Shape: {data.shape}')

Cleaning dataset...
Shape: (2231141, 2)


In [None]:
data = data.head(100)
print(data)

KeyError: 'ingredients'

In [51]:
print('Modifing dataset...')
data['ingredients'] = data['ingredients'].apply(ast.literal_eval)

mlb = MultiLabelBinarizer(sparse_output=True)
ingredient_matrix = mlb.fit_transform(data['ingredients'])

ingredient_df = pd.DataFrame.sparse.from_spmatrix(
    ingredient_matrix, 
    index=data.index, 
    columns=mlb.classes_
)

data = pd.concat([data, ingredient_df], axis=1)
data.drop(columns=['ingredients'], inplace=True)
print(f'Final shape: {data.shape}')

print('Exporting new dataset...')
data.to_csv(output_path)
print(f'New dataset exported to {output_path}')

Modifing dataset...
Final shape: (100, 257)
Exporting new dataset...
New dataset exported to ../data/train-data.csv


In [52]:
data.head(5)

Unnamed: 0,title,Angel,Bisquick,Cheddar cheese,Crisco oil,English peas,Frango,Ground Beef,Italian dressing,Italian seasoning,...,white cake,white corn syrup,white grapes,whole cloves,whole kernel corn,yeast,yellow apples,yellow beans,yellow cake,yellow cake mix
0,No-Bake Nut Cookies,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Jewell Ball'S Chicken,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Creamy Corn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chicken Funny,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Reeses Cups(Candy),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
