### Library loading

In [1]:
import pandas as pd
import configparser

### Paths loading

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')

full_raw_dataset_path = config['PATHS']['full_raw_dataset_path']
prepared_dataset_path = config['PATHS']['prepared_dataset_path']


### Data loading and preprocessing

In [3]:
df = pd.read_csv(full_raw_dataset_path)
df.shape

(2231142, 6)

In [4]:
df.groupby('source').agg({'title':'count'})

Unnamed: 0_level_0,title
source,Unnamed: 1_level_1
Gathered,1643097
Recipes1M,588044


In [5]:
import ast
import numpy as np

######## Convert 'ingredients' and 'directions' columns from string to list
df['ingredients_list'] = df['ingredients'].map(ast.literal_eval)
df['directions_list'] = df['directions'].map(ast.literal_eval)

######## Create input_text and target_text using vectorized operations
df['input_text'] = df['ingredients_list'].map(lambda x: ", ".join(x).lower())
df['target_text'] = df['directions_list'].map(lambda x: " ".join(x).lower())

######## Replace empty strings with NaN
df['input_text'] = df['input_text'].replace('', np.nan)
df['target_text'] = df['target_text'].replace('', np.nan)
df['target_text'] = df['target_text'].str.replace('\x00', '', regex=False)
df.dropna(how='any', inplace=True)

######## Select final processed columns
df_processed = df[['input_text', 'target_text']]
df_processed.head()


Unnamed: 0,input_text,target_text
0,"1 c. firmly packed brown sugar, 1/2 c. evapora...","in a heavy 2-quart saucepan, mix brown sugar, ..."
1,"1 small jar chipped beef, cut up, 4 boned chic...",place chipped beef on bottom of baking dish. p...
2,"2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. cr...","in a slow cooker, combine all ingredients. cov..."
3,"1 large whole chicken, 2 (10 1/2 oz.) cans chi...",boil and debone chicken. put bite size pieces ...
4,"1 c. peanut butter, 3/4 c. graham cracker crum...",combine first four ingredients and press in 13...


In [6]:
######## filtering out unwanted text ########

garbage_text = ['none', 'n/a', 'copyright', 'NA', 'nan']

df_processed = df_processed[~df_processed['input_text'].isin(garbage_text)]
df_processed = df_processed[~df_processed['target_text'].isin(garbage_text)]
df_processed.shape

(2231127, 2)

In [7]:
######## filtering out text having length less than 3 in input_text ########

df_processed = df_processed[df_processed['input_text'].str.len()>2]
df_processed.shape

(2231114, 2)

In [8]:
######## filtering out text having length less than 50 in target_text ########

df_processed = df_processed[df_processed['target_text'].str.len()>50]
df_processed.shape

(2197013, 2)

In [9]:
######## Randomly choosing 10K records for training and validation

df_processed = df_processed.sample(10000,random_state=0)
df_processed.shape

(10000, 2)

In [10]:
df_processed.to_csv(prepared_dataset_path, index=False)