In [1]:
import os
import pandas as pd

docs = 'C:\\Users\\oyku_\\Desktop\\Recipe Collector\\Recipe-Collector\\pipeline\\data'
dataframes = []

for filename in os.listdir(docs):
    if filename.endswith('.csv'):
        file_path = os.path.join(docs, filename)
        
        try:
            df = pd.read_csv(file_path, delimiter=',', quotechar='"', on_bad_lines='skip')
            dataframes.append((filename, df))
        except Exception as e:
            print(f"Error reading {filename}: {e}")

print(f"Number of DataFrames loaded: {len(dataframes)}")
print("***********************************************************")
for filename, df in dataframes:
    print(f"{filename}: {df.shape[0]} rows")
print("***********************************************************")


for name, df in dataframes:
    print(f"Data from {name}:")
    print(df.head(3))
    print("\n")

combined_df = pd.concat([df for _, df in dataframes], ignore_index=True)

combined_csv_path ='C:\\Users\\oyku_\\Desktop\\Recipe Collector\\Recipe-Collector\\pipeline\\comb_rec.csv'
combined_df.to_csv(combined_csv_path, index=False)

print(f"Combined DataFrame saved to {combined_csv_path}")


Number of DataFrames loaded: 10
***********************************************************
bbc_food_recipes.csv: 24 rows
food_hero.csv: 500 rows
gordon_ramsay_recipes1.csv: 99 rows
Hello_Fresh.csv: 2347 rows
JamieOliverRecipes.csv: 631 rows
Morrisons.csv: 1283 rows
recipes_try.csv: 272 rows
Sainsburys.csv: 1713 rows
Tesco.csv: 739 rows
Treat_Kitchen.csv: 150 rows
***********************************************************
Data from bbc_food_recipes.csv:
                          Title  \
0  A return to the Black Forest   
1   Aberdeen butteries (Rowies)   
2            Ackee and saltfish   

                                         Ingredients  \
0  ['225g/8oz dark chocolate (75 per cent cocoa s...   
1  ['500g/1lb 2oz strong plain flour, plus extra ...   
2  ['1 tbsp vegetable oil', '2 onions, thinly sli...   

                                        Instructions  
0  ['Pre-heat the oven to 180C/350F/Gas 4.', "Bre...  
1  ['In a large bowl, mix together the flour, yea...  
2  ['For t

In [5]:
combined_csv_path = 'C:\\Users\\oyku_\\Desktop\\Recipe Collector\\Recipe-Collector\\pipeline\\comb_rec.csv'
combined_df = pd.read_csv(combined_csv_path)

print(combined_df)

                                                  Title  \
0                          A return to the Black Forest   
1                           Aberdeen butteries (Rowies)   
2                                    Ackee and saltfish   
3     Acqua e sale (Puglian bread salad) with marina...   
4            Acqua pazza with hake, fennel and tomatoes   
...                                                 ...   
7753                                     No Title Found   
7754                                     No Title Found   
7755                                     No Title Found   
7756                                     No Title Found   
7757                              20 MINS 20-25 MINS 18   

                                            Ingredients  \
0     ['225g/8oz dark chocolate (75 per cent cocoa s...   
1     ['500g/1lb 2oz strong plain flour, plus extra ...   
2     ['1 tbsp vegetable oil', '2 onions, thinly sli...   
3     ['2 red onions, finely sliced', '2 tsp soft br...

In [20]:
sample_fraction = 0.1 #10% of the csv file

df_sampled = combined_df.sample(frac=sample_fraction, random_state=42)

print(df_sampled.head())

                                                  Title  \
2157     Roasted Chipotle Cauliflower and Chorizo Tacos   
3704                                      Spiced Mojito   
1323                                      Pasta Recipes   
6683  Chicken and pancetta salad with raspberries an...   
5328                      Emerald dhal with golden tofu   

                                            Ingredients  \
2157  300 grams, Cauliflower Florets, 1 unit(s), Lim...   
3704  Register, 1 handful of ice, 2 dashes of angost...   
1323                                                NaN   
6683  1 x 105g pack cooking smoked pancetta slices, ...   
5328  200g dried Puy lentils, rinsed, 2-2½ tbsp vege...   

                                           Instructions  \
2157  a) Preheat your oven to 220°C/200°C fan/gas ma...   
3704  Step 1Muddle lime wedges with mint and sugar s...   
1323                                                NaN   
6683  Slice the pancetta into strips and dry-fry for..

In [7]:
pd.set_option('display.max_rows', None)  # Set to None to display all rows, or a specific number

print(df_sampled)

                                                  Title  \
2157     Roasted Chipotle Cauliflower and Chorizo Tacos   
3704                                      Spiced Mojito   
1323                                      Pasta Recipes   
6683  Chicken and pancetta salad with raspberries an...   
5328                      Emerald dhal with golden tofu   
997      Recipes Most Popular with HelloFresh Customers   
7659                                  BLUEBERRY & WHITE   
1027              British foods that our customers love   
6620  Roast venison haunch with beetroot, shallots a...   
2973      Turkey breast with sausage & apricot stuffing   
4541                                  Cod Pesto Parcels   
4446                                       Pitta Pizzas   
3154                       Jersey Royal & taleggio tart   
5377  Negroni chicken skewers with fennel and orange...   
4921                                                NaN   
1383        Super Quick Recipes Ready Within 15 Minutes 

# Functions

In [8]:
import re
from textblob import TextBlob


In [21]:
nan_summary = df_sampled.isna().sum()
print("NaN summary for each column:")
print(nan_summary)

print("Columns with all NaN values:", df_sampled.columns[df_sampled.isna().all()].tolist())


NaN summary for each column:
Title                              24
Ingredients                       224
Instructions                      270
Title;Ingredients;Instructions    752
dtype: int64
Columns with all NaN values: []


In [18]:
print("Total rows in DataFrame:", df_sampled.shape[0])
print("Number of NaN values in Ingredients:", df_sampled['Ingredients'].isna().sum())
print("Number of empty strings in Ingredients:", (df_sampled['Ingredients'] == '').sum())


Total rows in DataFrame: 776
Number of NaN values in Ingredients: 224
Number of empty strings in Ingredients: 0


In [12]:
df_sampled.columns

Index(['Title', 'Ingredients', 'Instructions',
       'Title;Ingredients;Instructions'],
      dtype='object')

In [22]:

def drop_nan(df_sampled): #Dropping all the NaN values
    print("Before drop_nan:", df_sampled.shape)

    # df_sampled.dropna(inplace=True) # Okay there is problem here ask this : when i restart the kernel and run the code with this line it drops all the rows but if if first comment oyt this line run it then comment out and run it again it works.

    if 'Title;Ingredients;Instructions' in df_sampled.columns:
        df_sampled.drop(columns=['Title;Ingredients;Instructions'], inplace=True)        

    print("After drop_nan:", df_sampled.shape)
    return df_sampled

def drop_nan_rows(df_sampled):
    df_sampled.dropna(inplace=True)
    print("After drop_row_nan:", df_sampled.shape)
    return df_sampled



def drop_duplicates(df_sampled, subset_rows): # Dropping duplicate recipes
    print("Before drop_duplicates:", df_sampled.shape)
    df_sampled = df_sampled.drop_duplicates(subset=[subset_rows])
    print("After drop_duplicates:", df_sampled.shape)
    return df_sampled



def lower(df_sampled): # Changing all the recipes to lower-case for easier spellcheck
    print("Converting text to lowercase.")
    return df_sampled.applymap(lambda x: x.lower() if isinstance(x, str) else x)



def clean_special_char(df_sampled): # Standariting the formats
    
    print("Cleaning special characters.")
    df_sampled.replace({"½": "1/2", "¼": "1/4", "⅓": "1/3", "¾": "3/4"}, inplace=True)
    
    df_sampled = df_sampled.map(lambda x: re.sub(r'[^a-zA-Z0-9\s.,/&]', '', x) if isinstance(x, str) else x)
    return df_sampled

# def remove_specific (df_sampled):
#     df_sampled['Ingredients'] = df_sampled['Ingredients'].str.rstrip('register')
    
#     return df_sampled

# def spell_check(text): #Spellcheck
#     if isinstance(text, str) and text:  
#         blob = TextBlob(text)
#         return str(blob.correct())
#     return text 


# def apply_spell_check(df_sampled):
#     print("Applying spell check.")
#     for column in df_sampled.columns:
#         if df_sampled[column].dtype == 'object':
#             df_sampled[column] = df_sampled[column].apply(spell_check)
#     return df_sampled

def remove_bullet(text): # Removing bullet (if i wanna change them with something i can add that to '')
    if isinstance(text, str):
        bullets_to_remove =  r'\b[a-z]\)|\b(step\s*\d+)\b|\b(step\s*\d+)\w*' #r'\b[a-z]\)|\bstep\s*[1-9]?\d\b' it was like this but since step 1 .. are embedded into the next word it needed to be modified
        return re.sub(bullets_to_remove, '', text, flags=re.IGNORECASE).strip()
    return text

def apply_remove_bullet(df_sampled):
    print("Removing bullets.")
    for column in df_sampled.columns:
        if df_sampled[column].dtype == 'object':
            df_sampled[column] = df_sampled[column].apply(remove_bullet)
    return df_sampled

def remove_word(df_sampled):
    for column in df_sampled.columns:
        if df_sampled[column].dtype == 'object':
            df_sampled[column] = df_sampled[column].str.replace('register,', '', case=False, regex=True).str.strip()
    return df_sampled

def drop_words_rows(df_sampled):
    mask = df_sampled.apply(lambda  row: row.astype(str).str.contains('Terms', case=False).any(), axis=1)
    return df_sampled[~mask]
    



def data_pipeline(df_sampled, subset_rows):
    print("Starting data pipeline")
    
    df_sampled = drop_nan(df_sampled)
    df_sampled = drop_nan_rows(df_sampled)
    df_sampled = drop_duplicates(df_sampled, subset_rows)
    df_sampled = lower(df_sampled)
    df_sampled = clean_special_char(df_sampled)
    df_sampled = apply_remove_bullet(df_sampled)
    df_sampled = remove_word(df_sampled)
    df_sampled = drop_words_rows(df_sampled)
    # df_sampled = apply_spell_check(df_sampled)
    
    print("Final shape of DataFrame:", df_sampled.shape)
    return df_sampled



df_sample_cleaned = data_pipeline(df_sampled, 'Title')
print("Cleaned DataFrame:")
print(df_sample_cleaned)


Starting data pipeline
Before drop_nan: (776, 4)
After drop_nan: (776, 3)
After drop_row_nan: (505, 3)
Before drop_duplicates: (505, 3)
After drop_duplicates: (485, 3)
Converting text to lowercase.
Cleaning special characters.
Removing bullets.
Final shape of DataFrame: (426, 3)
Cleaned DataFrame:
                                                  Title  \
2157     roasted chipotle cauliflower and chorizo tacos   
3704                                      spiced mojito   
6683  chicken and pancetta salad with raspberries an...   
5328                      emerald dhal with golden tofu   
7659                                  blueberry & white   
6620  roast venison haunch with beetroot, shallots a...   
4541                                  cod pesto parcels   
4446                                       pitta pizzas   
5377  negroni chicken skewers with fennel and orange...   
2799      indonesian style chicken breast and veg curry   
4701                  chinesestyle beef in oyster sa

  return df_sampled.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [10]:
pd.set_option('display.max_colwidth', None)  
pd.set_option('display.max_rows', None)     
pd.set_option('display.max_columns', None)

In [23]:
df_sample_cleaned.head(20)

Unnamed: 0,Title,Ingredients,Instructions
2157,roasted chipotle cauliflower and chorizo tacos,"300 grams, cauliflower florets, 1 units, lime,...",a preheat your oven to 220c/200c fan/gas mark ...
3704,spiced mojito,"1 handful of ice, 2 dashes of angostura bitter...",lime wedges with mint and sugar syrup in a hig...
6683,chicken and pancetta salad with raspberries an...,"1 x 105g pack cooking smoked pancetta slices, ...",slice the pancetta into strips and dryfry for ...
5328,emerald dhal with golden tofu,"200g dried puy lentils, rinsed, 22 tbsp vegeta...",tip the rinsed lentils into a saucepan and cov...
7659,blueberry & white,in the kit bag 1 160g caster sugar bag 2 22...,this kit are recyclable this kit are recyclabl...
6620,"roast venison haunch with beetroot, shallots a...","1 x 500g bunch of beetroot, trimmed, peeled an...","preheat the oven to 200c, fan 180c, gas 6. put..."
4541,cod pesto parcels,"4 tbsp of pesto, 4 fresh cod fillets, approx 1...",out 4 squares of foil. drizzle a little oil on...
4446,pitta pizzas,"60 g of grated cheddar cheese, 80 g of tomato ...",the oven to 190oc.. or toast the pittas cook...
5377,negroni chicken skewers with fennel and orange...,"1kg chicken thigh fillets, cut into chunks, 1 ...","for the skewers, put the chicken pieces in a l..."
2799,indonesian style chicken breast and veg curry,"300, cauliflower florets, 1, garlic clove, 150...",preheat your oven to 220c/200c fan/gas mark 7....


In [12]:

# row_labels = [4541, 2973, 2996]  # Replace with actual labels
# print(df_sampled.loc[row_labels])