In [1]:
import os
import pandas as pd

In [2]:
docs = '../pipeline/data'
dataframes = []

for filename in os.listdir(docs):
    if filename.endswith('.csv'):
        file_path = os.path.join(docs, filename)
        
        try:
            df = pd.read_csv(file_path, delimiter=',', quotechar='"', on_bad_lines='skip')
            dataframes.append((filename, df))
        except Exception as e:
            print(f"Error reading {filename}: {e}")

print(f"Number of DataFrames loaded: {len(dataframes)}")
print("***********************************************************")
for filename, df in dataframes:
    print(f"{filename}: {df.shape[0]} rows")
print("***********************************************************")


for name, df in dataframes:
    print(f"Data from {name}:")
    print(df.head(3))
    print("\n")

combined_df = pd.concat([df for _, df in dataframes], ignore_index=True)

combined_csv_path ='../pipeline/comb_rec.csv'
combined_df.to_csv(combined_csv_path, index=False)

print(f"Combined DataFrame saved to {combined_csv_path}")

Number of DataFrames loaded: 10
***********************************************************
bbc_food_recipes.csv: 24 rows
food_hero.csv: 500 rows
gordon_ramsay_recipes1.csv: 99 rows
Hello_Fresh.csv: 2347 rows
JamieOliverRecipes.csv: 631 rows
Morrisons.csv: 1283 rows
recipes_try.csv: 272 rows
Sainsburys.csv: 1713 rows
Tesco.csv: 739 rows
Treat_Kitchen.csv: 150 rows
***********************************************************
Data from bbc_food_recipes.csv:
                          Title  \
0  A return to the Black Forest   
1   Aberdeen butteries (Rowies)   
2            Ackee and saltfish   

                                         Ingredients  \
0  ['225g/8oz dark chocolate (75 per cent cocoa s...   
1  ['500g/1lb 2oz strong plain flour, plus extra ...   
2  ['1 tbsp vegetable oil', '2 onions, thinly sli...   

                                        Instructions  
0  ['Pre-heat the oven to 180C/350F/Gas 4.', "Bre...  
1  ['In a large bowl, mix together the flour, yea...  
2  ['For t

In [3]:
combined_csv_path = '../pipeline/comb_rec.csv'
combined_df = pd.read_csv(combined_csv_path)

print(combined_df)

                                                  Title  \
0                          A return to the Black Forest   
1                           Aberdeen butteries (Rowies)   
2                                    Ackee and saltfish   
3     Acqua e sale (Puglian bread salad) with marina...   
4            Acqua pazza with hake, fennel and tomatoes   
...                                                 ...   
7753                                     No Title Found   
7754                                     No Title Found   
7755                                     No Title Found   
7756                                     No Title Found   
7757                              20 MINS 20-25 MINS 18   

                                            Ingredients  \
0     ['225g/8oz dark chocolate (75 per cent cocoa s...   
1     ['500g/1lb 2oz strong plain flour, plus extra ...   
2     ['1 tbsp vegetable oil', '2 onions, thinly sli...   
3     ['2 red onions, finely sliced', '2 tsp soft br...

In [4]:
sample_fraction = 0.1 #10% of the csv file

df_sampled = combined_df.sample(frac=sample_fraction, random_state=42)

print(df_sampled.head())

                                                  Title  \
2157     Roasted Chipotle Cauliflower and Chorizo Tacos   
3704                                      Spiced Mojito   
1323                                      Pasta Recipes   
6683  Chicken and pancetta salad with raspberries an...   
5328                      Emerald dhal with golden tofu   

                                            Ingredients  \
2157  300 grams, Cauliflower Florets, 1 unit(s), Lim...   
3704  Register, 1 handful of ice, 2 dashes of angost...   
1323                                                NaN   
6683  1 x 105g pack cooking smoked pancetta slices, ...   
5328  200g dried Puy lentils, rinsed, 2-2½ tbsp vege...   

                                           Instructions  \
2157  a) Preheat your oven to 220°C/200°C fan/gas ma...   
3704  Step 1Muddle lime wedges with mint and sugar s...   
1323                                                NaN   
6683  Slice the pancetta into strips and dry-fry for..

In [5]:
pd.set_option('display.max_rows', None)  # Set to None to display all rows, or a specific number

print(df_sampled)

                                                  Title  \
2157     Roasted Chipotle Cauliflower and Chorizo Tacos   
3704                                      Spiced Mojito   
1323                                      Pasta Recipes   
6683  Chicken and pancetta salad with raspberries an...   
5328                      Emerald dhal with golden tofu   
997      Recipes Most Popular with HelloFresh Customers   
7659                                  BLUEBERRY & WHITE   
1027              British foods that our customers love   
6620  Roast venison haunch with beetroot, shallots a...   
2973      Turkey breast with sausage & apricot stuffing   
4541                                  Cod Pesto Parcels   
4446                                       Pitta Pizzas   
3154                       Jersey Royal & taleggio tart   
5377  Negroni chicken skewers with fennel and orange...   
4921                                                NaN   
1383        Super Quick Recipes Ready Within 15 Minutes 

# Functions

In [6]:
import re
from textblob import TextBlob


In [8]:
combined_df.shape

(7758, 4)

In [9]:
nan_summary = combined_df.isna().sum()
print("NaN summary for each column:")
print(nan_summary)

print("Columns with all NaN values:", combined_df.columns[combined_df.isna().all()].tolist())


NaN summary for each column:
Title                              272
Ingredients                       2144
Instructions                      2608
Title;Ingredients;Instructions    7486
dtype: int64
Columns with all NaN values: []


In [10]:
print("Total rows in DataFrame:", combined_df.shape[0])
print("Number of NaN values in Ingredients:", combined_df['Ingredients'].isna().sum())
print("Number of empty strings in Ingredients:", (combined_df['Ingredients'] == '').sum())


Total rows in DataFrame: 7758
Number of NaN values in Ingredients: 2144
Number of empty strings in Ingredients: 0


In [11]:
combined_df.columns

Index(['Title', 'Ingredients', 'Instructions',
       'Title;Ingredients;Instructions'],
      dtype='object')

In [12]:

def drop_nan(combined_df):
    print("Before drop_nan:", combined_df.shape)

    # df_sampled.dropna(inplace=True) # Okay there is problem here ask this : when i restart the kernel and run the code with this line it drops all the rows but if if first comment oyt this line run it then comment out and run it again it works.

    if 'Title;Ingredients;Instructions' in combined_df.columns:
        combined_df.drop(columns=['Title;Ingredients;Instructions'], inplace=True)        

    print("After drop_nan:", combined_df.shape)
    return combined_df

def drop_nan_rows(combined_df):
    combined_df.dropna(inplace=True)
    print("After drop_row_nan:", combined_df.shape)
    return combined_df



def drop_duplicates(combined_df, subset_rows):
    print("Before drop_duplicates:", combined_df.shape)
    combined_df = combined_df.drop_duplicates(subset=[subset_rows])
    print("After drop_duplicates:", combined_df.shape)
    return combined_df



def lower(combined_df): 
    print("Converting text to lowercase.")
    return combined_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)



def clean_special_char(combined_df):
    
    print("Cleaning special characters.")
    combined_df.replace({"½": "1/2", "¼": "1/4", "⅓": "1/3", "¾": "3/4"}, inplace=True)
    
    combined_df = combined_df.map(lambda x: re.sub(r'[^a-zA-Z0-9\s.,/&]', '', x) if isinstance(x, str) else x)
    return combined_df

# def remove_specific (df_sampled):
#     df_sampled['Ingredients'] = df_sampled['Ingredients'].str.rstrip('register')
    
#     return df_sampled

# def spell_check(text): #Spellcheck
#     if isinstance(text, str) and text:  
#         blob = TextBlob(text)
#         return str(blob.correct())
#     return text 


# def apply_spell_check(df_sampled):
#     print("Applying spell check.")
#     for column in df_sampled.columns:
#         if df_sampled[column].dtype == 'object':
#             df_sampled[column] = df_sampled[column].apply(spell_check)
#     return df_sampled

def remove_bullet(text): 
    if isinstance(text, str):
        bullets_to_remove =  r'\b[a-z]\)|\b(step\s*\d+)\b|\b(step\s*\d+)\w*' #r'\b[a-z]\)|\bstep\s*[1-9]?\d\b' it was like this but since step 1 .. are embedded into the next word it needed to be modified
        return re.sub(bullets_to_remove, '', text, flags=re.IGNORECASE).strip()
    return text

def apply_remove_bullet(combined_df):
    print("Removing bullets.")
    for column in combined_df.columns:
        if combined_df[column].dtype == 'object':
            combined_df[column] = combined_df[column].apply(remove_bullet)
    return combined_df

def remove_word(combined_df):
    for column in combined_df.columns:
        if combined_df[column].dtype == 'object':
            combined_df[column] = combined_df[column].str.replace('register,', '', case=False, regex=True).str.strip()
    return combined_df

def drop_words_rows(combined_df):
    mask = combined_df.apply(lambda  row: row.astype(str).str.contains('Terms', case=False).any(), axis=1)
    return combined_df[~mask]
    



def data_pipeline(combined_df, subset_rows):
    print("Starting data pipeline")
    
    combined_df = drop_nan(combined_df)
    combined_df = drop_nan_rows(combined_df)
    combined_df = drop_duplicates(combined_df, subset_rows)
    combined_df = lower(combined_df)
    combined_df = clean_special_char(combined_df)
    combined_df = apply_remove_bullet(combined_df)
    combined_df = remove_word(combined_df)
    combined_df = drop_words_rows(combined_df)
    # df_sampled = apply_spell_check(df_sampled)
    
    print("Final shape of DataFrame:", combined_df.shape)
    return combined_df



df_cleaned = data_pipeline(combined_df, 'Title')
print("Cleaned DataFrame:")
print(df_cleaned)


Starting data pipeline
Before drop_nan: (7758, 4)
After drop_nan: (7758, 3)
After drop_row_nan: (5136, 3)
Before drop_duplicates: (5136, 3)
After drop_duplicates: (4291, 3)
Converting text to lowercase.
Cleaning special characters.
Removing bullets.


  return combined_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Final shape of DataFrame: (3677, 3)
Cleaned DataFrame:
                                                  Title  \
0                          a return to the black forest   
1                             aberdeen butteries rowies   
2                                    ackee and saltfish   
3     acqua e sale puglian bread salad with marinate...   
4            acqua pazza with hake, fennel and tomatoes   
5                                   adaptable bean soup   
6                    adjaruli khachapuri georgian bread   
7                                              affogato   
8                              afternoon tea sandwiches   
9                                                  aoli   
10          air fryer apple, pear and raspberry crumble   
11                           air fryer appletopped cake   
12                       air fryer aubergine parmigiana   
13                                 air fryer baked eggs   
14                               air fryer baked potato   
1

In [16]:
pd.set_option('display.max_colwidth', None)  
pd.set_option('display.max_rows', None)     
pd.set_option('display.max_columns', None)

In [13]:
df_cleaned.head()

Unnamed: 0,Title,Ingredients,Instructions
0,a return to the black forest,225g/8oz dark chocolate 75 per cent cocoa soli...,"preheat the oven to 180c/350f/gas 4., break th..."
1,aberdeen butteries rowies,"500g/1lb 2oz strong plain flour, plus extra fo...","in a large bowl, mix together the flour, yeast..."
2,ackee and saltfish,"1 tbsp vegetable oil, 2 onions, thinly sliced,...",for the roasted peppers and onions if using. p...
3,acqua e sale puglian bread salad with marinate...,"2 red onions, finely sliced, 2 tsp soft brown ...","toss the onions with the sugar, vinegar and sa..."
4,"acqua pazza with hake, fennel and tomatoes","2 hake fillets about 150g/5oz each, skin on an...",in a large frying pan add a splash of olive oi...


In [12]:

# row_labels = [4541, 2973, 2996]  # Replace with actual labels
# print(df_sampled.loc[row_labels])

In [18]:

df_finale = pd.DataFrame(df_cleaned)

# Save the DataFrame as a CSV file
df_finale.to_csv("TEST_DEC_data.csv", index=False)  # Set index=False to avoid saving the index column

print("DataFrame saved as 'TEST_DEC_data.csv'")

DataFrame saved as 'TEST_DEC_data.csv'


In [None]:
from spellchecker import SpellChecker

def calculate_spelling_accuracy(text_file):

    spell = SpellChecker()

    with open(text_file, 'r', encoding='utf-8') as f:
        text_content = f.read()

    words = text_content.split()

    misspelled = spell.unknown(words)

    total_words = len(words)
    correct_words = total_words - len(misspelled)
    accuracy = correct_words / total_words if total_words > 0 else 0

    print(f"Total Words: {total_words}")
    print(f"Correct Words: {correct_words}")
    print(f"Misspelled Words: {len(misspelled)}")
    print(f"Spelling Accuracy: {accuracy:.2%}")

    return accuracy

text_file = '../pipeline/TEST_DEC_data.csv'

calculate_spelling_accuracy(text_file)

Total Words: 957071
Correct Words: 940698
Misspelled Words: 16373
Spelling Accuracy: 98.29%


0.9828925962650629