In [2]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# List of words to ignore (measurements, quantities, descriptors)
stop_words = ['cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
              'according', 'taste', 'whole', 'cleaned', 'washed', 'nicely', 'finely',
              'chopped', 'and', 'or', 'diced', 'peeled', 'for', 'optional', 'adjust',
              'made', 'into', 'paste', 'size', 'variety', 'generous', 'pinch', 'small']

# Function to extract main ingredient names
def extract_main_ingredients(ingredient_string):
    # Process the text with spaCy NLP
    doc = nlp(ingredient_string)
    
    # List to store the main ingredient names
    ingredients = []
    
    # Loop through tokens and extract only relevant nouns (ignoring stop words)
    for token in doc:
        # Check if the token is a NOUN or PROPN (proper noun), not in stop words, and not a number
        if token.pos_ in ['NOUN', 'PROPN'] and token.text.lower() not in stop_words and not token.is_digit:
            ingredients.append(token.text)
    
    # Clean parentheses and duplicates
    unique_ingredients = []
    for ingredient in ingredients:
        # If parentheses are present, take the first part (main ingredient)
        if "(" in ingredient or ")" in ingredient:
            ingredient = ingredient.split("(")[0].strip()
        
        # Avoid duplicates and add only cleaned ingredient names
        if ingredient not in unique_ingredients:
            unique_ingredients.append(ingredient)
    
    return ', '.join(unique_ingredients)

# List of input ingredient strings
ingredient_lists = [
    '4 cups Indian borage (Doddapatre) , cleaned and washed 1/2 cup Curd (Dahi / Yogurt) 1 tablespoon Cumin seeds (Jeera) 1/2 teaspoon Whole Black Peppercorns 1/4 cup Dessicated Coconut Salt , to taste 2 tablespoons Ghee',
    '1 Fish , nicely washed with bones on (I used Pompano) For marination 2 tablespoons Curd (Dahi / Yogurt) 1/4 cup Onions , finely chopped 2 Green Chillies , chopped 1 tablespoon Tandoori masala 1 teaspoon Ginger Garlic Paste 1 teaspoon Red Chilli powder 1 teaspoon Garam masala powder 1 teaspoon Coriander Powder (Dhania) 1/2 teaspoon Cumin powder (Jeera) 1 tablespoon Coriander (Dhania) Leaves , finely chopped 1 tablespoon Oil Salt , to taste',
    ' 300 grams Colocasia root (Arbi)  1 Green Bell Pepper (Capsicum) , diced  1 Tomato , chopped  1 inch Ginger , grated  1 teaspoon Coriander Powder (Dhania)  1/2 teaspoon Turmeric powder (Haldi)  1/2 teaspoon Red Chilli powder  1 teaspoon Garam masala powder Salt , according to taste Oil , for cooking '
    # Add more ingredients here as needed
]

# Process each ingredient list and print the main ingredients
for ingredients_string in ingredient_lists:
    main_ingredients = extract_main_ingredients(ingredients_string)
    print(main_ingredients)


borage, Doddapatre, Curd, Dahi, Yogurt, Cumin, seeds, Jeera, Black, Peppercorns, Coconut, Salt, Ghee
Fish, bones, Pompano, marination, Curd, Dahi, Yogurt, Onions, Green, Chillies, Tandoori, masala, Ginger, Garlic, Red, Chilli, powder, Garam, Coriander, Powder, Dhania, Cumin, Jeera, Leaves, Oil, Salt
grams, Colocasia, root, Arbi, Green, Bell, Pepper, Capsicum, Tomato, inch, Ginger, Coriander, Powder, Dhania, powder, Haldi, Red, Chilli, Garam, masala, Salt, Oil
