In [2]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# List of words to ignore (measurements, quantities, descriptors)
stop_words = ['cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
              'according', 'taste', 'whole', 'cleaned', 'washed', 'nicely', 'finely',
              'chopped', 'and', 'or', 'diced', 'peeled', 'for', 'optional', 'adjust',
              'made', 'into', 'paste', 'size', 'variety', 'generous', 'pinch', 'small', 'grams', 'inch']

# Function to extract main ingredient names
def extract_main_ingredients(ingredient_string):
    # Process the text with spaCy NLP
    doc = nlp(ingredient_string)
    
    # List to store the main ingredient names
    ingredients = []
    
    # Track when parentheses open
    inside_parenthesis = False
    current_ingredient = []

    # Loop through tokens and extract only relevant nouns (ignoring stop words)
    for token in doc:
        # If the token is an opening parenthesis, start skipping text
        if token.text == "(":
            inside_parenthesis = True
            continue
        # If the token is a closing parenthesis, stop skipping text
        elif token.text == ")":
            inside_parenthesis = False
            continue
        
        # Only add words to current ingredient if not inside parentheses and relevant noun
        if not inside_parenthesis and token.pos_ in ['NOUN', 'PROPN', 'ADJ'] and token.text.lower() not in stop_words and not token.is_digit:
            current_ingredient.append(token.text)
        
        # If we're not inside parentheses and reach punctuation, consider it the end of an ingredient
        if token.is_punct and current_ingredient:
            ingredients.append(" ".join(current_ingredient))
            current_ingredient = []

    # Add the last ingredient if not added yet
    if current_ingredient:
        ingredients.append(" ".join(current_ingredient))

    # Remove duplicates
    unique_ingredients = list(dict.fromkeys(ingredients))
    
    return ', '.join(unique_ingredients)

# List of input ingredient strings
ingredient_lists = [
    '4 cups Indian borage (Doddapatre) , cleaned and washed 1/2 cup Curd (Dahi / Yogurt) 1 tablespoon Cumin seeds (Jeera) 1/2 teaspoon Whole Black Peppercorns 1/4 cup Dessicated Coconut Salt , to taste 2 tablespoons Ghee',
    '1 Fish , nicely washed with bones on (I used Pompano) For marination 2 tablespoons Curd (Dahi / Yogurt) 1/4 cup Onions , finely chopped 2 Green Chillies , chopped 1 tablespoon Tandoori masala 1 teaspoon Ginger Garlic Paste 1 teaspoon Red Chilli powder 1 teaspoon Garam masala powder 1 teaspoon Coriander Powder (Dhania) 1/2 teaspoon Cumin powder (Jeera) 1 tablespoon Coriander (Dhania) Leaves , finely chopped 1 tablespoon Oil Salt , to taste',
    '300 grams Colocasia root (Arbi) 1 Green Bell Pepper (Capsicum) , diced 1 Tomato , chopped 1 inch Ginger , grated 1 teaspoon Coriander Powder (Dhania) 1/2 teaspoon Turmeric powder (Haldi) 1/2 teaspoon Red Chilli powder 1 teaspoon Garam masala powder Salt , according to taste Oil , for cooking '
    # Add more ingredients here as needed
]

# Process each ingredient list and print the main ingredients
for ingredients_string in ingredient_lists:
    main_ingredients = extract_main_ingredients(ingredients_string)
    print(main_ingredients)


Indian borage, Curd, Cumin seeds Black Peppercorns Coconut Salt, Ghee
Fish, bones marination Curd, Onions, Green Chillies, Tandoori masala Ginger Garlic Red Chilli powder Garam masala powder Coriander Powder Cumin powder Coriander Leaves, Oil Salt
Colocasia root Green Bell Pepper, Tomato, Ginger, Coriander Powder Turmeric powder Red Chilli powder Garam masala powder Salt, Oil
