In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('indian_food.csv')
df=df.drop(columns=['course','state','region','diet','prep_time','cook_time','flavor_profile'],axis=1)

In [3]:
df.head()

Unnamed: 0,name,ingredients
0,Balu shahi,"Maida flour, yogurt, oil, sugar"
1,Boondi,"Gram flour, ghee, sugar"
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins"
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su..."
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,..."


In [4]:
df.shape

(255, 2)

In [5]:
df['ingredients']

0                        Maida flour, yogurt, oil, sugar
1                                Gram flour, ghee, sugar
2           Carrots, milk, sugar, ghee, cashews, raisins
3      Flour, ghee, kewra, milk, clarified butter, su...
4      Milk powder, plain flour, baking powder, ghee,...
                             ...                        
250              Glutinous rice, black sesame seeds, gur
251    Coconut milk, egg yolks, clarified butter, all...
252    Cottage cheese, dry dates, dried rose petals, ...
253    Milk powder, dry fruits, arrowroot powder, all...
254    Brown rice, fennel seeds, grated coconut, blac...
Name: ingredients, Length: 255, dtype: object

In [6]:
df.ingredients[3]

'Flour, ghee, kewra, milk, clarified butter, sugar, almonds, pistachio, saffron, green cardamom'

# Preprocessing

### Lowecasing

In [7]:
# df.ingredients[3].lower()
df['lowerCase_text']=df['ingredients'].str.lower() #create new column
df.head()

Unnamed: 0,name,ingredients,lowerCase_text
0,Balu shahi,"Maida flour, yogurt, oil, sugar","maida flour, yogurt, oil, sugar"
1,Boondi,"Gram flour, ghee, sugar","gram flour, ghee, sugar"
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins","carrots, milk, sugar, ghee, cashews, raisins"
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...","flour, ghee, kewra, milk, clarified butter, su..."
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...","milk powder, plain flour, baking powder, ghee,..."


### Remove punctuation

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
data_ingredients=df.ingredients[3]
def remove_punc(text):
    punc_rem="".join([p for p in text if p not in string.punctuation])
    return punc_rem 

print(remove_punc(data_ingredients))


Flour ghee kewra milk clarified butter sugar almonds pistachio saffron green cardamom


### Tokenization

In [10]:
data_ingredients=df.ingredients[3]
from nltk.tokenize import word_tokenize, sent_tokenize
def tokenize_word(text):
      punc_rem="".join([p for p in text if p not in string.punctuation]) 
      token_word=word_tokenize(punc_rem)
      return token_word

print(tokenize_word(data_ingredients))

['Flour', 'ghee', 'kewra', 'milk', 'clarified', 'butter', 'sugar', 'almonds', 'pistachio', 'saffron', 'green', 'cardamom']


### Stop word remove

In [11]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
data_ingredients=df.ingredients[3]
eng_stopwords=stopwords.words('english')
def remove_stop_words(text):
    rem_st_words=[word for word in text if word not in eng_stopwords]
    return rem_st_words
print(remove_stop_words(data_ingredients))

['F', 'l', 'u', 'r', ',', ' ', 'g', 'h', 'e', 'e', ',', ' ', 'k', 'e', 'w', 'r', ',', ' ', 'l', 'k', ',', ' ', 'c', 'l', 'r', 'f', 'e', ' ', 'b', 'u', 'e', 'r', ',', ' ', 'u', 'g', 'r', ',', ' ', 'l', 'n', ',', ' ', 'p', 'c', 'h', ',', ' ', 'f', 'f', 'r', 'n', ',', ' ', 'g', 'r', 'e', 'e', 'n', ' ', 'c', 'r']


### stemming

In [13]:
from nltk.stem.porter import PorterStemmer
word_portemmer=PorterStemmer()

In [14]:
token=tokenize_word(data_ingredients)
def stem_words(text):
    word_stem=[word_portemmer.stem(word) for word in text]
    return word_stem

print(stem_words(token))

['flour', 'ghee', 'kewra', 'milk', 'clarifi', 'butter', 'sugar', 'almond', 'pistachio', 'saffron', 'green', 'cardamom']


### Lemmatizing

In [15]:
from nltk.stem import WordNetLemmatizer
word_lemmatizer=WordNetLemmatizer()

In [16]:
token=tokenize_word(data_ingredients)
stem=stem_words(token)
def lemmatize_word(text):
    word_lemma=[word_lemmatizer.lemmatize(word) for word in text]
    return word_lemma
print(lemmatize_word(stem_words(token)))

['flour', 'ghee', 'kewra', 'milk', 'clarifi', 'butter', 'sugar', 'almond', 'pistachio', 'saffron', 'green', 'cardamom']
