In [1]:
# Importing required libraries
import pandas as pd
import json
import re
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager  
from string import punctuation
from textblob import TextBlob, Word
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import en_core_web_lg
nlp = en_core_web_lg.load()

## Scraping data from Swiggy

In [2]:
driver = webdriver.Chrome(executable_path='C:\driver\chromedriver.exe')
driver.get('https://www.swiggy.com/restaurants/voosh-thalis-and-bowls-indiranagar-bangalore-365812')

In [3]:
encoded_data = driver.find_elements_by_class_name("_2wg_t")

In [4]:
raw_data = [] 
for data in range(len(encoded_data)):
    raw_data.append(encoded_data[data].text)

In [5]:
desc=[]
Scraped_database = []
for i in range(len(raw_data)):
    dict_1={}
    x = raw_data[i].split("\n",10)
    if 'Bestseller' in x:
        dict_1['Bestseller'] = "Bestseller"
        dict_1['Must Try'] = ""
    else:
        if 'Must Try' in x:
            dict_1['Must Try'] = "Must Try"
            dict_1['Bestseller'] = ""
        else:    
            dict_1['Bestseller'] = ""
            dict_1['Must Try'] = ""
    if 'Bestseller' in x:
        x.remove("Bestseller")
    if "Must Try" in x:
        x.remove("Must Try")
    dict_1['Item_name'] = x[0]
    dict_1['Price'] = x[1]
    if len(x[2]) > 3:
        dict_1['Description'] = x[2]
        desc.append(x[2])
    else: 
        dict_1['Description'] = ""
    Scraped_database.append(dict_1)    

In [6]:
# Extracted infromation of food products from swiggy
# categories available
'''
    tags = Bestseller, Must Try
    Item_name
    Price
    Description
'''

Scraped_database

[{'Bestseller': '',
  'Must Try': '',
  'Item_name': 'Homely Bhindi Masala Thali with Sweet',
  'Price': '189',
  'Description': 'Enjoy a delicious meal with bhindi masala, dal tadka, 2 phulkas, rice and sweet. Amazing for one.'},
 {'Bestseller': '',
  'Must Try': '',
  'Item_name': 'Homely Dal Tadka Thali with Sweet, Roti or Rice',
  'Price': '189',
  'Description': 'Enjoy a delicious meal with dal tadka, dry veggie of the day, 2 phulkas, rice, raita and sweet. Amazing for one.'},
 {'Bestseller': 'Bestseller',
  'Must Try': '',
  'Item_name': 'Homely Paneer Masala Thali with Sweet',
  'Price': '194',
  'Description': 'Enjoy a delicious meal with paneer masala, dal tadka, 2 phulkas, rice and sweet. Amazing for one.'},
 {'Bestseller': 'Bestseller',
  'Must Try': '',
  'Item_name': 'Voosh Variety Thali with a Sweet & Butter Milk',
  'Price': '199',
  'Description': 'Treat yourself with delicious bhindi Masala, aloo gobhi, dal tadka, 2 phulka, rice ,sweet, butter milk. Amazing for one.'},

## Preprocessing data

In [7]:
# text cleaning
desc = re.sub('[^A-Za-z]+', ' ', str(desc).strip())
# remove stop words
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(desc) 
filtered_sentence = [w for w in word_tokens if not w in stop_words] 

filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 
desc = " ".join(filtered_sentence)   
desc

'Enjoy delicious meal bhindi masala dal tadka phulkas rice sweet Amazing one Enjoy delicious meal dal tadka dry veggie day phulkas rice raita sweet Amazing one Enjoy delicious meal paneer masala dal tadka phulkas rice sweet Amazing one Treat delicious bhindi Masala aloo gobhi dal tadka phulka rice sweet butter milk Amazing one Enjoy wholesome thali meal paneer masala dry veggie day dal tadka phulkas rice sweet butter milk Amazing one Enjoy delicious meal Egg masala dal tadka phulkas rice sweet Amazing one Enjoy delicious meal bhindi masala dal tadka phulkas rice sweet Amazing one Enjoy delicious meal dal tadka dry veggie day phulkas rice raita sweet Amazing one Enjoy delicious meal paneer masala dal tadka phulkas rice sweet Amazing one Treat delicious bhindi Masala aloo gobhi dal tadka phulka rice sweet butter milk Amazing one Enjoy wholesome thali meal paneer masala dry veggie day dal tadka phulkas rice sweet butter milk Amazing one Enjoy delicious meal Egg masala dal tadka phulkas ri

## Frequency count

In [8]:
# functions

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
#     ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
#             word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix["Words_frequency"] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [9]:
sentences = sent_tokenize(desc) # NLTK function to tokenize string
total_documents = len(sentences) # length of words

In [15]:
# Create the Frequency matrix of the words in each sentence.
Frequent_words = _create_frequency_matrix(sentences)
Frequent_words 

# generate database for visulization
vec=list()
frq=list()
for item in Frequent_words['Words_frequency']:
    vec.append(item)
    frq.append(Frequent_words['Words_frequency'][item])
    
database = pd.DataFrame({"vectors":vec ,"frequency":frq })    

df = database.sort_values(by=['frequency'],ascending=False,ignore_index=True)   #sort data
display(df.head(10))
df.to_excel("database.xlsx",index=False) # save data as excel file

Unnamed: 0,vectors,frequency
0,one,32
1,dal,28
2,meal,25
3,amazing,25
4,masala,22
5,sweet,21
6,rice,20
7,tadka,18
8,enjoy,14
9,phulkas,14


In [11]:
# Calculate TermFrequency and generate a matrix
TermFeq = _create_tf_matrix(Frequent_words)
TermFeq

{'Words_frequency': {'enjoy': 0.10606060606060606,
  'delicious': 0.09090909090909091,
  'meal': 0.1893939393939394,
  'bhindi': 0.06060606060606061,
  'masala': 0.16666666666666666,
  'dal': 0.21212121212121213,
  'tadka': 0.13636363636363635,
  'phulkas': 0.10606060606060606,
  'rice': 0.15151515151515152,
  'sweet': 0.1590909090909091,
  'amazing': 0.1893939393939394,
  'one': 0.24242424242424243,
  'dry': 0.06818181818181818,
  'veggie': 0.05303030303030303,
  'day': 0.05303030303030303,
  'raita': 0.015151515151515152,
  'paneer': 0.07575757575757576,
  'treat': 0.015151515151515152,
  'aloo': 0.06060606060606061,
  'gobhi': 0.045454545454545456,
  'phulka': 0.015151515151515152,
  'butter': 0.06060606060606061,
  'milk': 0.06060606060606061,
  'wholesome': 0.030303030303030304,
  'thali': 0.030303030303030304,
  'egg': 0.06060606060606061,
  'chicken': 0.022727272727272728,
  'onion': 0.045454545454545456,
  'parathas': 0.03787878787878788,
  'curd': 0.030303030303030304,
  'sala

## Top aspects from descriptions

In [12]:
"""Create a list of common words to remove"""

stop_words = stopwords.words('english')   
"""Define a function to extract keywords"""
def get_aspects(x):
    doc=nlp(x) ## Tokenize and extract grammatical components
    doc=[i.text for i in doc if i.text not in stop_words and i.pos_=="NOUN"] ## Remove common words and retain only nouns
    doc=list(map(lambda i: i.lower(),doc)) ## Normalize text to lower case
    doc=pd.Series(doc) 
    doc=doc.value_counts().head(50).index.tolist() ## Get top most frequent nouns
    return doc

corpus = get_aspects(str(desc))

In [14]:
# aspects with Term Frequency score
aspect_name=[]
aspect_score = []
for vec in TermFeq['Words_frequency']:
    if vec in corpus:
        aspect_name.append(vec)
        aspect_score.append(round(TermFeq['Words_frequency'][vec],4))
aspect_df= pd.DataFrame({'Item_name': aspect_name,'Item_Score':aspect_score})
aspect_df = aspect_df.sort_values(by=['Item_Score'],ascending=False,ignore_index=True)
display(aspect_df)
aspect_df.to_excel("aspects.xlsx",index=False) # save as excel

Unnamed: 0,Item_name,Item_Score
0,one,0.2424
1,dal,0.2121
2,meal,0.1894
3,masala,0.1667
4,rice,0.1515
5,tadka,0.1364
6,paneer,0.0758
7,salad,0.0682
8,aloo,0.0606
9,bhindi,0.0606


In [422]:
# # Creating a table for documents per words
# IDF_matrix = _create_documents_per_words(TermFeq)
# IDF_matrix 

In [423]:
# _create_idf_matrix(f2,f3,total_documents)