# Task 3: Mining Additional Dish Names

## Preprocessing the Data


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, MWETokenizer
from gensim.models import word2vec
import string
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Text-cleaning helpers: the stemmer and stop-word set are built once and
# shared by every call to text_clean.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_clean(datas, *, stem=False):
    """Normalise raw text strings into space-separated token strings.

    Each non-blank entry is tokenised with ``word_tokenize`` and lower-cased.
    Tokens that are English stop words, or that start or end with a
    punctuation character, are dropped. Entries that are empty after
    stripping are skipped entirely, so the result can be shorter than
    ``datas``.

    Args:
        datas: Iterable of raw text strings.
        stem: When true, every kept token is replaced by its Porter stem.
            Defaults to ``False``, which reproduces the historical output:
            the earlier implementation called ``ps.stem(word)`` but threw the
            result away, so tokens were never actually stemmed.

    Returns:
        A list with one cleaned string per non-blank input entry. An entry
        whose tokens were all filtered out contributes ``''``.
    """
    punctuation = set(string.punctuation)
    res = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        kept = []
        # Tokenizing the paragraph
        for word in word_tokenize(data):
            word = word.lower()
            # The emptiness guard keeps the word[0] / word[-1] lookups safe.
            if not word or word in stop_words:
                continue
            if word[0] in punctuation or word[-1] in punctuation:
                continue
            kept.append(ps.stem(word) if stem else word)
        res.append(' '.join(kept))
    return res

def get_paras(url):
    """Read a text file and return its non-blank lines, whitespace-stripped.

    Args:
        url: Path handed to ``open``; the file is decoded as UTF-8.

    Returns:
        The stripped lines in file order, with blank lines left out.
    """
    with open(url, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text != ""]

In [3]:
# Load the Chinese-category review file: one list entry per non-blank line.
chinese_url = "Category/Chinese.txt"
chinese_reviews = get_paras(chinese_url)

In [4]:
chinese_reviews

['I really like both Chinese restaurants in town.  This one has outstanding crab rangoon.  Love the chicken with snow peas and mushrooms and General Tso Chicken.  Food is always ready in 10 minutes which is accurate.  Good place and they give you free pop.',
 'Above average takeout with friendly staff. The sauce on the pan fried noodle is tasty. Dumplings are quite good.',
 "We order from Chang Jiang often and have never been disappointed.  The menu is huge, and can accomodate anyone's taste buds.  The service is quick, usually ready in 10 minutes.",
 "Good enough for carry-out in McFarland on a cold winter's evening.  Ready on time, never had a wrong order, and they're at least reasonably friendly.  Would be better if they delivered, but not sure there's a huge demand for it in our tiny burg.  They also have a lunch buffet on weekdays for around $6.",
 "Best Chinese food madison! I've tried them all and make the 15 minute drive every week :-)",
 "Awful, awful, awful...  The worst of t

In [5]:
# Lower-case every review and drop stop words and punctuation tokens;
# each review becomes one space-separated string.
chinese_cleaned = text_clean(chinese_reviews)

In [6]:
chinese_cleaned

['really like chinese restaurants town one outstanding crab rangoon love chicken snow peas mushrooms general tso chicken food always ready 10 minutes accurate good place give free pop',
 'average takeout friendly staff sauce pan fried noodle tasty dumplings quite good',
 'order chang jiang often never disappointed menu huge accomodate anyone taste buds service quick usually ready 10 minutes',
 'good enough carry-out mcfarland cold winter evening ready time never wrong order least reasonably friendly would better delivered sure huge demand tiny burg also lunch buffet weekdays around 6',
 'best chinese food madison tried make 15 minute drive every week',
 'awful awful awful worst worst cookie-cutter chinese restaurants food without flavor menu without imagination staff profound indifference hard tell even',
 'tried local terrible bland chinese food much else say never return wife wanted mention sweet sour chicken battered fried chicken pieces side sauce bowl asked vegetables like picture

## Build a corpus for the Word2Vec model

In [7]:
def build_phrases(sentences, min_count=5, threshold=7, progress_per=1000):
    """Train a phrase (collocation) detector and return its frozen form.

    Args:
        sentences: Iterable of tokenised sentences (each a list of str).
        min_count: Passed to ``Phrases``; defaults to the value that was
            previously hard-coded here.
        threshold: Passed to ``Phrases``; defaults to the value that was
            previously hard-coded here.
        progress_per: Passed to ``Phrases``; defaults to the value that was
            previously hard-coded here.

    Returns:
        A ``Phraser`` wrapping the trained ``Phrases`` model.
    """
    phrases = Phrases(
        sentences,
        min_count=min_count,
        threshold=threshold,
        progress_per=progress_per,
    )
    return Phraser(phrases)

In [18]:
def sentence_to_bi_grams(phrases_model, sentence):
    """Run one tokenised sentence through a phrase model and re-join it.

    Args:
        phrases_model: Object that is indexed with ``sentence`` and yields an
            iterable of string tokens (presumably the model returned by
            ``build_phrases`` -- confirm against the caller).
        sentence: The tokenised sentence to transform.

    Returns:
        The resulting tokens joined by single spaces.
    """
    merged_tokens = phrases_model[sentence]
    separator = ' '
    return separator.join(merged_tokens)

In [19]:
def sentences_to_bi_grams(n_grams, input_file_name, output_file_name):
    """Rewrite a text file with phrases merged, one sentence per output line.

    Every sentence read from ``input_file_name`` is cleaned, tokenised, run
    through ``sentence_to_bi_grams`` and written to ``output_file_name``
    followed by a newline. The output file is overwritten.

    NOTE(review): ``get_sentences``, ``clean_sentence`` and ``tokenize`` are
    not defined in the visible part of this file; unless they are provided
    elsewhere, calling this function raises ``NameError`` -- confirm.

    Args:
        n_grams: Phrase model forwarded to ``sentence_to_bi_grams``.
        input_file_name: Path of the text file to read.
        output_file_name: Path of the text file to (over)write.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default
    # encoding and matches how get_paras reads the review corpus.
    with open(input_file_name, 'r', encoding='utf-8') as input_file_pointer, \
            open(output_file_name, 'w', encoding='utf-8') as out_file:
        for sentence in get_sentences(input_file_pointer):
            cleaned_sentence = clean_sentence(sentence)
            tokenized_sentence = tokenize(cleaned_sentence)
            parsed_sentence = sentence_to_bi_grams(n_grams, tokenized_sentence)
            out_file.write(parsed_sentence + '\n')

In [8]:
# Turn each cleaned review into the token list that Word2Vec expects.
# str.split() with no argument returns [] for a review whose tokens were all
# filtered out, whereas split(' ') returned [''] and fed an empty-string
# token to the model.
chinese_cleaned_word = [sentence.split() for sentence in chinese_cleaned]

In [9]:
# Train the word-embedding model on the tokenised reviews.
model = word2vec.Word2Vec(
    sentences=chinese_cleaned_word,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=4,      # ignore tokens seen fewer than 4 times
    workers=1,        # single worker thread
    epochs=20,        # passes over the corpus
)

In [10]:
# Size of the trained vocabulary (number of keys held by model.wv).
vocab_len = len(model.wv)
vocab_len

9412

In [11]:
# rock_idx = model.wv.key_to_index["dish"]

In [12]:
# Print every vocabulary token together with its position in
# model.wv.index_to_key.
# NOTE(review): gensim documents this list as ordered by descending frequency
# under default settings -- confirm.
# Uncomment the two lines below to stop after the first 28 entries.
for index, word in enumerate(model.wv.index_to_key):
#     if index == 28:
#         break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/9412 is food
word #1/9412 is good
word #2/9412 is n't
word #3/9412 is chinese
word #4/9412 is place
word #5/9412 is chicken
word #6/9412 is like
word #7/9412 is rice
word #8/9412 is great
word #9/9412 is service
word #10/9412 is order
word #11/9412 is restaurant
word #12/9412 is soup
word #13/9412 is fried
word #14/9412 is one
word #15/9412 is get
word #16/9412 is really
word #17/9412 is ordered
word #18/9412 is would
word #19/9412 is go
word #20/9412 is time
word #21/9412 is beef
word #22/9412 is sauce
word #23/9412 is back
word #24/9412 is lunch
word #25/9412 is noodles
word #26/9412 is menu
word #27/9412 is also
word #28/9412 is best
word #29/9412 is egg
word #30/9412 is always
word #31/9412 is pork
word #32/9412 is hot
word #33/9412 is dishes
word #34/9412 is dish
word #35/9412 is try
word #36/9412 is got
word #37/9412 is pretty
word #38/9412 is shrimp
word #39/9412 is little
word #40/9412 is nice
word #41/9412 is even
word #42/9412 is us
word #43/9412 is well
word #44/9412

In [13]:
# Look up the embedding for the token 'dish' (despite the variable's name,
# this is not the vector for "king").
vec_king = model.wv['dish']

In [14]:
# List the tokens most similar to 'cuisine' as (token, score) pairs;
# the recorded output shows the ten best matches.
model.wv.most_similar_cosmul('cuisine')

[('cuisines', 0.8536666631698608),
 ('fusion', 0.7886561751365662),
 ('province', 0.7885344624519348),
 ('bistro', 0.783573567867279),
 ('genuine', 0.7793936729431152),
 ('regional', 0.7788606286048889),
 ('region', 0.777807354927063),
 ('populated', 0.7716696858406067),
 ('population', 0.7664517164230347),
 ('specializes', 0.7658657431602478)]

In [15]:
# phrases_model.save('phrases_model.txt')
# phrases_model= Phraser.load('phrases_model.txt')

In [16]:
# def sentence_to_bi_grams(phrases_model, sentence):
#     return ' '.join(phrases_model[sentence])

In [17]:
# def sentences_to_bi_grams(n_grams, input_file_name, output_file_name):
#     with open(input_file_name, 'r') as input_file_pointer:
#         with open(output_file_name, 'w+') as out_file:
#             for sentence in get_sentences(input_file_pointer):
#                 cleaned_sentence = clean_sentence(sentence)
#                 tokenized_sentence = tokenize(cleaned_sentence)
#                 parsed_sentence = sentence_to_bi_grams(n_grams, tokenized_sentence)
#                 out_file.write(parsed_sentence + '\n')