# Améliorez le produit IA de votre start-up

## Traitement des commentaires

In [1]:
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import warnings

# Silence FutureWarning messages emitted by the libraries used in this notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Display DataFrames without truncating rows, columns, line width or cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported way to disable cell-text truncation; the former -1
# sentinel was deprecated in pandas 1.0 and is rejected by recent releases.
pd.set_option('display.max_colwidth', None)

# nltk.download()

# Debug switch: any truthy value restricts the run to a small sample of rows.
debug = 1

# Row limit used when loading the reviews (None means "no limit").
nrows = 500 if debug else None

In [2]:
# Location of the Yelp review dump (JSON Lines: one JSON object per line).
filename = './yelp_dataset/yelp_academic_dataset_review.json'

# Read the reviews; `nrows` caps how many lines are parsed (None reads them all).
comments = pd.read_json(
    filename,
    nrows=nrows,
    lines=True,
)

In [3]:
# Preview the first five reviews.
comments.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,"Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. \n\nAfter reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pasta in a creamy sauce with bacon, asparagus and grana frita. Both were very good. We split a secondi which was the special Berkshire pork secreto, which was described as a pork skirt steak with garlic potato purée and romanesco broccoli (incorrectly described as a romanesco sauce). Some tables received bread before the meal but for some reason we did not. \n\nManagement also seems capable for when the tenants in the apartment above began playing basketball she intervened and also comped the tables a dessert. We ordered the apple dumpling with gelato and it was also quite tasty. Portions are not huge which I particularly like because I prefer to order courses. If you are someone who orders just a meal you may leave hungry depending on you appetite. Dining room was mostly younger crowd while the bar was definitely the over 40 set. Would recommend that the naysayers return to see the improvement although I personally don't know the former glory to be able to compare. Easy access to downtown Salem without the crowds on this month of October.",2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,"This store is pretty good. Not as great as Walmart (or my preferred, Milford Target), but closer and in a easier area to get to. \nThe store itself is pretty clean and organized, the staff are friendly (most of the time), and BEST of all is the Self Checkout this store has! \nGreat clearance sections throughout, and great prices on everything in the store, in general (they pricematch too!). \nChristian, Debbie, Jen and Hanna are all very friendly, helpful, sensitive to all customer needs. Definitely one of the better Target locations in the area, and they do a GREAT job assisting customers for being such a busy store. Located directly in the Framingham Mall on Cochituate Rd / Route 30. 4 stars.",2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,"I called WVM on the recommendation of a couple of friends who had used them in the past and thought they did a nice job. I'm a fan now, too.\n\nEvan and Cody showed up right on time for my move this past weekend. They were friendly and energetic, working quickly but carefully to get all my things moved out of the old place and into the new one in less than 2.5 hours. All of my (heavy) furniture arrived in perfect condition, and they took extra care not to scratch the wood floors in the process.\n\nI hope not to move again anytime soon, but next time I do, I'll be calling WVM.",2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance Marriott's and this was a huge disappointment! The front desk and atrium is nice..there is a starbucks on site which is nice.\n\nThe rooms are run down and old. There is a flat screen but that is to be expected of a Renaissance.\n\nWe got this hotel via Priceline at a rate of $75/night...good deal for the price but this is not a true Renaissance.,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,"The food is always great here. The service from both the manager as well as the staff is super. Only draw back of this restaurant is it's super loud. If you can, snag a patio table!",2011-07-28 18:05:01


### Baseline

In [4]:
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Stop words are stemmed too, so they can be compared with stemmed tokens.
stop_words = [stemmer.stem(w) for w in list(nltk.corpus.stopwords.words('english'))]
# Hashed copy of `stop_words` for constant-time membership tests in clean_up().
# It is a snapshot: rebuild it if `stop_words` is modified afterwards.
_stop_word_set = frozenset(stop_words)

def clean_up(text):
    """Turn a raw review into a list of stemmed, stop-word-free tokens.

    The text is split into runs of word characters (punctuation is dropped),
    every token is stemmed with the English Snowball stemmer, and tokens whose
    stem is a (stemmed) English stop word are discarded.

    Parameters
    ----------
    text : str
        Raw text of one review.

    Returns
    -------
    list of str
        The remaining stems, in their original order.
    """
    stems = (stemmer.stem(token) for token in tokenizer.tokenize(text))
    return [stem for stem in stems if stem not in _stop_word_set]

# print(clean_up("Hey, I'm an example text for OpenClassrooms course. Please pay attention !"))

In [5]:
# Tokenise, stem and filter every review; one token list per review.
tokenized_text = [clean_up(review) for review in comments["text"].to_list()]

In [6]:
# Create CBOW model (gensim's default training algorithm) on the stemmed reviews.
model1 = gensim.models.Word2Vec(tokenized_text, min_count = 1, vector_size = 100, window = 5)

# The vocabulary holds stems, so the probe words are stemmed the same way.
# With a small sample (debug mode) a probe may never occur in the corpus, in
# which case similarity() would raise KeyError; report that instead.
probe_words = [stemmer.stem(w) for w in ('rosebud', 'doorway')]
missing_words = [w for w in probe_words if w not in model1.wv.key_to_index]
if missing_words:
    print("Not in the Word2Vec vocabulary: {}".format(", ".join(missing_words)))
else:
    print(model1.wv.similarity(*probe_words))

0.4959898


In [7]:
# Re-assemble each token list into one space-separated string per review.
text = list(map(' '.join, tokenized_text))
print(text[:10])

['appar pride osteria rough summer evidenc almost empti dine room 6 30 friday night howev new blood kitchen seem revit food custom recent visit waitstaff warm unobtrus 8 pm left bar full dine room much live perhap bever resid prefer later seat read mix review late littl tentat choic luckili noth worri food depart start fri dough burrata prosciutto love although offer half portion pasta order entre size split chose tagliatell bolognes four chees fill pasta creami sauc bacon asparagus grana frita good split secondi special berkshir pork secreto describ pork skirt steak garlic potato puré romanesco broccoli incorrect describ romanesco sauc tabl receiv bread meal reason manag also seem capabl tenant apart began play basketbal interven also comp tabl dessert order appl dumpl gelato also quit tasti portion huge particular like prefer order cours someon order meal may leav hungri depend appetit dine room younger crowd bar definit 40 set would recommend naysay return see improv although person

In [8]:
# Number of latent topics to extract.
n_topics = 10

# Term-count matrix: drop terms present in more than 95% of the reviews or in
# fewer than 2 of them, and keep at most the 1000 most frequent terms.
tf_vectoriser = CountVectorizer(max_features=1000, min_df=2, max_df=.95)
tf = tf_vectoriser.fit_transform(text)

# Online variational LDA, seeded so that repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=42,
)
lda.fit(tf)


In [9]:
def display_topics(model, feature_names, ntw):
    """Print the top-weighted terms of each topic of a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, one row of term weights per topic.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    ntw : int
        Number of terms to print per topic, highest weight first.

    Note: multiple runs and hyperparameter optimisation are good practice
    before reading too much into the topics.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort, reversed, then truncated: the ntw heaviest terms.
        heaviest = weights.argsort()[::-1][:ntw]
        top_terms = [feature_names[term_idx] for term_idx in heaviest]
        print("Topic {}:".format(topic_idx))
        print(" ".join(top_terms))

# Number of terms shown for each topic.
n_top_words = 10

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_vectoriser, "get_feature_names_out"):
    feature_names = tf_vectoriser.get_feature_names_out()
else:
    feature_names = tf_vectoriser.get_feature_names()

display_topics(lda, feature_names, n_top_words)

Topic 0:
place great food good make super staff shop delici fun
Topic 1:
wrap hostess falafel custom servic gravi rude level good yelp
Topic 2:
burger like meat order delici place beef also flavor bacon
Topic 3:
line cleaner red fast anyth us simpli look know compar
Topic 4:
fabric like time help great good walk make one friend
Topic 5:
root need front like go desk realli time place servic
Topic 6:
mention coffe leas order good us food one told never
Topic 7:
time get servic go order like back would need look
Topic 8:
place food good great like go servic time get friend
Topic 9:
ramen place bowl love base biscuit add custom size great
