# Améliorez le produit IA de votre start-up

In [71]:
### basic libs
import operator
import os
import random
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# api
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
# text 
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# image
import tensorflow as tf

# Hide FutureWarning messages emitted by the imported libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Optional pandas display settings, kept disabled:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)


# --- run configuration -------------------------------------------------
# A truthy `debug` caps the number of review rows read from `filename`;
# a falsy value removes the cap (nrows=None).
debug = 1
nrows = 1000 if debug else None

# Line-delimited JSON file holding the reviews.
filename = './yelp_dataset/yelp_academic_dataset_review.json'

# Number of images per batch in the tf.data pipeline.
batch_size = 16

### API queries

In [72]:
# Read the API key from a local file so it never appears in the notebook.
with open("api.key", "r") as f:
    # readline() keeps the trailing newline; strip it so it does not end up
    # inside the Authorization header value.
    api_key = f.readline().strip()

if not api_key:
    raise ValueError("api.key is empty: expected the API key on its first line")

header = {'Authorization': f'bearer {api_key}',
          'Content-Type': 'application/json'}

# Select your transport with a defined url endpoint
transport = RequestsHTTPTransport(url="https://api.yelp.com/v3/graphql", headers=header, use_json=True)

# Create a GraphQL client using the defined transport
client = Client(transport=transport, fetch_schema_from_transport=True)

# Provide a GraphQL query: text and rating of the reviews attached to the
# businesses returned by a "restaurants" search in San Francisco.
query = gql(
    """
    {
      search(term:"restaurants",
             location:"san francisco") {
        business {
          reviews {
            text
            rating
          }
        }
      }
    }
    """
)

# Execute the query on the transport
result = client.execute(query)

In [73]:
# Flatten the nested search result into one record per review.
# The records are collected first and the frame is built once:
# DataFrame.append (removed in pandas 2.0) copied the whole frame on every
# call, which made the original loop quadratic.
review_records = [
    review
    for business in result["search"]["business"]
    # `or []` skips a business whose "reviews" field came back empty/null.
    for review in (business["reviews"] or [])
]
df_reviews = pd.DataFrame(review_records, columns=['text', 'rating'])

# Persist the fetched reviews (the index is written as the first CSV column).
df_reviews.to_csv('new_reviews.csv')

df_reviews.head()

Unnamed: 0,text,rating
0,Our whole family enjoyed Farmhouse Kitchen!\n\...,5
1,"Value is playing a role in my review, I don't ...",3
2,I think this place is overhyped. I've been rea...,2
3,"Food aside, I would go here again for the ador...",5
4,I recently found this place while searching fo...,5


### Comments analysis baseline

In [74]:
# Shared text-normalisation objects used by clean_up().
# `nltk` itself must be imported for the two lines below (the import cell
# provides it); the 'stopwords' corpus must also be available locally.
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Stop words are stemmed so they compare equal to the stemmed tokens.
# A set gives constant-time membership tests in clean_up().
stop_words = {stemmer.stem(w) for w in nltk.corpus.stopwords.words('english')}

def clean_up(text):
    """Tokenize ``text``, stem every token and drop stems found in ``stop_words``.

    Uses the module-level ``tokenizer``, ``stemmer`` and ``stop_words``.
    Returns the remaining stems as a list, in their original order.
    """
    stems = (stemmer.stem(token) for token in tokenizer.tokenize(text))
    return [stem for stem in stems if stem not in stop_words]

In [75]:
def get_negative_comments(comments):
    """Return the texts of the reviews rated below 3 stars.

    ``comments`` is a DataFrame with a "stars" and a "text" column.
    Every newline in a selected text is replaced by a single space.
    """
    is_negative = comments["stars"] < 3
    return [body.replace('\n', ' ') for body in comments.loc[is_negative, "text"]]

In [76]:
# Load the reviews; `nrows` caps how many lines of the file are read.
comments = pd.read_json(filename, lines=True, nrows=nrows)

# Keep the low-rated reviews only, then tokenize and stem each of them.
negative_comments = get_negative_comments(comments)
tokenized_text = [clean_up(comment) for comment in negative_comments]

In [77]:
# Document frequency: for every stem, the number of negative comments it
# occurs in (set() makes each stem count once per comment).
freqs = Counter(word for comment in tokenized_text for word in set(comment))

# Normalise by the number of reviews actually loaded. The original divided by
# `nrows`, which is None whenever the row cap is disabled and therefore raised
# a TypeError.
n_loaded = len(comments)

# Most frequent first. Ties are broken alphabetically so the ranking (and the
# cut made on it later) is reproducible: set iteration order of strings
# changes from one interpreter run to the next.
freqs_list = sorted(
    ((word, count / n_loaded) for word, count in freqs.items()),
    key=lambda item: (-item[1], item[0]),
)

In [78]:
# print(freqs_list[:500])

n = 200 # deleting the 200 most used words

# Slicing (instead of indexing range(n)) keeps this working when the corpus
# holds fewer than n distinct stems; indexing raised IndexError in that case.
n_frequent = [word for word, _ in freqs_list[:n]]

def clear_words(tokens):
    """Return ``tokens`` without the stems listed in ``n_frequent``."""
    return [w for w in tokens if w not in n_frequent]

tokenized_text = [*map(clear_words, tokenized_text)]

In [79]:
# Re-assemble each token list into one space-separated string, the input
# format expected by the vectorizer below.
text = list(map(' '.join, tokenized_text))

In [80]:
n_topics = 10

# Bag-of-words counts over the cleaned negative comments.
vectoriser = CountVectorizer(max_df=.6, min_df=10, max_features=100)
vectorized = vectoriser.fit_transform(text)

# Topic model settings, gathered in one place before fitting.
lda_params = dict(
    n_components=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=42,
)
lda = LatentDirichletAllocation(**lda_params).fit(vectorized)


In [81]:
def display_topics(model, feature_names, ntw):
    """Print, for each topic of ``model``, its ``ntw`` highest-weighted terms.

    ``model.components_`` provides one weight vector per topic and
    ``feature_names`` maps a column index to its term. Terms are printed
    space-separated, heaviest first, under a "Topic <index>:" line.
    """
    # TODO: do multiple runs and hyperparameters optimization
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the first ntw indices.
        top_term_ids = weights.argsort()[::-1][:ntw]
        top_terms = [feature_names[term_id] for term_id in top_term_ids]
        print("Topic {}:".format(topic_idx))
        print(" ".join(top_terms))

n_top_words = 10

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on installs that predate it.
_get_names = getattr(vectoriser, "get_feature_names_out", None) or vectoriser.get_feature_names
display_topics(lda, _get_names(), n_top_words)

Topic 0:
hard fix drive yelp pick wrong sorri owner problem treat
Topic 1:
sandwich sauc bread appet plate dinner item chang bring arriv
Topic 2:
okay kid waiter almost bread past drive quick half sat
Topic 3:
poor door night total absolut understand probabl 20 decent may
Topic 4:
free cook enough clean kid home park wast select bread
Topic 5:
phone let decent later item clear store issu sat problem
Topic 6:
shop item wish move horribl store bring mean wrong month
Topic 7:
card clean credit wast care ago sat dinner send store
Topic 8:
care across overal point matter recent may yelp soon arriv
Topic 9:
employe dri notic item gave clear respons woman home matter


In [82]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists. The result is converted to a plain
# list of str so it has the same type as what the old method returned.
_get_names = getattr(vectoriser, "get_feature_names_out", None) or vectoriser.get_feature_names
features_names = [str(name) for name in _get_names()]

In [83]:
# One row per negative comment, one boolean column per vocabulary term:
# True when the term occurs at least once in the comment.
term_counts = vectorized.toarray()
words_df = pd.DataFrame(term_counts, columns=features_names).astype('bool')
words_df.head()

Unnamed: 0,20,absolut,across,ago,almost,appet,arriv,base,bit,bland,...,treat,understand,waiter,wast,wish,without,woman,write,wrong,yelp
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


### Picture analysis baseline

In [84]:
# Remove the row cap: the photo metadata read below receives nrows=None.
# NOTE(review): this rebinds the same global `nrows` used by the review
# cells above, so re-running them after this cell reads with nrows=None.
nrows = None

# Value passed as num_parallel_calls / buffer_size in the tf.data pipeline below.
AUTOTUNE = tf.data.AUTOTUNE

In [85]:
def get_path(name):
    """Return the path of the JPEG called ``name`` inside "preprocessed_imgs"."""
    basename = "{}.jpg".format(name)
    return os.path.join("preprocessed_imgs", basename)

def load_img(path, tgt):
    """Load one JPEG and prepare it, with its label, for the VGG16 model.

    Args:
        path: scalar string tensor, location of the JPEG file.
        tgt: scalar label tensor.

    Returns:
        (img, tgt): the image after VGG16 preprocessing, and the label
        reshaped to shape [1].
    """
    img = tf.io.read_file(path)
    # channels=3 forces RGB output even for grayscale files, matching the
    # 3-channel input the network is built with.
    img = tf.io.decode_jpeg(img, channels=3)
    # vgg16.preprocess_input expects pixel values in [0, 255]. A plain cast
    # keeps that range; convert_image_dtype would rescale to [0, 1] and the
    # mean subtraction would then wipe out almost all of the signal.
    img = tf.cast(img, tf.float32)
    img = tf.keras.applications.vgg16.preprocess_input(img)
    # NOTE(review): no resize is done here; images are assumed to already
    # have the size the model expects -- confirm against the preprocessing step.
    tgt = tf.reshape(tgt, [1])
    return img, tgt

In [86]:
# Integer class id of every photo label, built once instead of once per row.
label_to_id = {'interior': 0, 'outside': 1, 'menu': 2, 'food': 3, 'drink': 4}

pictures = pd.read_json("photos.json", lines=True, nrows=nrows)[["photo_id", "label"]]
pictures = pictures.assign(
    path=pictures["photo_id"].apply(get_path),
    # Plain indexing keeps the original behaviour: an unknown label raises KeyError.
    label=pictures["label"].apply(lambda name: label_to_id[name]),
)[["path", "label"]]

# Seeded shuffle, so the subset kept below is the same on every run.
pictures = pictures.sample(frac=1, random_state=42)

pictures = pictures.head(100) # warning: only 100 photos are kept

pictures.head()

Unnamed: 0,path,label
113204,preprocessed_imgs\pktBAPWe9XbDGIfyejuj-A.jpg,0
47646,preprocessed_imgs\M9-HDkj4o_vCOJFp_6F_3Q.jpg,3
94950,preprocessed_imgs\eaQS-wZl6TQGWYrdc4K0JQ.jpg,0
73564,preprocessed_imgs\6-zZR29_OcHg0-8_6e7BLg.jpg,3
74545,preprocessed_imgs\ycH4ILdISi1XM2z_GTpWFA.jpg,3


In [87]:
# Size of the shuffle buffer.
BUFFER_SIZE = 1000

# (path, label) pairs; labels are cast to int32 before slicing.
picture_paths = pictures["path"].values
picture_labels = tf.cast(pictures["label"].values, tf.int32)
ds = tf.data.Dataset.from_tensor_slices((picture_paths, picture_labels)).shuffle(BUFFER_SIZE)

In [88]:
# Decode the images, repeat the dataset, then batch and prefetch.
ds = (
    ds.map(load_img, num_parallel_calls=AUTOTUNE)
    .repeat()
    .batch(batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)

In [89]:
# VGG16 convolutional base with ImageNet weights and no classification head.
model = tf.keras.applications.vgg16.VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(224, 224, 3),
)

# Flatten the base output into one feature vector per image.
x = model.output
flatten_layer = tf.keras.layers.Flatten()
predictions = flatten_layer(x)
new_model = tf.keras.Model(inputs=model.input, outputs=predictions)

# Take a single batch and show the dataset signature.
img = ds.take(1)
# pred = model.predict(img)
print(img)
# print(pred)

<TakeDataset shapes: ((None, None, None, 3), (None, 1)), types: (tf.float32, tf.int32)>


In [90]:
# print(pred)

In [93]:
def train_model(dataset=None, epochs=10, steps_per_epoch=256):
    """Fine-tune a VGG16-based 5-class classifier and return its training history.

    Args:
        dataset: batched tf.data.Dataset yielding (image, label) pairs.
            Defaults to the module-level ``ds``.
        epochs: number of training epochs.
        steps_per_epoch: number of batches drawn per epoch.

    Returns:
        The History object returned by ``fit``.
        NOTE(review): the trained network itself is not returned; Keras
        callbacks normally expose it as ``.model`` -- confirm before relying on it.
    """
    if dataset is None:
        dataset = ds

    base_model = tf.keras.applications.vgg16.VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
    features = tf.keras.layers.Flatten()(base_model.output)
    predictions = tf.keras.layers.Dense(5, activation='softmax')(features)
    new_model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

    # Freeze the first five layers of the base (input layer through
    # block2_conv1); every other layer stays trainable.
    for layer in base_model.layers[:5]:
        layer.trainable = False

    new_model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9), metrics=["accuracy"])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    new_model.summary()
    # The dataset is already batched: batch_size must not be passed to fit()
    # for dataset inputs.
    model_info = new_model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=2)

    return model_info

# NOTE: despite its name, `model` now holds the training History.
model = train_model()

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0  

<keras.callbacks.History at 0x18b31469400>