# Améliorez le produit IA de votre start-up

## Traitement des commentaires

In [1]:
# basic libs
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import operator
import warnings
# text 
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# image
import cv2
# import tensorflow as tf
from tensorflow import convert_to_tensor 
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator

warnings.simplefilter(action='ignore', category=FutureWarning)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

# nltk.download()

debug = 1

if debug:
    nrows = 1000
else:
    nrows = None
    
filename = './yelp_dataset/yelp_academic_dataset_review.json'

### Comments analysis Baseline

In [2]:
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
stop_words = [stemmer.stem(w) for w in list(nltk.corpus.stopwords.words('english'))]

def clean_up(text):
    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

In [3]:
def get_negative_comments(comments):
    negative_comments = comments[comments["stars"]<3]["text"].to_list()

    for comment_id in range(len(negative_comments)):
        negative_comments[comment_id] = negative_comments[comment_id].replace('\n', ' ')
        
    return negative_comments

In [4]:
comments = pd.read_json(filename, lines=True, nrows=nrows)

negative_comments = get_negative_comments(comments)

tokenized_text = [*map(clean_up, negative_comments)]

In [5]:
freqs = {}
for comment in tokenized_text:
    for word in list(set(comment)):
        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 1

freqs_list = [(x, freqs[x]/nrows)for x in freqs]
freqs_list = sorted(freqs_list, key=lambda x:x[1])[::-1]

In [6]:
# print(freqs_list[:500])

n = 200 # deleting the 200 most used words

n_frequent = [freqs_list[x][0]for x in range(n)]

def clear_words(tokens):
    return [w for w in tokens if not w in n_frequent]

tokenized_text = [*map(clear_words, tokenized_text)]

In [7]:
text = [' '.join(comment) for comment in tokenized_text]

In [8]:
n_topics = 10

tf_vectoriser = CountVectorizer(max_df=.6, min_df=10, max_features=100)
tf = tf_vectoriser.fit_transform(text)

lda = LatentDirichletAllocation(
        n_components=n_topics, 
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=42).fit(tf)


In [9]:
def display_topics(model, feature_names, ntw): # do multiple runs and hyperparameters optimization
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-ntw - 1:-1]]))

n_top_words = 10
display_topics(lda, tf_vectoriser.get_feature_names(), n_top_words)

Topic 0:
hard fix drive yelp pick wrong sorri owner problem treat
Topic 1:
sandwich sauc bread appet plate dinner item chang bring arriv
Topic 2:
okay kid waiter almost bread past drive quick half sat
Topic 3:
poor door night total absolut understand probabl 20 decent may
Topic 4:
free cook enough clean kid home park wast select bread
Topic 5:
phone let decent later item clear store issu sat problem
Topic 6:
shop item wish move horribl store bring mean wrong month
Topic 7:
card clean credit wast care ago sat dinner send store
Topic 8:
care across overal point matter recent may yelp soon arriv
Topic 9:
employe dri notic item gave clear respons woman home matter


In [10]:
features_names = tf_vectoriser.get_feature_names()

In [11]:
words_df = pd.DataFrame(columns=features_names, data=tf.toarray())
for col_name in features_names:
    words_df[col_name] = words_df[col_name].astype('bool')
words_df.head()

Unnamed: 0,20,absolut,across,ago,almost,appet,arriv,base,bit,bland,...,treat,understand,waiter,wast,wish,without,woman,write,wrong,yelp
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


### Picture analysis baseline

In [12]:
nrows = 100

In [13]:
pictures = pd.read_json("photos.json", lines=True, nrows=nrows)[["photo_id", "label"]]

pictures.head()

Unnamed: 0,photo_id,label
0,Un_Og6jfhazVn7CxszkKEw,drink
1,BFE1AFOs27scnnfeBf99ZA,drink
2,7t-C0r1JRdoVD9FS7M-N7Q,drink
3,rLnw0d-YYZvT9kR4y7h7_Q,drink
4,Cv5M8MDw8a5NEWvw2AQ4nw,drink


In [14]:
def loader_normaliser(name):
    image = cv2.imread(os.path.join("yelp_photos", f"{name}.jpg"))
    image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_LINEAR)
    image = cv2.GaussianBlur(image, (5, 5), 0)
    
    return convert_to_tensor(np.asarray(image).astype(np.float32))
    # return convert_to_tensor(image)

In [16]:
pictures["picture"] = pictures["photo_id"].apply(loader_normaliser)

pictures = pictures[["picture", "label"]]

X_train, X_test, y_train, y_test = train_test_split(pictures["picture"], pictures['label'], random_state=42, test_size=0.2)

pictures.head()

Unnamed: 0,picture,label
0,"(((tf.Tensor(27.0, shape=(), dtype=float32), t...",drink
1,"(((tf.Tensor(154.0, shape=(), dtype=float32), ...",drink
2,"(((tf.Tensor(76.0, shape=(), dtype=float32), t...",drink
3,"(((tf.Tensor(246.0, shape=(), dtype=float32), ...",drink
4,"(((tf.Tensor(62.0, shape=(), dtype=float32), t...",drink


In [17]:
model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

x = model.output

predictions = Dense(10, activation='softmax')(x)

new_model = Model(inputs=model.input, outputs=predictions)

for layer in model.layers[:5]:
    layer.trainable = False
    
new_model.compile(loss="categorical_crossentropy", optimizer=SGD(learning_rate=0.001, momentum=0.9), metrics=["accuracy"])

model_info = new_model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=2)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type tensorflow.python.framework.ops.EagerTensor).