# Améliorez le produit IA de votre start-up

In [1]:
# basic libs
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import operator
import warnings
# text 
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# image
import cv2
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input

warnings.simplefilter(action='ignore', category=FutureWarning)

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

# nltk.download()

debug = 1

if debug:
    nrows = 1000
else:
    nrows = None
    
filename = './yelp_dataset/yelp_academic_dataset_review.json'

### Comments analysis Baseline

In [2]:
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
stop_words = [stemmer.stem(w) for w in list(nltk.corpus.stopwords.words('english'))]

def clean_up(text):
    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

In [3]:
def get_negative_comments(comments):
    negative_comments = comments[comments["stars"]<3]["text"].to_list()

    for comment_id in range(len(negative_comments)):
        negative_comments[comment_id] = negative_comments[comment_id].replace('\n', ' ')
        
    return negative_comments

In [4]:
comments = pd.read_json(filename, lines=True, nrows=nrows)

negative_comments = get_negative_comments(comments)

tokenized_text = [*map(clean_up, negative_comments)]

In [5]:
freqs = {}
for comment in tokenized_text:
    for word in list(set(comment)):
        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 1

freqs_list = [(x, freqs[x]/nrows)for x in freqs]
freqs_list = sorted(freqs_list, key=lambda x:x[1])[::-1]

In [6]:
# print(freqs_list[:500])

n = 200 # deleting the 200 most used words

n_frequent = [freqs_list[x][0]for x in range(n)]

def clear_words(tokens):
    return [w for w in tokens if not w in n_frequent]

tokenized_text = [*map(clear_words, tokenized_text)]

In [7]:
text = [' '.join(comment) for comment in tokenized_text]

In [8]:
n_topics = 10

vectoriser = CountVectorizer(max_df=.6, min_df=10, max_features=100)
vectorized = vectoriser.fit_transform(text)

lda = LatentDirichletAllocation(
        n_components=n_topics, 
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=42).fit(vectorized)


In [9]:
def display_topics(model, feature_names, ntw): # do multiple runs and hyperparameters optimization
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-ntw - 1:-1]]))

n_top_words = 10
display_topics(lda, vectoriser.get_feature_names(), n_top_words)

Topic 0:
hard fix drive yelp pick wrong sorri owner problem treat
Topic 1:
sandwich sauc bread appet plate dinner item chang bring arriv
Topic 2:
okay kid waiter almost bread past drive quick half sat
Topic 3:
poor door night total absolut understand probabl 20 decent may
Topic 4:
free cook enough clean kid home park wast select bread
Topic 5:
phone let decent later item clear store issu sat problem
Topic 6:
shop item wish move horribl store bring mean wrong month
Topic 7:
card clean credit wast care ago sat dinner send store
Topic 8:
care across overal point matter recent may yelp soon arriv
Topic 9:
employe dri notic item gave clear respons woman home matter


In [10]:
features_names = vectoriser.get_feature_names()

In [11]:
words_df = pd.DataFrame(columns=features_names, data=vectorized.toarray())
for col_name in features_names:
    words_df[col_name] = words_df[col_name].astype('bool')
words_df.head()

Unnamed: 0,20,absolut,across,ago,almost,appet,arriv,base,bit,bland,...,treat,understand,waiter,wast,wish,without,woman,write,wrong,yelp
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


### Picture analysis baseline

In [12]:
nrows = 100

AUTOTUNE = tf.data.AUTOTUNE

In [26]:
def get_path(name):
    return os.path.join("preprocessed_imgs", f"{name}.jpg")

def load_img(path, tgt):
    img = tf.io.read_file(filename=path[0])
    img = tf.io.decode_jpeg(img)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = preprocess_input(img)
    return img, tgt

In [14]:
pictures = pd.read_json("photos.json", lines=True, nrows=nrows)[["photo_id", "label"]]
pictures["path"] = pictures["photo_id"].apply(get_path)
pictures = pictures[["path", "label"]]
target = pictures.pop('label')

In [17]:
pictures.head()

Unnamed: 0,path
0,preprocessed_imgs\Un_Og6jfhazVn7CxszkKEw.jpg
1,preprocessed_imgs\BFE1AFOs27scnnfeBf99ZA.jpg
2,preprocessed_imgs\7t-C0r1JRdoVD9FS7M-N7Q.jpg
3,preprocessed_imgs\rLnw0d-YYZvT9kR4y7h7_Q.jpg
4,preprocessed_imgs\Cv5M8MDw8a5NEWvw2AQ4nw.jpg


In [27]:
load_img(['preprocessed_imgs/Un_Og6jfhazVn7CxszkKEw.jpg'], None)

(<tf.Tensor: shape=(224, 224, 3), dtype=float32, numpy=
 array([[[-103.472336, -116.27704 , -122.930984],
         [-103.48018 , -116.2888  , -122.927055],
         [-103.44881 , -116.26135 , -122.876076],
         ...,
         [-103.260574, -116.03783 , -122.88784 ],
         [-103.19391 , -115.97115 , -122.80941 ],
         [-103.18606 , -115.96331 , -122.80157 ]],
 
        [[-103.44881 , -116.26135 , -122.89176 ],
         [-103.45273 , -116.269196, -122.89176 ],
         [-103.42528 , -116.24567 , -122.83686 ],
         ...,
         [-103.27626 , -116.04959 , -122.899605],
         [-103.20175 , -115.979   , -122.81725 ],
         [-103.18999 , -115.96723 , -122.80549 ]],
 
        [[-103.38606 , -116.214294, -122.797646],
         [-103.393906, -116.22606 , -122.797646],
         [-103.37822 , -116.21037 , -122.76627 ],
         ...,
         [-103.28018 , -116.057434, -122.89568 ],
         [-103.20567 , -115.98292 , -122.81725 ],
         [-103.20175 , -115.97115 , -122.80549

In [28]:
ds = tf.data.Dataset.from_tensor_slices((pictures.values, target.values))
ds = ds.map(load_img, num_parallel_calls=AUTOTUNE)

In [31]:
def run_model():
    model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

    x = model.output

    predictions = Dense(5, activation='softmax')(x)

    new_model = Model(inputs=model.input, outputs=predictions)

    for layer in model.layers[:5]:
        layer.trainable = False

    new_model.compile(loss="sparse_categorical_crossentropy", optimizer=SGD(learning_rate=0.001, momentum=0.9), metrics=["accuracy"])
    
    model_info = new_model.fit(ds, epochs=10, batch_size=64, verbose=2)
    
run_model()

Epoch 1/10


ValueError: in user code:

    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\training.py:795 train_step
        y_pred = self(x, training=True)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1013 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\Annie\anaconda3\envs\deeplearning\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:267 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) +

    ValueError: Input 0 is incompatible with layer model_3: expected shape=(None, 224, 224, 3), found shape=(None, None, 3)
