# Améliorez le produit IA de votre start-up

In [41]:
# basic libs
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import operator
import warnings
# text 
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# image
import tensorflow as tf

# tf.compat.v1.disable_eager_execution()

warnings.simplefilter(action='ignore', category=FutureWarning)

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

# nltk.download()

debug = 1

if debug:
    nrows = 1000
else:
    nrows = None
    
filename = './yelp_dataset/yelp_academic_dataset_review.json'

batch_size = 100

### Comments analysis Baseline

In [42]:
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
stop_words = [stemmer.stem(w) for w in list(nltk.corpus.stopwords.words('english'))]

def clean_up(text):
    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [w for w in tokens if not w in stop_words]
    return tokens

In [43]:
def get_negative_comments(comments):
    negative_comments = comments[comments["stars"]<3]["text"].to_list()

    for comment_id in range(len(negative_comments)):
        negative_comments[comment_id] = negative_comments[comment_id].replace('\n', ' ')
        
    return negative_comments

In [44]:
comments = pd.read_json(filename, lines=True, nrows=nrows)

negative_comments = get_negative_comments(comments)

tokenized_text = [*map(clean_up, negative_comments)]

In [45]:
freqs = {}
for comment in tokenized_text:
    for word in list(set(comment)):
        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 1

freqs_list = [(x, freqs[x]/nrows)for x in freqs]
freqs_list = sorted(freqs_list, key=lambda x:x[1])[::-1]

In [46]:
# print(freqs_list[:500])

n = 200 # deleting the 200 most used words

n_frequent = [freqs_list[x][0]for x in range(n)]

def clear_words(tokens):
    return [w for w in tokens if not w in n_frequent]

tokenized_text = [*map(clear_words, tokenized_text)]

In [47]:
text = [' '.join(comment) for comment in tokenized_text]

In [48]:
n_topics = 10

vectoriser = CountVectorizer(max_df=.6, min_df=10, max_features=100)
vectorized = vectoriser.fit_transform(text)

lda = LatentDirichletAllocation(
        n_components=n_topics, 
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=42).fit(vectorized)


In [49]:
def display_topics(model, feature_names, ntw): # do multiple runs and hyperparameters optimization
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-ntw-1:-1]]))

n_top_words = 10
display_topics(lda, vectoriser.get_feature_names(), n_top_words)

Topic 0:
hard fix drive yelp pick wrong sorri owner problem treat
Topic 1:
sandwich sauc bread appet plate dinner item chang bring arriv
Topic 2:
okay kid waiter almost bread past drive quick half sat
Topic 3:
poor door night total absolut understand probabl 20 decent may
Topic 4:
free cook enough clean kid home park wast select bread
Topic 5:
phone let decent later item clear store issu sat problem
Topic 6:
shop item wish move horribl store bring mean wrong month
Topic 7:
card clean credit wast care ago sat dinner send store
Topic 8:
care across overal point matter recent may yelp soon arriv
Topic 9:
employe dri notic item gave clear respons woman home matter


In [50]:
features_names = vectoriser.get_feature_names()

In [51]:
words_df = pd.DataFrame(columns=features_names, data=vectorized.toarray())
for col_name in features_names:
    words_df[col_name] = words_df[col_name].astype('bool')
words_df.head()

Unnamed: 0,20,absolut,across,ago,almost,appet,arriv,base,bit,bland,...,treat,understand,waiter,wast,wish,without,woman,write,wrong,yelp
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


### Picture analysis baseline

In [52]:
nrows = None

AUTOTUNE = tf.data.AUTOTUNE

In [53]:
def get_path(name):
    return os.path.join("preprocessed_imgs", f"{name}.jpg")

def load_img(path, tgt):
    img = tf.io.read_file(path)
    img = tf.io.decode_jpeg(img)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.keras.applications.vgg16.preprocess_input(img)
    tgt = tf.reshape(tgt, [1])
    return img, tgt

In [54]:
pictures = pd.read_json("photos.json", lines=True, nrows=nrows)[["photo_id", "label"]]
pictures["path"] = pictures["photo_id"].apply(get_path)
pictures = pictures[["path", "label"]]
pictures = pictures.sample(frac=1)
pictures["label"] = pictures["label"].apply(lambda x:  {'interior': 0, 'outside': 1, 'menu': 2, 'food': 3, 'drink': 4}[x])

pictures = pictures.head(100) # warning

pictures.head()

Unnamed: 0,path,label
193365,preprocessed_imgs\54sA7Q7qqD7DMv1y7Hflpg.jpg,2
137348,preprocessed_imgs\_WPfNzS5f3rzCiivf2gaVw.jpg,1
148766,preprocessed_imgs\zmXyOs99i_1K5eWZQXMDww.jpg,1
100572,preprocessed_imgs\lfqE42tg1a5i-Hbg2r9J6Q.jpg,0
1242,preprocessed_imgs\u0WsndgqhyzsvByprRatAQ.jpg,4


In [55]:
BUFFER_SIZE = 1000
ds = tf.data.Dataset.from_tensor_slices((pictures["path"].values, tf.cast(pictures["label"].values, tf.int32))).shuffle(BUFFER_SIZE)
# ds = tf.data.Dataset.from_tensor_slices((pictures["path"].values, pictures["label"].values)).shuffle(BUFFER_SIZE)

In [56]:
ds = ds.map(load_img, num_parallel_calls=AUTOTUNE)

In [57]:
def set_shapes(image, label):
    image.set_shape((224, 224, 3))
    return image, label

ds = ds.map(set_shapes, num_parallel_calls=AUTOTUNE)

In [58]:
ds = ds.repeat()
ds = ds.batch(batch_size)
ds = ds.prefetch(buffer_size=AUTOTUNE)

In [59]:
def check_errors():
    errors = 0
    for image, label in ds.take(50):
        if image.shape != (224, 224, 3):
            errors += 1
            print(image.shape)
    if not errors:
        print('All good')
        
# check_errors()

In [60]:
def train_model():
    model = tf.keras.applications.vgg16.VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
    x = model.output
    predictions = tf.keras.layers.Dense(5, activation='softmax')(x)
    new_model = tf.keras.Model(inputs=model.input, outputs=predictions)
    for layer in model.layers[:5]:
        layer.trainable = False
    new_model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9), metrics=["accuracy"])
    model_info = new_model.fit(ds, epochs=10, steps_per_epoch=1, verbose=2)
    
    return model_info

train_model()

Epoch 1/10


InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [4900,5] and labels shape [100]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at \AppData\Local\Temp/ipykernel_9336/2769659046.py:9) ]] [Op:__inference_train_function_3332]

Function call stack:
train_function
