In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
! cp -r --verbose '/content/drive/MyDrive/ESEO/I3/S9/Langage_naturel/TP_issou/dataset_cleaned.csv' .
DATASET_FILE = './dataset_cleaned.csv'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
'/content/drive/MyDrive/ESEO/I3/S9/Langage_naturel/TP_issou/dataset_cleaned.csv' -> './dataset_cleaned.csv'


In [3]:
dataset_df = pd.read_csv(DATASET_FILE)
dataset_df

Unnamed: 0,text,stars,length,text_cleaned
0,I've only had food from here once and it wasn'...,1,68,food memorable panang curry balance flavor lik...
1,I will never return here again. Ever. I was ...,1,87,NOT_return ever sit booth wait dinner come scu...
2,I wish my experience was great as others. I di...,1,166,wish experience great others din wednesday nig...
3,Are the rosemary grapefruit scones supposed to...,1,81,rosemary grapefruit scone suppose taste like w...
4,Our takeout order was half wrong. Food was mis...,1,32,takeout order half wrong food miss portion siz...
...,...,...,...,...
24995,I was a loyal fan of Aroy before the ownership...,5,75,loyal fan aroy ownership change apprehensive v...
24996,Stopped here for a bite while wandering around...,5,55,stopped bite wander around faneuil hall pleasa...
24997,"A quiet place with excellent food, great music...",5,32,quiet place excellent food great music helpful...
24998,Super delicious food. Awesome vibe. I suffered...,5,41,super delicious food awesome vibe suffer disne...


In [4]:
text_clean = dataset_df.text_cleaned
text_clean

0        food memorable panang curry balance flavor lik...
1        NOT_return ever sit booth wait dinner come scu...
2        wish experience great others din wednesday nig...
3        rosemary grapefruit scone suppose taste like w...
4        takeout order half wrong food miss portion siz...
                               ...                        
24995    loyal fan aroy ownership change apprehensive v...
24996    stopped bite wander around faneuil hall pleasa...
24997    quiet place excellent food great music helpful...
24998    super delicious food awesome vibe suffer disne...
24999    lot dietary restriction place spot superfood s...
Name: text_cleaned, Length: 25000, dtype: object

In [5]:
# separate the low stars from the other to keep working on the bad reviews
df12 = dataset_df[dataset_df.stars.isin([1,2])]

In [6]:
vectorizer = TfidfVectorizer(min_df=0.075, max_df=0.36) #max_df to keep the 'taste' and 'service'
X = vectorizer.fit_transform(df12.text_cleaned)
XX = vectorizer.get_feature_names_out()
print(X.shape)
print(XX)

(10000, 97)
['10' 'also' 'another' 'around' 'ask' 'back' 'bad' 'bar' 'best' 'bring'
 'call' 'check' 'chicken' 'cold' 'come' 'could' 'customer' 'day' 'dinner'
 'dish' 'drink' 'eat' 'even' 'ever' 'experience' 'find' 'first' 'friend'
 'fry' 'give' 'great' 'hour' 'however' 'know' 'last' 'leave' 'like'
 'little' 'location' 'long' 'look' 'make' 'manager' 'many' 'meal' 'menu'
 'minute' 'much' 'need' 'next' 'nice' 'night' 'nothing' 'ok' 'one' 'pay'
 'people' 'pretty' 'price' 'quality' 'really' 'restaurant' 'review'
 'right' 'sauce' 'say' 'seat' 'see' 'seem' 'serve' 'server' 'service'
 'sit' 'small' 'something' 'staff' 'star' 'still' 'table' 'take' 'taste'
 'tell' 'thing' 'think' 'though' 'time' 'try' 'two' 'use' 'wait'
 'waitress' 'walk' 'want' 'way' 'well' 'work' 'would']


In [7]:
from nltk import word_tokenize, pos_tag
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Noun extract and lemmatize function
def nouns(text):
    '''Given a string of text, tokenize the text 
    and pull out only the nouns.'''
    # create mask to isolate words that are nouns
    is_noun = lambda pos: pos[:2] in ['NN','VB']
    # store function to split string of words 
    # into a list of words (tokens)
    tokenized = word_tokenize(text)
    # store function to lemmatize each word
    wordnet_lemmatizer = WordNetLemmatizer()
    # use list comprehension to lemmatize all words 
    # and create a list of all nouns
    all_nouns = [wordnet_lemmatizer.lemmatize(word) for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    #return string of joined list of nouns
    return ' '.join(all_nouns)

# Create dataframe of only nouns from speeches
data_nouns = pd.DataFrame(df12.text_cleaned.apply(nouns))
# Visually Inspect
data_nouns.head()

Unnamed: 0,text_cleaned
0,food panang curry balance flavor taste coconut...
1,NOT_return sit wait dinner come scurry mouse b...
2,experience others din night week say jerk chic...
3,grapefruit scone taste box salt dump dough top...
4,order half food portion size forgiving place N...


In [9]:
# Add additional stop words since we are recreating the document-term matrix
stop_noun = ["america", 'today', 'thing']
stop_words_noun_agg = text.ENGLISH_STOP_WORDS.union()
# Create a document-term matrix with only nouns
# Store TF-IDF Vectorizer
tv_noun = TfidfVectorizer(stop_words=stop_words_noun_agg, ngram_range = (1,1), max_df = .36, min_df = .05)
# Fit and Transform speech noun text to a TF-IDF Doc-Term Matrix
data_tv_noun = tv_noun.fit_transform(data_nouns.text_cleaned)
# Create data-frame of Doc-Term Matrix with nouns as column names
data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns=tv_noun.get_feature_names())
# Set President's Names as Index
data_dtm_noun.index = df12.index
# Visually inspect Document Term Matrix
data_dtm_noun.head()



Unnamed: 0,area,ask,bar,beer,bland,bring,burger,business,charge,check,...,wait,waiter,waitress,walk,want,water,way,win,work,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.342885,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.423008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.183719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.229058,0.0,0.186402,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.391968,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    '''Given an NMF model, feature_names, and number of top words, print topic number and its top feature names, up to specified number of top words.'''
    # iterate through topics in topic-term matrix, 'H' aka
    # model.components_
    for ix, topic in enumerate(model.components_):
        #print topic, topic number, and top words
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [11]:
def find_topic(nb_topics, data,nb_words_print):
    nmf_model = NMF(nb_topics)
    doc_topic = nmf_model.fit_transform(data)
    display_topics(nmf_model, tv_noun.get_feature_names(), nb_words_print)
    return nmf_model, doc_topic

In [12]:
nmf_model ,doc_topic = find_topic(15, data_dtm_noun, 9)

''' the topics swap from order: the topic 0 could be the 5 the next time i work the code.
the ideas stay between them but rename them from their position is kinda usless  '''




Topic  0
try, look, think, people, want, know, review, thing, way

Topic  1
chicken, rice, sauce, fry, meat, salad, bland, lunch, portion

Topic  2
service, customer, server, location, rude, star, water, need, not_go

Topic  3
pizza, sauce, location, quality, salad, hour, home, guy, use

Topic  4
time, location, money, year, visit, use, experience, return, bring

Topic  5
wait, minute, hour, seat, tell, sit, people, table, walk

Topic  6
drink, bar, night, beer, sit, friend, dinner, area, water

Topic  7
restaurant, dinner, experience, menu, waiter, visit, quality, meal, expect

Topic  8
make, staff, location, customer, rude, feel, home, need, serve

Topic  9
come, server, waiter, ask, waitress, plate, water, table, meal

Topic  10
burger, fry, location, cook, beer, meat, expect, way, think

Topic  11
price, portion, quality, pay, menu, charge, way, use, expect

Topic  12
taste, sauce, bland, flavor, water, quality, cook, meat, rice

Topic  13
say, tell, ask, manager, customer, leave,



Topic  0
say, tell, ask, manager, customer, leave, pay, walk, want

Topic  1
chicken, rice, sauce, fry, bland, meat, salad, portion, lunch

Topic  2
service, customer, server, rude, location, star, water, need, check

Topic  3
time, location, money, year, visit, use, experience, return, leave

Topic  4
pizza, sauce, quality, location, salad, home, guy, minute, use

Topic  5
wait, minute, seat, sit, people, table, walk, tell, leave

Topic  6
drink, bar, beer, night, sit, friend, people, area, seat

Topic  7
restaurant, experience, dinner, waiter, menu, visit, quality, area, seat

Topic  8
burger, fry, location, cook, beer, meat, expect, way, home

Topic  9
make, staff, location, customer, work, rude, people, need, feel

Topic  10
price, portion, quality, pay, charge, menu, way, use, expect

Topic  11
try, eat, look, think, thing, want, know, review, way

Topic  12
taste, sauce, bland, flavor, water, quality, cook, meat, fry

Topic  13
come, server, waiter, ask, waitress, plate, water, friend, experience

Topic  14
hour, half, experience, tell, night, wait, dinner, day, offer

In [108]:
len(['atmosphere_sound','chicken_menu','bad_service','pizza_menu','???','long_wait','drinks','wrong_marketing','dirty','?','burger_menu','over_priced','not_tasty','not_accessible','???'])

15

In [149]:
topic_nuns = ['atmosphere_sound', 'chicken_menu', 'bad_service','pizza_menu','delivery', 'long_wait', 'drinks', 'wrong_marketing', 'dirty', 'rude_staff', 'burger_menu', 'over_priced', 'not_tasty', 'not_accessible', 'seasoning']
topics = pd.DataFrame(doc_topic, columns=topic_nuns)

'''don t work i don t know why ...
reviews=[]
for i in topic_nuns:
  print(i, topics.nlargest(1,i).index.values)
  #reviews.append(topics.nlargest(1,i).index.values)
print(reviews)

if i = 'service' don't work'''

reviews=[]
for i in topic_nuns:
  reviews.append(topics.nlargest(4,i).index.values[:])
print(reviews)

[array([ 799, 9079, 3309, 1876]), array([ 714, 1833, 1880, 1954]), array([ 49, 113, 771, 839]), array([ 756, 1826, 2749, 3362]), array([ 176, 1346, 2325, 2365]), array([5558, 3413, 4183, 2523]), array([1661, 8513, 6485, 3913]), array([ 492,  966, 2417, 4470]), array([3286, 3735, 5920, 7438]), array([1923, 3924, 4168, 4581]), array([4733, 8608, 5066, 2419]), array([1355, 1730, 6203, 7479]), array([ 146,  498,  888, 1224]), array([3675,  268, 1202, 1739]), array([1175, 5033, 7110, 7343])]


In [None]:
for i in range(len(reviews)):
  print(topic_nuns[i])
  for j in range(len(reviews[0])):
    print(dataset_df.text[reviews[i][j]],'\n')
  print('|||||||||||||||||||||||||||||||||||',)


In [150]:
#processing one by one for the hardest to put name on 
'''reviews_idd = topics.nlargest(100,'?').index.values[:]
print(list(reviews_idd))
for id in reviews_idd:
  print(dataset_df.text_cleaned[int(id)],'\n')

reviews_idd = topics.nlargest(100,'??').index.values[:]
print(list(reviews_idd))
for id in reviews_idd:
  print(dataset_df.text_cleaned[int(id)],'\n')

reviews_idd = topics.nlargest(100,'???').index.values[:]
print(list(reviews_idd))
for id in reviews_idd:
  print(dataset_df.text_cleaned[int(id)],'\n')'''

"reviews_idd = topics.nlargest(100,'?').index.values[:]\nprint(list(reviews_idd))\nfor id in reviews_idd:\n  print(dataset_df.text_cleaned[int(id)],'\n')\n\nreviews_idd = topics.nlargest(100,'??').index.values[:]\nprint(list(reviews_idd))\nfor id in reviews_idd:\n  print(dataset_df.text_cleaned[int(id)],'\n')\n\nreviews_idd = topics.nlargest(100,'???').index.values[:]\nprint(list(reviews_idd))\nfor id in reviews_idd:\n  print(dataset_df.text_cleaned[int(id)],'\n')"

In [151]:
topics

Unnamed: 0,atmosphere_sound,chicken_menu,bad_service,pizza_menu,delivery,long_wait,drinks,wrong_marketing,dirty,rude_staff,burger_menu,over_priced,not_tasty,not_accessible,seasoning
0,0.017373,0.000000,0.000273,0.000000,0.000000,0.000000,0.000000,0.002577,0.006601,0.000000,0.005360,0.004653,0.087385,0.001062,0.114924
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.063118,0.007840,0.003726,0.000000,0.097601,0.000000,0.000532,0.000000,0.064752,0.005767
2,0.019469,0.119238,0.000000,0.000000,0.000000,0.000000,0.070283,0.002483,0.000000,0.005464,0.000000,0.000000,0.000000,0.075814,0.001506
3,0.062545,0.000000,0.004105,0.000000,0.000000,0.000000,0.007005,0.000000,0.000000,0.006955,0.000000,0.000000,0.070485,0.000000,0.000000
4,0.001941,0.009114,0.000000,0.001902,0.000455,0.008089,0.000471,0.004164,0.000000,0.001528,0.001424,0.042010,0.002781,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.006024,0.014509,0.000000,0.000000,0.000000,0.000000,0.000000,0.003104,0.000068,0.000000,0.006395,0.009522,0.008708,0.000000,0.013996
9996,0.067962,0.014899,0.000578,0.045315,0.000000,0.000000,0.000000,0.000000,0.000000,0.028824,0.000000,0.000000,0.031057,0.000000,0.003347
9997,0.000000,0.000000,0.000000,0.000000,0.001540,0.000000,0.052665,0.005247,0.026445,0.017635,0.001951,0.036908,0.000000,0.109827,0.125919
9998,0.030310,0.013136,0.000000,0.005976,0.000000,0.000000,0.000000,0.000000,0.003591,0.000000,0.001825,0.007547,0.014814,0.000000,0.004041


PIKKLE


In [152]:
import pickle
with open('model','wb') as file:
    pickle.dump(nmf_model, file)
file.close()

In [153]:
with open('vect','wb') as file:
    pickle.dump(doc_topic, file)
file.close()

In [155]:
#download file
#! cp '/content/model' '/content/drive/MyDrive/ESEO/I3/S9/Langage_naturel/TP_issou'
#! cp '/content/vect' '/content/drive/MyDrive/ESEO/I3/S9/Langage_naturel/TP_issou'