In [1]:
!pip install gensim
!pip install nltk



In [12]:
from gensim.models import FastText
from nltk.corpus import gutenberg
import nltk
import pandas as pd

In [122]:
df = pd.read_csv('../datasets/McDonald_s_Reviews.csv', encoding='latin1')

In [123]:
df.shape

(33396, 10)

In [124]:
df.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [125]:
df = df.dropna()
# take 1000  sample
df = df.sample(10000)

In [126]:
# preprocess the text
import spacy
import re

nlp = spacy.load('en_core_web_sm')

def preprocess(sent):
    sent = re.sub(r'[^\w\s]', '', sent)
    sent = re.sub(r' +', ' ', sent) # remove extra spaces
    sent = re.sub(r'\d+', '', sent) # remove numbers
    sent = re.sub(r'[^\x00-\x7F]+', '', sent) # remove non-english caracters
    sent = sent.lower()
    doc = nlp(sent)
    text = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    return text

In [127]:
preprocess("I'm going to McDonald's")

'm go mcdonald'

In [128]:
# apply the function
df['cleaned_text'] = df['review'].map(preprocess)

In [129]:
df_to_use = df['review']

In [130]:
df_to_use[:4]

24781    I work around this area and have stopped here ...
8080     This is a good place. Sometimes a little under...
33276    Excellent service, you can't eat there yet, bu...
9401     Staff behind the counter watched a homeless dr...
Name: review, dtype: object

In [131]:
# Tokenize the sentences
tokenized_sentences = [sentence.lower().split() for sentence in df_to_use]


In [132]:
tokenized_sentences[:2]

[['i',
  'work',
  'around',
  'this',
  'area',
  'and',
  'have',
  'stopped',
  'here',
  'a',
  'handful',
  'of',
  'times,',
  'every',
  'time',
  "i've",
  'waited',
  'at',
  'least',
  '10-15mins',
  'and',
  "cashier's",
  'are',
  'rude.',
  'i',
  'will',
  'go',
  'out',
  'of',
  'my',
  'way',
  'to',
  'not',
  'stop',
  'here',
  'again'],
 ['this',
  'is',
  'a',
  'good',
  'place.',
  'sometimes',
  'a',
  'little',
  'under',
  'staffed',
  'though.']]

In [133]:
# Train a FastText model
model = FastText(tokenized_sentences, min_count=1, vector_size=50, workers=4)

In [134]:
model.wv['cake']

array([-0.8406737 , -1.0084485 ,  0.1577413 , -0.47743762, -0.29762778,
        0.8986689 , -0.7937771 ,  0.42376137, -0.5066521 ,  0.05232739,
        1.0431557 , -0.31913328,  0.87096316,  0.68241984,  0.17450829,
        1.153313  ,  0.82497495,  0.19908705, -0.22270794, -0.79118043,
       -0.5161277 , -0.11860053,  0.8163841 ,  0.11286387, -0.3464386 ,
        0.1869431 , -0.42747468, -0.25715938, -1.3455997 , -0.14256155,
        0.9827322 ,  0.5412291 , -0.45143846,  1.7495729 , -0.06359109,
        1.0259166 ,  0.15628625,  0.78693545,  0.29224518,  0.60676485,
        0.7753454 , -0.05755339,  0.41251317,  0.95919   ,  1.4550781 ,
       -1.2452607 ,  0.31863514,  0.2930865 , -0.20839477,  0.30887115],
      dtype=float32)

In [135]:
import numpy as np

In [136]:
sent1 = ['cake', 'is', 'amazing']    
word_vectors = np.array([model.wv[word] for word in sent1])

# Average the word vectors
avg_vector = np.mean(word_vectors, axis=0)

# Find the most similar word to the average vector
# This is a simple way to predict the next word, but it's not very accurate
most_similar = model.wv.most_similar([avg_vector], topn=1)
most_similar[0][0]

'est.'

In [153]:
def predict_next_word(model, sentence):    
    # split the sentence into words
    sentence = sentence.lower().split()
    
    # Get the word vector for each word in the sentence
    word_vectors = np.array([model.wv[word] for word in sentence])

    # Average the word vectors
    avg_vector = np.mean(word_vectors, axis=0)

    # Find the most similar word to the average vector
    # This is a simple way to predict the next word, but it's not very accurate
    most_similar = model.wv.most_similar([avg_vector], topn=1)

    return most_similar[0][0]


In [167]:
predict_next_word(model, preprocess("Don't expect the place to be dirty"))

'place....'

In [168]:
predict_next_word(model, preprocess('Night time drive through  '))

'arrive'