In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Directory (relative to the working directory) holding input and output CSVs;
# created on first run so later reads and writes never hit a missing folder.
DATA_PATH = Path("data")
DATA_PATH.mkdir(exist_ok=True, parents=True)

def load_data(filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, joined onto ``data_path`` with ``/``.
    data_path : pathlib.Path
        Directory containing the file; defaults to ``DATA_PATH``.
    encoding : str
        Text encoding handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(data_path / filename, encoding=encoding)

def save_data(data, filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    """Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to write.
    filename : str
        Name of the CSV file, joined onto ``data_path`` with ``/``.
    data_path : pathlib.Path
        Directory to write into; defaults to ``DATA_PATH``.
    encoding : str
        Text encoding used for the output file.
    """
    csv_path = data_path / filename
    # Honour the caller's `encoding` instead of a fixed literal, so a
    # non-default encoding requested by the caller is actually applied.
    data.to_csv(csv_path, index=False, encoding=encoding)

# Directory (relative to the working directory) that save_fig writes figures
# into; created up front so saving never fails on a missing folder.
PLOT_PATH = Path("plot")
PLOT_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300, transparent=True):
    """Save the current matplotlib figure under ``PLOT_PATH``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (the extension is appended).
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    transparent : bool
        Passed through to ``savefig`` as ``transparent``.
    """
    target = PLOT_PATH / f"{fig_id}.{fig_extension}"
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(
        target,
        format=fig_extension,
        dpi=resolution,
        transparent=transparent,
    )

In [36]:
# Load the cleaned review table from the data directory (load_data reads it
# with its default ISO-8859-1 encoding) and show it as the cell output.
data = load_data("McDonald_s_Reviews_Cleaned.csv")
data

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review_basic,rating_numeric,rating_numerical,state
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,look like someone spit food normal transaction...,1,1.0,TX
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,it'd mcdonalds. far food atmosphere go. staff ...,4,4.0,TX
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star,made mobile order got speaker checked in. line...,1,1.0,TX
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,mc. crispy chicken sandwich customer service q...,5,5.0,TX
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,repeat order 3 time drive thru still manage me...,1,1.0,TX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32644,33392,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,4 years ago,They treated me very badly.,1 star,treated badly.,1,1.0,FL
32645,33393,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,The service is very good,5 stars,service good,5,5.0,FL
32646,33394,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,To remove hunger is enough,4 stars,remove hunger enough,4,4.0,FL
32647,33395,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,5 years ago,"It's good, but lately it has become very expen...",5 stars,good lately become expensive.,5,5.0,FL


In [37]:
# Topic modelling: fit a 5-topic LDA model on the pre-processed review text.
# LatentDirichletAllocation and CountVectorizer come from the notebook's
# import cell at the top.

# Drop reviews whose processed text is missing (CountVectorizer rejects NaN
# documents), then shuffle the remaining rows reproducibly. frac=1 keeps every
# row, so "reduced" in the names below does not mean the data was subsampled.
data_reduced = (
    data
    .dropna(subset=["processed_review_basic"])
    .sample(frac=1, random_state=42)
)

# Bag-of-words counts: ignore terms found in more than 90% of the reviews or
# in fewer than 2 reviews, and remove the built-in English stop words.
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words="english")

# Learn the vocabulary and build the document-term matrix
dtm_reduced = cv.fit_transform(data_reduced["processed_review_basic"])

# Initialize LDA with 5 topics; the fixed seed keeps the topics reproducible
lda_reduced = LatentDirichletAllocation(n_components=5, random_state=42)

# Fit LDA to the document-term matrix
lda_reduced.fit(dtm_reduced)

# Collect and print the top 20 words for each of the 5 topics
n = 20  # Number of top words to extract for each topic
feature_names = cv.get_feature_names_out()  # vocabulary lookup, computed once
top_words_per_topic_reduced_corrected = []

for index, topic in enumerate(lda_reduced.components_):
    # argsort is ascending, so the last n positions are the n highest-weighted
    # words, listed from lowest to highest weight.
    top_words = feature_names[topic.argsort()[-n:]].tolist()
    top_words_per_topic_reduced_corrected.append((index, top_words))

for topic_num, words in top_words_per_topic_reduced_corrected:
    print(f"Topic #{topic_num}:")
    print(words)


Topic #0:
['donald', 'late', 'mcdonald', 'night', 'say', 'place', 'closed', 'mc', 'hour', '24', 'cream', 'food', 'drive', 'ice', 'poor', 'terrible', 'open', 'slow', 'service', 'good']
Topic #1:
['large', 'asked', 'cheese', 'big', 'sauce', 'mcdonald', 'drink', 'nugget', 'time', 'sandwich', 'chicken', 'like', 'meal', 'cold', 'food', 'burger', 'got', 'ordered', 'order', 'fry']
Topic #2:
['mcdonalds', 'ok', 'nice', 'like', 'best', 'coffee', 'love', 'fresh', 'hot', 'place', 'neutral', 'clean', 'quick', 'staff', 'mcdonald', 'friendly', 'great', 'fast', 'service', 'food']
Topic #3:
['bathroom', 'employee', 'worst', 'location', 'lot', 'like', 'homeless', 'table', 'dirty', 'area', 'customer', 'bad', 'staff', 'food', 'nice', 'clean', 'people', 'service', 'place', 'mcdonald']
Topic #4:
['took', 'told', 'wrong', 'got', 'service', 'rude', 'waiting', 'customer', 'said', 'window', 'long', 'line', 'wait', 'manager', 'minute', 'time', 'food', 'drive', 'excellent', 'order']


In [38]:
# Score every review against the fitted topics, then label each review with
# the index of its highest-scoring topic.
topic_results_reduced = lda_reduced.transform(dtm_reduced)
data_reduced = data_reduced.assign(Topic=topic_results_reduced.argmax(axis=1))

data_reduced.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review_basic,rating_numeric,rating_numerical,state,Topic
28995,29732,McDonald's,Fast food restaurant,"1415 E State Rd, Fern Park, FL 32730, United S...",28.65535,-81.342692,1618,3 years ago,Neutral,3 stars,neutral,3,3.0,FL,2
31434,32175,McDonald's,Fast food restaurant,"632 S R L Thornton Freeway Service Rd, Dallas,...",32.744596,-96.812286,2658,4 years ago,Nice,4 stars,nice,4,4.0,TX,3
17250,17293,McDonald's,Fast food restaurant,"702-2 Haddonfield-Berlin Rd, Voorhees Township...",39.852059,-74.981099,933,a year ago,These are the slowest off-task workers I've ev...,1 star,slowest task worker i've ever seen. stop multi...,1,1.0,NJ,4
13738,13768,McDonald's,Fast food restaurant,"6875 Sand Lake Rd, Orlando, FL 32819, United S...",28.450387,-81.471414,19671,3 months ago,This McDonaldï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½...,2 stars,mcdonald unique pasta pizza bar arcade upstair...,2,2.0,FL,3
24577,25283,McDonald's,Fast food restaurant,"7010 Bradlick Shopping Center, Annandale, VA 2...",38.812823,-77.185055,595,5 years ago,Terrible,1 star,terrible,1,1.0,VA,0


In [39]:
# Count how many reviews were assigned to each topic, ordered by topic index.
data_reduced['Topic'].value_counts().sort_index()

0    4929
1    4638
2    8188
3    6251
4    8643
Name: Topic, dtype: int64

In [40]:
# Persist the reviews together with their assigned "Topic" column as a new
# CSV in the data directory.
save_data(data_reduced,"McDonald_s_Reviews_Cleaned_Classified.csv")