In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit as st

# Download the raw data from Yelp

URL: https://www.yelp.com/dataset/download
        
The total file size is 4GB. 

# Import raw data (json) and convert into CSV

It takes 5 minutes to run

In [10]:
import json

def conv_json_to_csv(inFile, outFile):
    with open(inFile) as f:
        data = []
        for line in f:
            if line.strip():  # Check if the line is not empty
                obj = json.loads(line)
                data.append(obj)

    # Open the CSV file in write mode
    with open(outFile, 'w', newline='') as file:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # Write the header row
        writer.writeheader()

        # Write the data rows
        writer.writerows(data)
        
conv_json_to_csv('raw_data/yelp_academic_dataset_review.json', 'notebooks/csv/yelp_review.csv')
conv_json_to_csv('raw_data/yelp_academic_dataset_business.json', 'notebooks/csv/yelp_business.csv')

# Create the dataframes (Business / Review / Rene)

In [8]:
df_business = pd.read_csv('notebooks/csv/yelp_business.csv')

# Ask Sophia to get the CSV file
df_review = pd.read_csv('notebooks/csv/new_orleans_reviews.csv')

# Ask Rene to get the CSV file
df_rene = pd.read_csv('notebooks/csv/avg_stars10m_radius.csv')

# Combine the 3 Tables into 1

## Combine df_business & df_rene

In [9]:
df_business = df_business[['business_id', 'stars','name','review_count','categories']]
df_business.head(2)

Unnamed: 0,business_id,stars,name,review_count,categories
0,Pns2l4eNsfO8kk83dixA6A,5.0,"Abby Rappoport, LAC, CMQ",7,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,3.0,The UPS Store,15,"Shipping Centers, Local Services, Notaries, Ma..."


In [10]:
df_rene = df_rene[["business_id","avg_stars10m_radius"]]
df_rene.head(2)

Unnamed: 0,business_id,avg_stars10m_radius
0,-0TffRSXXIlBYVbb5AwfTg,4.016129
1,-FM4CxOg4XXmX_Ebky_SiQ,4.192308


In [11]:
df_business = pd.merge(df_business, df_rene, on='business_id', how='inner')
df_business.head(2)

Unnamed: 0,business_id,stars,name,review_count,categories,avg_stars10m_radius
0,SZU9c8V2GuREDN5KgyHFJw,4.0,Santa Barbara Shellfish Company,2404,"Live/Raw Food, Restaurants, Seafood, Beer Bar,...",4.192308
1,ORL4JE6tz3rJxVqkdKfegA,3.0,Gaylord Opryland Resort & Convention Center,1639,"Venues & Event Spaces, Performing Arts, Arts &...",4.166667


## Combine df_review & df_business

In [14]:
df_review_business = pd.merge(df_review, df_business, on='business_id', how='inner')
df_review_business = df_review_business.rename(columns={'stars_x': 'stars'})
df_review_business = df_review_business.rename(columns={'stars_y': 'restaurant_avg_star'})
df_review_business.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,restaurant_avg_star,name,review_count,categories,avg_stars10m_radius
0,z0osLHDvXvzfm57D4DmD2Q,xVKE_HJ2pwUtTdLbL3pnCg,S2Ho8yLxhKAa26pBAm6rxA,3.0,0,0,0,"Service was crappy, and food was mediocre. I ...",2016-11-22 00:22:53,4.0,Creole House Restaurant & Oyster Bar,1594,"Cajun/Creole, Seafood, Restaurants, Breakfast ...",4.05814
1,tXHWJWnTdrraHGUqaPWj3g,zKAHSNzqvwsyoFCw3QpafA,S2Ho8yLxhKAa26pBAm6rxA,4.0,0,0,0,Enjoyed my fish out at a sidewalk table. A bi...,2018-05-19 01:14:05,4.0,Creole House Restaurant & Oyster Bar,1594,"Cajun/Creole, Seafood, Restaurants, Breakfast ...",4.05814


In [15]:
df_review_business = df_review_business[['review_id','business_id', 'stars','name','text','restaurant_avg_star','review_count','avg_stars10m_radius']]
df_review_business.head(2)

Unnamed: 0,review_id,business_id,stars,name,text,restaurant_avg_star,review_count,avg_stars10m_radius
0,z0osLHDvXvzfm57D4DmD2Q,S2Ho8yLxhKAa26pBAm6rxA,3.0,Creole House Restaurant & Oyster Bar,"Service was crappy, and food was mediocre. I ...",4.0,1594,4.05814
1,tXHWJWnTdrraHGUqaPWj3g,S2Ho8yLxhKAa26pBAm6rxA,4.0,Creole House Restaurant & Oyster Bar,Enjoyed my fish out at a sidewalk table. A bi...,4.0,1594,4.05814


# Pick Top 10 Restaurants


## df_review_top10

In [17]:
review_counts = df_review_business['business_id'].value_counts().reset_index(name='review_count')
review_counts.columns = ['business_id', 'review_count']

top_10_business_ids = review_counts.head(10)['business_id']

df_review_top10 = df_review_business[df_review_business['business_id'].isin(top_10_business_ids)]

df_review_top10.shape

(52968, 8)

In [18]:
df_review_top10.head(2)

Unnamed: 0,review_id,business_id,stars,name,text,restaurant_avg_star,review_count,avg_stars10m_radius
7987,yyrMqY7sNp5gT7KJ1AaYWA,GBTPC53ZrG1ZBY3DT8Mbcw,4.0,Luke,We have been here twice for brunch and have en...,4.0,4554,4.05814
7988,wEfzqOfbwn4Ohe2ZDOLAzw,GBTPC53ZrG1ZBY3DT8Mbcw,4.0,Luke,First meal in New Orleans. I had the $15 lunch...,4.0,4554,4.05814


In [24]:
top10_restaurants = df_review_top10['name'].unique()
top10_restaurants

array(['Luke', 'Gumbo Shop', "Commander's Palace", 'Royal House',
       "Felix's Restaurant & Oyster Bar", 'Cochon', "Mother's Restaurant",
       'Oceana Grill', 'Acme Oyster House', 'Ruby Slipper - New Orleans'],
      dtype=object)

## df_business_top10

In [30]:
df_business_top10 = df_business[df_business['name'].isin(top10_restaurants)]
df_business_top10.head(10)

Unnamed: 0,business_id,stars,name,review_count,categories,avg_stars10m_radius
11,GBTPC53ZrG1ZBY3DT8Mbcw,4.0,Luke,4554,"German, Restaurants, Seafood, Cocktail Bars, F...",4.05814
73,VQcCL9PiNL_wkGf-uF3fjg,4.0,Royal House,5070,"American (New), Restaurants, Sandwiches, Seafo...",4.05814
75,_C7QiQQc47AOEv4PE3Kong,4.5,Commander's Palace,4876,"French, Restaurants, Cocktail Bars, Nightlife,...",4.05814
96,qb28j-FNX1_6xm7u372TZA,4.0,Gumbo Shop,3902,"Cajun/Creole, Seafood, Restaurants",4.05814
130,VaO-VW3e1kARkU9bP1E7Fw,4.0,Felix's Restaurant & Oyster Bar,3966,"Restaurants, Sandwiches, Seafood, Cajun/Creole",4.05814
182,6a4gLLFSgr-Q6CZXDLzBGQ,4.0,Cochon,4421,"Cajun/Creole, Seafood, Restaurants",4.05814
187,iSRTaT9WngzB8JJ2YKJUig,3.5,Mother's Restaurant,5185,"Cajun/Creole, Restaurants, Event Planning & Se...",4.05814
215,ac1AeYqs8Z4_e2X5M3if2A,4.0,Oceana Grill,7400,"Restaurants, Seafood, Cajun/Creole, Breakfast ...",4.05814
218,_ab50qdWOk0DdB6XOrBitw,4.0,Acme Oyster House,7568,"Live/Raw Food, Seafood, Restaurants, Cajun/Creole",4.05814
294,oBNrLz4EDhiscSlbOl8uAw,4.5,Ruby Slipper - New Orleans,5193,"Restaurants, American (Traditional), American ...",4.05814


# Preprocess - Clean Text


In [31]:
def remove_punctuation(text):
    return "".join(char for char in text if not char in string.punctuation)
preprocessed_stopwords= [remove_punctuation(word) for word in stopwords.words('english')]

def preprocessing(sentence):
    # remove whitespace
    t1 = sentence.strip()
    # lowercase characters
    t2 = t1.lower()
    # remove numbers
    
    t3 = ''.join(char for char in t2 if not char.isdigit())
    # remove punctuation
    t4 = "".join(char for char in t3 if not char in string.punctuation)
    # tokenize
    tokens = t4.split(" ")
    # lemmatize
    stopwords_removed = [
        word for word in tokens if word not in preprocessed_stopwords
    ]
    return " ".join(stopwords_removed)

# Clean reviews
df_review_top10['clean_text'] = df_review_top10['text'].apply(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review_top10['clean_text'] = df_review_top10['text'].apply(preprocessing)


In [32]:
df_review_top10.head(2)

Unnamed: 0,review_id,business_id,stars,name,text,restaurant_avg_star,review_count,avg_stars10m_radius,clean_text
7987,yyrMqY7sNp5gT7KJ1AaYWA,GBTPC53ZrG1ZBY3DT8Mbcw,4.0,Luke,We have been here twice for brunch and have en...,4.0,4554,4.05814,twice brunch enjoyed immensely everything trie...
7988,wEfzqOfbwn4Ohe2ZDOLAzw,GBTPC53ZrG1ZBY3DT8Mbcw,4.0,Luke,First meal in New Orleans. I had the $15 lunch...,4.0,4554,4.05814,first meal new orleans lunch special seafood ...


# Export dataframes to CSV

In [33]:
df_review_top10.to_csv('notebooks/csv/df_review_top10.csv', index=False)

In [34]:
df_business_top10.to_csv('notebooks/csv/df_business_top10.csv', index=False)

# Preprocess - Vectorize

This cannot run

In [1]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorized_text = vectorizer.fit_transform(df_review_top10['clean_text'])
vectorized_text = pd.DataFrame(
    vectorized_text.toarray(),
    columns = vectorizer.get_feature_names_out()
)

NameError: name 'TfidfVectorizer' is not defined