In [5]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit as st

# Download the raw data from Yelp

URL: https://www.yelp.com/dataset/download
        
The total file size is 4GB. 

# Import raw data (json) and convert into CSV

The JSON-to-CSV conversion below takes about 5 minutes to run.

In [10]:
import json

def conv_json_to_csv(inFile, outFile):
    """Convert a JSON Lines file (one JSON object per line) into a CSV file.

    Blank lines in the input are skipped. The CSV header is taken from the
    keys of the first record, in their original order, and every record is
    written as one row. Records are streamed from the input to the output,
    so the whole dataset is never held in memory at once.

    Args:
        inFile: Path of the JSON Lines file to read.
        outFile: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: If the input contains no records (raised before the
            output file is opened), or, from ``csv.DictWriter``, if a later
            record has a key that the first record does not have.
        json.JSONDecodeError: If a non-blank line is not valid JSON. Rows
            written before the offending line remain in the output file.
    """
    # Imported locally: `csv` is not imported anywhere else in the visible
    # notebook, so relying on a module-level name would raise NameError.
    import csv

    # Explicit UTF-8 on both files so the result does not depend on the
    # platform's default encoding.
    with open(inFile, encoding='utf-8') as src:
        # Lazy generator: one line is parsed at a time.
        records = (json.loads(line) for line in src if line.strip())

        first = next(records, None)
        if first is None:
            raise ValueError(f"No JSON records found in {inFile!r}")

        # newline='' lets the csv module control line endings itself.
        with open(outFile, 'w', newline='', encoding='utf-8') as dst:
            writer = csv.DictWriter(dst, fieldnames=list(first))
            writer.writeheader()
            writer.writerow(first)
            writer.writerows(records)
        
# Convert the raw review file first, then the raw business file.
for raw_json_path, csv_path in (
    ('raw_data/yelp_academic_dataset_review.json', 'notebooks/csv/yelp_review.csv'),
    ('raw_data/yelp_academic_dataset_business.json', 'notebooks/csv/yelp_business.csv'),
):
    conv_json_to_csv(raw_json_path, csv_path)

# Create the dataframes (Business / Review / Rene)

In [6]:
# Business table written by the JSON-to-CSV conversion step.
business_csv = 'notebooks/csv/yelp_business.csv'
# Not produced by this notebook: ask Sophia for this CSV file.
reviews_csv = 'notebooks/csv/new_orleans_reviews.csv'
# Not produced by this notebook: ask Rene for this CSV file.
radius_csv = 'notebooks/csv/avg_stars10m_radius.csv'

df_business = pd.read_csv(business_csv)
df_review = pd.read_csv(reviews_csv)
df_rene = pd.read_csv(radius_csv)

# Combine the 3 Tables into 1

## Combine df_business & df_rene

In [7]:
# Keep only the id, star rating, name, review count and categories columns.
business_columns = ['business_id', 'stars', 'name', 'review_count', 'categories']
df_business = df_business[business_columns]
df_business.head(2)

Unnamed: 0,business_id,stars,name,review_count,categories
0,Pns2l4eNsfO8kk83dixA6A,5.0,"Abby Rappoport, LAC, CMQ",7,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,3.0,The UPS Store,15,"Shipping Centers, Local Services, Notaries, Ma..."


In [8]:
# Keep only the join key and the avg_stars10m_radius value.
rene_columns = ["business_id", "avg_stars10m_radius"]
df_rene = df_rene[rene_columns]
df_rene.head(2)

Unnamed: 0,business_id,avg_stars10m_radius
0,-0TffRSXXIlBYVbb5AwfTg,4.016129
1,-FM4CxOg4XXmX_Ebky_SiQ,4.192308


In [9]:
# Inner join on business_id: only businesses that also have a row in df_rene
# are kept.
df_business = df_business.merge(df_rene, on='business_id', how='inner')
df_business.head(2)

Unnamed: 0,business_id,stars,name,review_count,categories,avg_stars10m_radius
0,SZU9c8V2GuREDN5KgyHFJw,4.0,Santa Barbara Shellfish Company,2404,"Live/Raw Food, Restaurants, Seafood, Beer Bar,...",4.192308
1,ORL4JE6tz3rJxVqkdKfegA,3.0,Gaylord Opryland Resort & Convention Center,1639,"Venues & Event Spaces, Performing Arts, Arts &...",4.166667


In [10]:
# Save the merged business table; index=False leaves the row index out of the file.
df_business.to_csv('notebooks/csv/df_business.csv', index=False)

## Combine df_review & df_business

In [11]:
# Both tables have a `stars` column, so the merge suffixes them:
# `_x` comes from df_review (left side), `_y` from df_business (right side).
star_column_names = {
    'stars_x': 'stars',
    'stars_y': 'restaurant_avg_star',
}
df = (
    df_review
    .merge(df_business, on='business_id', how='inner')
    .rename(columns=star_column_names)
)
df.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,restaurant_avg_star,name,review_count,categories,avg_stars10m_radius
0,z0osLHDvXvzfm57D4DmD2Q,xVKE_HJ2pwUtTdLbL3pnCg,S2Ho8yLxhKAa26pBAm6rxA,3.0,0,0,0,"Service was crappy, and food was mediocre. I ...",2016-11-22 00:22:53,4.0,Creole House Restaurant & Oyster Bar,1594,"Cajun/Creole, Seafood, Restaurants, Breakfast ...",4.05814
1,tXHWJWnTdrraHGUqaPWj3g,zKAHSNzqvwsyoFCw3QpafA,S2Ho8yLxhKAa26pBAm6rxA,4.0,0,0,0,Enjoyed my fish out at a sidewalk table. A bi...,2018-05-19 01:14:05,4.0,Creole House Restaurant & Oyster Bar,1594,"Cajun/Creole, Seafood, Restaurants, Breakfast ...",4.05814


In [12]:
# Narrow the merged table to the review text plus the rating-related columns.
analysis_columns = [
    'review_id',
    'business_id',
    'stars',
    'name',
    'text',
    'restaurant_avg_star',
    'review_count',
    'avg_stars10m_radius',
]
df = df[analysis_columns]
df.head(2)

Unnamed: 0,review_id,business_id,stars,name,text,restaurant_avg_star,review_count,avg_stars10m_radius
0,z0osLHDvXvzfm57D4DmD2Q,S2Ho8yLxhKAa26pBAm6rxA,3.0,Creole House Restaurant & Oyster Bar,"Service was crappy, and food was mediocre. I ...",4.0,1594,4.05814
1,tXHWJWnTdrraHGUqaPWj3g,S2Ho8yLxhKAa26pBAm6rxA,4.0,Creole House Restaurant & Oyster Bar,Enjoyed my fish out at a sidewalk table. A bi...,4.0,1594,4.05814


# Preprocess - Clean Text


In [None]:
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)
# Stopwords get the same punctuation stripping as the review text, so a
# stopword containing punctuation still matches a token after cleaning.
preprocessed_stopwords = list(map(remove_punctuation, stopwords.words('english')))

def preprocessing(sentence):
    """Normalise one review text before vectorisation.

    Steps, in order: trim surrounding whitespace, lowercase, drop digit
    characters, drop punctuation, split on whitespace, drop stopwords, and
    re-join the remaining tokens with single spaces. No lemmatisation is
    performed.

    Args:
        sentence: Raw review text. Any value that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as empty text.

    Returns:
        The cleaned text as one space-separated string, or ``""`` when no
        token survives.
    """
    # Non-string values have no .strip(); return empty text instead of
    # failing in the middle of a DataFrame.apply.
    if not isinstance(sentence, str):
        return ""

    lowered = sentence.strip().lower()
    without_digits = "".join(char for char in lowered if not char.isdigit())
    without_punctuation = remove_punctuation(without_digits)

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens. split(" ") kept tokens such as
    # "the\nfood" intact, so stopwords next to a line break were not removed.
    tokens = without_punctuation.split()

    # Built once per call so each membership test is a hash lookup rather
    # than a scan of the whole stopword list.
    stopword_set = frozenset(preprocessed_stopwords)
    return " ".join(word for word in tokens if word not in stopword_set)

# Clean reviews.
# assign() returns a new frame rather than writing a column into `df` in place.
# `df` was produced by selecting columns from another frame, and in-place
# assignment on such a frame can trigger pandas' SettingWithCopyWarning.
df = df.assign(clean_text=df['text'].apply(preprocessing))

In [None]:
# Show the first two rows to inspect the new clean_text column.
df.head(2)

# Pick Top 10 Restaurants and Export to CSV

In [None]:
# Count the reviews per business in `df`; value_counts() orders the result
# from most to fewest reviews.
review_counts = (
    df['business_id']
    .value_counts()
    .rename_axis('business_id')
    .reset_index(name='review_count')
)

# Because of that ordering, the first ten rows are the ten most-reviewed businesses.
top_10_business_ids = review_counts['business_id'].head(10)

# Keep only the reviews that belong to those ten businesses.
is_top_10 = df['business_id'].isin(top_10_business_ids)
df_top10 = df[is_top_10]

df_top10.head(2)

In [None]:
# Save the top-10 review subset; index=False leaves the row index out of the file.
df_top10.to_csv('notebooks/csv/df_top10.csv', index=False)

# Preprocess - Vectorize

In [None]:
# Fit TF-IDF over unigrams, bigrams and trigrams of the cleaned reviews.
vectorizer = TfidfVectorizer(ngram_range=(1,3))
tfidf_matrix = vectorizer.fit_transform(df_top10['clean_text'])

# Wrap the SciPy sparse matrix in a sparse-backed DataFrame instead of calling
# .toarray(). A dense (reviews x n-grams) array stores every zero explicitly
# and can exhaust memory with a 1-3 gram vocabulary.
# The result keeps a default RangeIndex and one column per n-gram.
vectorized_text = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=vectorizer.get_feature_names_out()
)