# Data Analysis: Reviews
## Analysis of the reviews dataset

In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from src.data.data_reader import DataReader

# Execute from the src-directory root: walk up from the current working
# directory until the directory itself is named exactly 'src'.
_start_dir = Path.cwd()
while Path.cwd().name != 'src':
    if Path.cwd() == Path.cwd().parent:
        # Filesystem root reached: chdir('..') would be a no-op from here on,
        # so fail loudly instead of looping forever.
        os.chdir(_start_dir)
        raise FileNotFoundError(
            f"No 'src' directory found in or above {_start_dir}"
        )
    os.chdir('..')

In [None]:
# Fields of the raw review records that are kept for the analysis.
RELEVANT_REVIEW_FIELDS = [
    'review_id', 'user_id', 'business_id',
    'stars', 'useful', 'funny', 'cool',
    'text', 'date',
]

# Read the entries of the third expected data file (presumably the reviews
# file - confirm against DataReader.EXPECTED_FILES) from '../data', keep only
# the relevant fields, and turn the records into a DataFrame.
entries = DataReader._get_entries_from_file(
    Path('..') / 'data' / DataReader.EXPECTED_FILES[2]
)
filtered_entries = DataReader._filter_entries(entries, RELEVANT_REVIEW_FIELDS)
reviews = pd.DataFrame.from_records(filtered_entries)
reviews

Normalisation

In [None]:
# Min-max scale the star ratings. The scaler works on 2-D input, so the
# column is reshaped to (n_rows, 1) first and flattened again afterwards.
stars_as_column = reviews['stars'].to_numpy().reshape(-1, 1)
scaled_stars = preprocessing.MinMaxScaler().fit_transform(stars_as_column).flatten()

# Carry the frame's own index so the new column lines up with the original
# rows when it is concatenated back in.
normalised_column = pd.Series(
    scaled_stars,
    index=reviews.index,
    name='stars_normalised',
    dtype=np.float16,
)

# Swap the raw 'stars' column for its normalised counterpart.
reviews = pd.concat([reviews.drop(columns=['stars']), normalised_column], axis=1)
reviews

In [None]:
# Cleanup of the other fields: collapse the vote counters into binary flags
# and parse the timestamps. Column-wise (vectorised) operations are used
# instead of per-row Python callbacks.

# 1 when the 'useful' counter is non-zero, 0 otherwise.
reviews['useful'] = reviews['useful'].ne(0).astype(np.uint8)
# 1 when the 'funny' or the 'cool' counter is non-zero; 0 only when both
# counters are zero.
reviews['funny_cool'] = (reviews['funny'].ne(0) | reviews['cool'].ne(0)).astype(np.uint8)
reviews = reviews.drop(columns=['funny', 'cool'])
# Dates arrive as 'YYYY-MM-DD HH:MM:SS' strings; convert them to datetime64.
reviews['date'] = pd.to_datetime(reviews['date'], format='%Y-%m-%d %H:%M:%S')

reviews

Only keep reviews for restaurants

In [5]:
# Load the businesses through the project's DataReader; only the first frame
# of the first returned element is needed here.
(businesses, _, _), _ = DataReader().read_data()

# Keep only reviews whose business occurs in the businesses index, index the
# result by review id, and store the review text with pandas' string dtype.
is_known_business = reviews['business_id'].isin(businesses.index)
reviews = (
    reviews.loc[is_known_business]
    .set_index('review_id')
    .astype({'text': 'string'})
)
reviews.info()

Could not reach caches!
Reading files from disk: 100%|██████████| 3/3 [03:44<00:00, 74.70s/it, current: tips]        


<class 'pandas.core.frame.DataFrame'>
Index: 4731031 entries, KU_O5udG6zpxOg-VcAEodg to RwcKOdEuLRHNJe4M9-qpqg
Data columns (total 7 columns):
 #   Column            Dtype         
---  ------            -----         
 0   user_id           object        
 1   business_id       object        
 2   useful            uint8         
 3   text              string        
 4   date              datetime64[ns]
 5   stars_normalised  float16       
 6   funny_cool        uint8         
dtypes: datetime64[ns](1), float16(1), object(2), string(1), uint8(2)
memory usage: 198.5+ MB


In [2]:
# Load the data through the project's DataReader and rebind `businesses` and
# `reviews` to the first two frames it returns; the remaining results are
# discarded.
# NOTE(review): presumably no_train_test=True returns the complete review set
# rather than a train/test split - confirm in DataReader.read_data.
(businesses, reviews, _), _ = DataReader().read_data(no_train_test=True)

                                                                                              

In [3]:
# Display the loaded reviews frame (pandas abbreviates long frames).
reviews

Unnamed: 0_level_0,user_id,business_id,useful,text,date,stars_normalised,funny_cool
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,652,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,0.50,1
1,1,4603,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,0.50,1
2,2,2239,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,1.00,0
3,3,2161,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,0.75,0
4,4,972,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,0.00,1
...,...,...,...,...,...,...,...
4731026,33640,49745,1,Don't misinterpret my 5-star review....I don't...,2016-04-30 01:02:34,1.00,0
4731027,79346,47733,1,It is very rare for a restaurant to be this go...,2022-01-17 22:36:01,1.00,0
4731028,65859,48813,1,"Good, maybe very good. I went for lunch, so t...",2015-01-23 23:52:03,0.75,1
4731029,171192,48509,1,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27,1.00,1


In [16]:
# Fixed seed so the split, and every result derived from it, is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# Declared up front so editors know both halves of the split are DataFrames.
train_reviews: pd.DataFrame
test_reviews: pd.DataFrame
train_reviews, test_reviews = train_test_split(
    reviews, train_size=0.8, random_state=SPLIT_SEED
)

# Every business in the test set must also occur in the train set.
test_reviews = test_reviews[test_reviews['business_id'].isin(train_reviews['business_id'])]
# Every user in the test set must also occur in the train set.
test_reviews = test_reviews[test_reviews['user_id'].isin(train_reviews['user_id'])]

test_reviews

946207


Unnamed: 0_level_0,user_id,business_id,useful,text,date,stars_normalised,funny_cool
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4671879,225154,49025,0,I have given this place a 5 star review severa...,2021-10-30 23:40:02,0.25,1
3145117,225154,31991,1,This is based on the fact that Hattie's decide...,2018-06-08 16:29:35,0.00,1
2184130,225154,21550,1,Awesome happy hour and solid drink selection f...,2017-06-28 15:58:58,0.75,0
1179437,225154,12095,0,below average sushi. This place is like most b...,2015-12-04 16:05:47,0.25,1
2151562,225154,21258,0,Very good food and service. Its a quaint littl...,2015-12-04 17:19:49,1.00,1
...,...,...,...,...,...,...,...
3684490,200380,38000,0,"Normally a good experience, the employees are ...",2018-03-07 14:58:55,0.50,1
308114,208722,1174,0,Went to the Gravois and grand location for my ...,2019-06-10 16:34:46,0.25,1
552037,335924,6483,1,I absolute love love love this food.. Besides ...,2014-12-05 17:16:42,1.00,1
829788,445131,9691,0,I ordered my pizza online and it started being...,2020-11-25 06:38:05,0.00,1
