# Data Extraction

In the present section we will discuss the steps we took to derive features that could be helpful in building a regression model which could help us to predict the helpfulness score of a review.

In [2]:
# Importing necessary packages

import tqdm
import nltk
import sqlite3
import textstat
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from textblob import TextBlob

nltk.download('brown')

In [3]:
# Connecting to database
cnx = sqlite3.connect('./data/yelp_db.sqlite')

## 1. Restaurant Features

In [4]:
# Reading restaurant information from the table
df_restaurants = pd.read_sql_query("select * from restaurants", cnx)
df_restaurants.shape

(52268, 14)

#### Elapsed Days

In [5]:
# Query to retrieve the first review date for each restaurant
df_restaurant_first_review = pd.read_sql_query("SELECT MIN(date) as first_review_date, business_id FROM restaurant_reviews GROUP BY business_id", cnx)

# Set 'business_id' as the index of df_restaurant_first_review
df_restaurant_first_review.set_index('business_id', inplace=True)

# Join df_restaurant_first_review with df_restaurants on 'business_id'
df_restaurants = df_restaurants.join(df_restaurant_first_review, on='business_id', how='left')

# Convert 'first_review_date' column to datetime format
df_restaurants['first_review_date'] = pd.to_datetime(df_restaurants['first_review_date'], format='%Y-%m-%d %H:%M:%S')

# Retrieve the maximum review date from the 'reviews' table
max_review_date = pd.read_sql_query("SELECT MAX(date) FROM reviews", cnx)
max_review_date = datetime.strptime(max_review_date.values[0][0], '%Y-%m-%d %H:%M:%S')

# Calculate the number of elapsed days since the first review date
df_restaurants['max_date'] = max_review_date
df_restaurants['elapsed_days'] = (df_restaurants['max_date'] - df_restaurants['first_review_date']).dt.days

df_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,first_review_date,max_date,elapsed_days
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.9555052,-75.1555641,4.0,80,1,"{""RestaurantsDelivery"":""False"",""OutdoorSeating...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{""Monday"":""7:0-20:0"",""Tuesday"":""7:0-20:0"",""Wed...",2008-03-09 00:36:56,2022-01-19 19:48:45,5064
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{""BusinessParking"":""None"",""BusinessAcceptsCred...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-22:0"",""Wedn...",2012-12-18 08:45:44,2022-01-19 19:48:45,3319
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.5651648,-90.3210868,3.0,19,0,"{""Caters"":""True"",""Alcohol"":""u'full_bar'"",""Rest...","Pubs, Restaurants, Italian, Bars, American (Tr...",,2012-04-16 23:53:04,2022-01-19 19:48:45,3564
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.2081024,-86.7681696,1.5,10,1,"{""RestaurantsAttire"":""'casual'"",""RestaurantsGo...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-21:0"",""Wedn...",2011-07-01 23:01:11,2022-01-19 19:48:45,3854
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9552692,-82.4563199,4.0,10,1,"{""Alcohol"":""'none'"",""OutdoorSeating"":""None"",""R...","Vietnamese, Food, Restaurants, Food Trucks","{""Monday"":""11:0-14:0"",""Tuesday"":""11:0-14:0"",""W...",2018-07-25 14:35:33,2022-01-19 19:48:45,1274


#### Total Reviews

In [6]:
# Query to retrieve total number of reviews for each restaurant
df_total_reviews = pd.read_sql_query("select count(distinct (review_id)) as total_reviews, business_id from restaurant_reviews group by business_id", cnx)

# Set 'business_id' as the index of df_total_reviews
df_total_reviews.set_index('business_id', inplace=True)

# Join df_total_reviews with df_restaurants on 'business_id'
df_restaurants = df_restaurants.join(df_total_reviews, on='business_id', how='left')
df_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,first_review_date,max_date,elapsed_days,total_reviews
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.9555052,-75.1555641,4.0,80,1,"{""RestaurantsDelivery"":""False"",""OutdoorSeating...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{""Monday"":""7:0-20:0"",""Tuesday"":""7:0-20:0"",""Wed...",2008-03-09 00:36:56,2022-01-19 19:48:45,5064,87
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{""BusinessParking"":""None"",""BusinessAcceptsCred...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-22:0"",""Wedn...",2012-12-18 08:45:44,2022-01-19 19:48:45,3319,6
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.5651648,-90.3210868,3.0,19,0,"{""Caters"":""True"",""Alcohol"":""u'full_bar'"",""Rest...","Pubs, Restaurants, Italian, Bars, American (Tr...",,2012-04-16 23:53:04,2022-01-19 19:48:45,3564,19
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.2081024,-86.7681696,1.5,10,1,"{""RestaurantsAttire"":""'casual'"",""RestaurantsGo...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-21:0"",""Wedn...",2011-07-01 23:01:11,2022-01-19 19:48:45,3854,10
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9552692,-82.4563199,4.0,10,1,"{""Alcohol"":""'none'"",""OutdoorSeating"":""None"",""R...","Vietnamese, Food, Restaurants, Food Trucks","{""Monday"":""11:0-14:0"",""Tuesday"":""11:0-14:0"",""W...",2018-07-25 14:35:33,2022-01-19 19:48:45,1274,11


#### Total Positive Reviews and Ratio of Positive Reviews

In [7]:
# Query to retrieve count of review with star rating >= 4 for each restaurant
df_total_positive_reviews = pd.read_sql_query("select count(distinct (review_id)) as positive_reviews, business_id from restaurant_reviews where stars >= 4 group by business_id", cnx)

# filling any non-existing values with 0
df_total_positive_reviews['positive_reviews'].fillna(0, inplace=True)

# Set 'business_id' as the index of df_total_positive_reviews
df_total_positive_reviews.set_index('business_id', inplace=True)

# Join df_total_positive_reviews with df_restaurants on 'business_id'
df_restaurants = df_restaurants.join(df_total_positive_reviews, on='business_id', how='left')

# computing the ratio of positive review by dividing `positive_reviews` with `total_reviews` of each restaurant
df_restaurants['ratio_positive_reviews'] = df_restaurants['positive_reviews'] / df_restaurants['total_reviews']
df_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,first_review_date,max_date,elapsed_days,total_reviews,positive_reviews,ratio_positive_reviews
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.9555052,-75.1555641,4.0,80,1,"{""RestaurantsDelivery"":""False"",""OutdoorSeating...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{""Monday"":""7:0-20:0"",""Tuesday"":""7:0-20:0"",""Wed...",2008-03-09 00:36:56,2022-01-19 19:48:45,5064,87,69.0,0.793103
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{""BusinessParking"":""None"",""BusinessAcceptsCred...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-22:0"",""Wedn...",2012-12-18 08:45:44,2022-01-19 19:48:45,3319,6,2.0,0.333333
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.5651648,-90.3210868,3.0,19,0,"{""Caters"":""True"",""Alcohol"":""u'full_bar'"",""Rest...","Pubs, Restaurants, Italian, Bars, American (Tr...",,2012-04-16 23:53:04,2022-01-19 19:48:45,3564,19,10.0,0.526316
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.2081024,-86.7681696,1.5,10,1,"{""RestaurantsAttire"":""'casual'"",""RestaurantsGo...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-21:0"",""Wedn...",2011-07-01 23:01:11,2022-01-19 19:48:45,3854,10,1.0,0.1
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9552692,-82.4563199,4.0,10,1,"{""Alcohol"":""'none'"",""OutdoorSeating"":""None"",""R...","Vietnamese, Food, Restaurants, Food Trucks","{""Monday"":""11:0-14:0"",""Tuesday"":""11:0-14:0"",""W...",2018-07-25 14:35:33,2022-01-19 19:48:45,1274,11,8.0,0.727273


#### Total Negative Reviews and Ratio of Negative Reviews

In [8]:
# Query to retrieve count of review with star rating <= 2 for each restaurant
df_total_negative_reviews = pd.read_sql_query("select count(distinct (review_id)) as negative_reviews, business_id from restaurant_reviews where stars <= 2 group by business_id", cnx)

# filling any non-existing values with 0
df_total_negative_reviews['negative_reviews'].fillna(0, inplace=True)

# Set 'business_id' as the index of df_total_negative_reviews
df_total_negative_reviews.set_index('business_id', inplace=True)

# Join df_total_negative_reviews with df_restaurants on 'business_id'
df_restaurants = df_restaurants.join(df_total_negative_reviews, on='business_id', how='left')

# computing the ratio of negative review by dividing `negative_reviews` with `total_reviews` of each restaurant
df_restaurants['ratio_negative_reviews'] = df_restaurants['negative_reviews'] / df_restaurants['total_reviews']
df_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,categories,hours,first_review_date,max_date,elapsed_days,total_reviews,positive_reviews,ratio_positive_reviews,negative_reviews,ratio_negative_reviews
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.9555052,-75.1555641,4.0,80,...,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{""Monday"":""7:0-20:0"",""Tuesday"":""7:0-20:0"",""Wed...",2008-03-09 00:36:56,2022-01-19 19:48:45,5064,87,69.0,0.793103,9.0,0.103448
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-22:0"",""Wedn...",2012-12-18 08:45:44,2022-01-19 19:48:45,3319,6,2.0,0.333333,4.0,0.666667
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.5651648,-90.3210868,3.0,19,...,"Pubs, Restaurants, Italian, Bars, American (Tr...",,2012-04-16 23:53:04,2022-01-19 19:48:45,3564,19,10.0,0.526316,8.0,0.421053
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.2081024,-86.7681696,1.5,10,...,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-21:0"",""Wedn...",2011-07-01 23:01:11,2022-01-19 19:48:45,3854,10,1.0,0.1,8.0,0.8
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9552692,-82.4563199,4.0,10,...,"Vietnamese, Food, Restaurants, Food Trucks","{""Monday"":""11:0-14:0"",""Tuesday"":""11:0-14:0"",""W...",2018-07-25 14:35:33,2022-01-19 19:48:45,1274,11,8.0,0.727273,1.0,0.090909


#### Immediacy Index

Restaurant with large score shows that there is less number of reviews posted but all reviews have high ratings  [Malik and Hussain, 2017]

$=\frac{Average\ review\ rating\ of\ the\ restaurant}{Total\ reviews\ for\ the\ restaurant}$

[Malik and Hussain, 2017]: https://www.sciencedirect.com/science/article/abs/pii/S0747563217302121

In [9]:
df_restaurants['immediacy_index'] = df_restaurants['stars'] / df_restaurants['total_reviews']
df_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,hours,first_review_date,max_date,elapsed_days,total_reviews,positive_reviews,ratio_positive_reviews,negative_reviews,ratio_negative_reviews,immediacy_index
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.9555052,-75.1555641,4.0,80,...,"{""Monday"":""7:0-20:0"",""Tuesday"":""7:0-20:0"",""Wed...",2008-03-09 00:36:56,2022-01-19 19:48:45,5064,87,69.0,0.793103,9.0,0.103448,0.045977
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,"{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-22:0"",""Wedn...",2012-12-18 08:45:44,2022-01-19 19:48:45,3319,6,2.0,0.333333,4.0,0.666667,0.333333
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.5651648,-90.3210868,3.0,19,...,,2012-04-16 23:53:04,2022-01-19 19:48:45,3564,19,10.0,0.526316,8.0,0.421053,0.157895
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.2081024,-86.7681696,1.5,10,...,"{""Monday"":""0:0-0:0"",""Tuesday"":""6:0-21:0"",""Wedn...",2011-07-01 23:01:11,2022-01-19 19:48:45,3854,10,1.0,0.1,8.0,0.8,0.15
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.9552692,-82.4563199,4.0,10,...,"{""Monday"":""11:0-14:0"",""Tuesday"":""11:0-14:0"",""W...",2018-07-25 14:35:33,2022-01-19 19:48:45,1274,11,8.0,0.727273,1.0,0.090909,0.363636


### Curated Restaurant Features

In [10]:
df_restaurants = df_restaurants[['business_id', 'state', 'stars', 'total_reviews', 'elapsed_days', 'positive_reviews', 'ratio_positive_reviews', 'negative_reviews', 'ratio_negative_reviews', 'immediacy_index']]
df_restaurants.head()

Unnamed: 0,business_id,state,stars,total_reviews,elapsed_days,positive_reviews,ratio_positive_reviews,negative_reviews,ratio_negative_reviews,immediacy_index
0,MTSW4McQd7CbVtyjqoe9mw,PA,4.0,87,5064,69.0,0.793103,9.0,0.103448,0.045977
1,CF33F8-E6oudUQ46HnavjQ,TN,2.0,6,3319,2.0,0.333333,4.0,0.666667,0.333333
2,k0hlBqXX-Bt0vf1op7Jr1w,MO,3.0,19,3564,10.0,0.526316,8.0,0.421053,0.157895
3,bBDDEgkFA1Otx9Lfe7BZUQ,TN,1.5,10,3854,1.0,0.1,8.0,0.8,0.15
4,eEOYSgkmpB90uNA7lDOMRA,FL,4.0,11,1274,8.0,0.727273,1.0,0.090909,0.363636


In [11]:
df_restaurants.shape

(52268, 10)

## 2. User Features

In [19]:
# Reading restaurant users information from the table
df_restaurant_users = pd.read_sql_query("select * from restaurant_users", cnx)
df_restaurant_users.shape

(1445984, 22)

#### Activity Length

In [20]:
# Query to compute activity length from table by returning day elapsed between oldest and latest review
df_activity = pd.read_sql_query("select user_id, max(date) as last_review_date, min(date) as first_review_date, CAST((julianday(max(date)) - julianday(min(date))) AS INT) as activity_length from restaurant_reviews group by user_id", cnx)

# filling any non-existing values with 0
df_activity['activity_length'].isnull().sum(axis=0)

# Set 'user_id' as the index of df_activity
df_activity.set_index('user_id', inplace=True)

# Join df_activity with df_restaurant_users on 'user_id'
df_restaurant_users = df_restaurant_users.join(df_activity, on='user_id', how='left')
df_restaurant_users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,last_review_date,first_review_date,activity_length
0,---2PmXbF47D870stH1jqA,Susan,36,2012-10-24 13:20:46,63,4,36,,"o9QkuHIyxmqbORF7PeAbuw, Q70zvjAa9aawgW7KsNVGDA...",3,...,0,0,2,0,0,2,0,2019-04-27 17:35:51,2012-10-24 13:33:39,2376
1,---UgP94gokyCDuB5zUssA,Leonel,16,2014-11-02 14:53:16,8,0,3,,"5tXRxr4T24Awl7vjyCvIcQ, a-Z3HSdECIJmlmpEpsC4Sg...",1,...,0,1,1,0,0,0,0,2021-09-17 17:36:13,2014-11-02 15:10:18,2511
2,---r61b7EpVPkb4UVme5tA,Brian,6,2013-02-07 00:35:37,10,2,3,,"8YekFgT6VzrFDuhx5KZnjQ, RjDtN3v5EHctnZeLiDLiyQ...",1,...,0,0,1,0,0,1,0,2016-11-13 12:35:37,2014-05-06 14:00:28,921
3,---zemaUC8WeJeWKqS6p9Q,Eiman,49,2019-01-21 02:07:11,23,8,8,,"JdNKDCkE__QpZBP-bnbshA, xmo4OqlD8ojBGi1SMDSeHA...",1,...,0,0,0,0,0,1,0,2021-06-23 08:17:42,2021-06-23 08:17:42,0
4,--0DrQkM0FT-yCQRWw82uQ,Jimmy,25,2007-11-13 15:57:53,37,10,7,,5Dg5-7AQTkDYXNf7hUXmaw,0,...,0,0,0,1,1,2,0,2012-10-15 05:59:48,2012-10-15 05:59:48,0


#### Elite Count

In [21]:
# as `elite` field is array of yearys user has been elite user. It splits the array and count elements in the array
df_restaurant_users['elite_count'] = df_restaurant_users['elite'].map(lambda x: len([elite.strip() for elite in x.split(',')]) if x else 0)
df_restaurant_users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,last_review_date,first_review_date,activity_length,elite_count
0,---2PmXbF47D870stH1jqA,Susan,36,2012-10-24 13:20:46,63,4,36,,"o9QkuHIyxmqbORF7PeAbuw, Q70zvjAa9aawgW7KsNVGDA...",3,...,0,2,0,0,2,0,2019-04-27 17:35:51,2012-10-24 13:33:39,2376,0
1,---UgP94gokyCDuB5zUssA,Leonel,16,2014-11-02 14:53:16,8,0,3,,"5tXRxr4T24Awl7vjyCvIcQ, a-Z3HSdECIJmlmpEpsC4Sg...",1,...,1,1,0,0,0,0,2021-09-17 17:36:13,2014-11-02 15:10:18,2511,0
2,---r61b7EpVPkb4UVme5tA,Brian,6,2013-02-07 00:35:37,10,2,3,,"8YekFgT6VzrFDuhx5KZnjQ, RjDtN3v5EHctnZeLiDLiyQ...",1,...,0,1,0,0,1,0,2016-11-13 12:35:37,2014-05-06 14:00:28,921,0
3,---zemaUC8WeJeWKqS6p9Q,Eiman,49,2019-01-21 02:07:11,23,8,8,,"JdNKDCkE__QpZBP-bnbshA, xmo4OqlD8ojBGi1SMDSeHA...",1,...,0,0,0,0,1,0,2021-06-23 08:17:42,2021-06-23 08:17:42,0,0
4,--0DrQkM0FT-yCQRWw82uQ,Jimmy,25,2007-11-13 15:57:53,37,10,7,,5Dg5-7AQTkDYXNf7hUXmaw,0,...,0,0,1,1,2,0,2012-10-15 05:59:48,2012-10-15 05:59:48,0,0


#### Number of Friends

In [22]:
# as `friends` field is array of user_ids. It splits the array and count elements in the array
df_restaurant_users['no_friends'] = df_restaurant_users['friends'].map(lambda x: len([friend.strip() for friend in x.split(', ')]) if len(str(x)) > 6 else 0)

### Curated Restaurant Users

In [23]:
df_restaurant_users = df_restaurant_users[['user_id', 'review_count', 'useful', 'fans', 'average_stars', 'compliment_photos', 'activity_length', 'elite_count', 'no_friends']]
df_restaurant_users.head()

Unnamed: 0,user_id,review_count,useful,fans,average_stars,compliment_photos,activity_length,elite_count,no_friends
0,---2PmXbF47D870stH1jqA,36,63,3,4.98,0,2376,0,420
1,---UgP94gokyCDuB5zUssA,16,8,1,3.44,0,2511,0,3
2,---r61b7EpVPkb4UVme5tA,6,10,1,4.17,0,921,0,30
3,---zemaUC8WeJeWKqS6p9Q,49,23,1,3.32,0,0,0,26
4,--0DrQkM0FT-yCQRWw82uQ,25,37,0,2.94,0,0,0,1


In [24]:
df_restaurant_users.shape

(1445984, 9)

### Recency, Frequency and Monetary Value

These are additional user features that we intend to use but are stored at the review level. Hence, a new dataframe, which we will merge with the review data in the future. [Ngo-Ye and Sinha, 2014]
- **Recency:** Days between reviewer's current and previous review
- **Frequency:** Reviewer's previous review count
- **Monetary Value:** Average number of helpfulness votes received by the reviewer across all the reviews one has written
=$\frac{\sum\limits_{i=1}^N{x_{\mathrm{i}}-x_{\text {current }}}}{N-1}$
where N is the total number of reviews written by the reviewer, $x_i$ is the number of useful votes for a review i written by the reviewer, and $x_{\text {current }}$ is the number of useful votes for the current review.

[Ngo-Ye and Sinha, 2014]: https://www.sciencedirect.com/science/article/pii/S0167923614000128


In [28]:
scores = []

# Indexing  df_restaurant_users DataFrame by 'user_id'
df_restaurant_users.set_index('user_id', inplace=True)

for user in tqdm(df_restaurant_users.index):
     # SQL query to retrieve the 'review_id', 'useful', and 'date' columns from the 'restaurant_reviews' table for the current user.
    df_user_reviews = pd.read_sql_query("select review_id, useful, date from restaurant_reviews where user_id == '{}'".format(user), cnx)

    # converting to a datetime format
    df_user_reviews['date'] = pd.to_datetime(df_user_reviews['date'], format='%Y-%m-%d %H:%M:%S')

    # sorting in descending order based on the 'date' column
    df_user_reviews.sort_values('date', inplace=True, ascending=False)

    # sum of all values in the 'useful' column
    total_useful_votes = sum(df_user_reviews['useful'].values)

    # computing number of reviews posted by the user
    no_of_reviews = len(df_user_reviews)

    rows = df_user_reviews.values

    for index, row in enumerate(rows):
        # The recency variable is calculated as the difference in days between the 'date' value of the current row and the next row's 'date' value. If it is the last row, recency is set to 0
        recency = (row[2] - rows[index + 1][2]).days if index != no_of_reviews - 1 else 0

        # The frequency variable is calculated as the remaining number of reviews after the current index.
        frequency = no_of_reviews - (index + 1)

        # The monetary variable is calculated as the difference between total_useful_votes and the 'useful' value of the current row, divided by the remaining number of reviews. If there is only one review, monetary is set to 0.0.
        monetary = (total_useful_votes - row[1]) / (no_of_reviews - 1) if no_of_reviews > 1 else 0.0

        # storing the computed scores
        scores.append({
            'review_id': row[0],
            'frequency': frequency,
            'recency': recency,
            'monetary_value': monetary
        })

# creating a new RFM score DataFrame for later use
df_rfm_scores = pd.DataFrame(scores, columns=['review_id', 'frequency', 'recency', 'monetary_value'])
df_rfm_scores.head()

100%|██████████| 1445984/1445984 [1:23:17<00:00, 289.33it/s] 


Unnamed: 0,review_id,frequency,recency,monetary_value
0,CCOvQvA1nNba2LiapbQoJQ,21,489,1.52381
1,tkOmUEU2O1ixVYfE8AXo3Q,20,486,1.571429
2,S6qcxvgYMymqVlWcaNDm2Q,19,2,1.52381
3,oPJZvPTykI8jQfb38m-4_w,18,92,1.52381
4,keNOvKVgHOyIGmZNnMXJHw,17,42,1.52381


In [30]:
df_rfm_scores.shape

(4724464, 4)

## 2. Review Features

In [34]:
# Reading restaurant review with atleast one `useful` vote from the table
df_restaurant_reviews = pd.read_sql_query("select * from restaurant_reviews where useful >= 1", cnx)
df_restaurant_reviews.shape

(1969192, 9)

### Recency, Frequency and Monetary Value

In [35]:
# merging the RFM scores computed in user feature section to the reviews

# Indexing  df_rfm_scores DataFrame by 'review_id'
df_rfm_scores.set_index('review_id', inplace=True)

# Join df_rfm_scores with df_restaurant_reviews on 'review_id'
df_restaurant_reviews = df_restaurant_reviews.join(df_rfm_scores, on='review_id', how='left')

df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,recency,monetary_value
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,0,0,0.0
1,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,48,26,0.31
2,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,0,0,0.0
3,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,0,0,0.0
4,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4,1,0,0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,49,135,3.68


#### Elapsed Days

In [36]:
df_restaurant_reviews['date'] = pd.to_datetime(df_restaurant_reviews['date'], format='%Y-%m-%d %H:%M:%S')
df_restaurant_reviews['max_date'] = max_review_date
df_restaurant_reviews['elapsed_days'] = (df_restaurant_reviews['max_date'] - df_restaurant_reviews['date']).dt.days
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,recency,monetary_value,max_date,elapsed_days
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,0,0,0.0,2022-01-19 19:48:45,2572
1,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,48,26,0.31,2022-01-19 19:48:45,1830
2,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,0,0,0.0,2022-01-19 19:48:45,2309
3,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,0,0,0.0,2022-01-19 19:48:45,2357
4,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4,1,0,0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,49,135,3.68,2022-01-19 19:48:45,2719


#### Day of the week and Month the review was posted

In [37]:
df_restaurant_reviews['day_of_week'] = df_restaurant_reviews['date'].dt.dayofweek
df_restaurant_reviews['month'] = df_restaurant_reviews['date'].dt.month
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,recency,monetary_value,max_date,elapsed_days,day_of_week,month
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,0,0,0.0,2022-01-19 19:48:45,2572,6,1
1,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,48,26,0.31,2022-01-19 19:48:45,1830,5,1
2,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,0,0,0.0,2022-01-19 19:48:45,2309,2,9
3,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,0,0,0.0,2022-01-19 19:48:45,2357,4,8
4,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4,1,0,0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,49,135,3.68,2022-01-19 19:48:45,2719,6,8


#### Helpfulness Score

$Helpfulness Score=\frac{\#\ Useful\ votes\ of\ a\ review}{\#\ Useful\ votes\ recived\ by\ a\ restaurant}$

In [38]:
# Computing total useful votes received by the restaurant
df_restaurant_useful = df_restaurant_reviews.groupby(['business_id'])['useful'].sum()
df_restaurant_useful = df_restaurant_useful.to_frame()

# renaming the total useful votes column
df_restaurant_useful.rename(columns={'useful': 'total_business_useful'}, inplace=True)

# Join df_restaurant_reviews with df_restaurant_useful on 'business_id'
df_restaurant_reviews = df_restaurant_reviews.join(df_restaurant_useful, on='business_id', how='left')

# computing helpfulness score for each review
df_restaurant_reviews['helpfulness_score'] = df_restaurant_reviews['useful'] / df_restaurant_reviews['total_business_useful']

df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,recency,monetary_value,max_date,elapsed_days,day_of_week,month,total_business_useful,helpfulness_score
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,0,0,0.0,2022-01-19 19:48:45,2572,6,1,302,0.003311
1,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,48,26,0.31,2022-01-19 19:48:45,1830,5,1,32,0.03125
2,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,0,0,0.0,2022-01-19 19:48:45,2309,2,9,331,0.003021
3,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,0,0,0.0,2022-01-19 19:48:45,2357,4,8,301,0.006645
4,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4,1,0,0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,49,135,3.68,2022-01-19 19:48:45,2719,6,8,416,0.002404


### filtering reviews that have useful score >= 10

In [41]:
df_restaurant_reviews = df_restaurant_reviews[df_restaurant_reviews['useful'] >= 10]
df_restaurant_reviews.reset_index(inplace=True, drop=True)
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,recency,monetary_value,max_date,elapsed_days,day_of_week,month,total_business_useful,helpfulness_score
0,MWUGmzyXVIlG2NwvzX0q4Q,Xw7ZjaGfr0WNVt6s_5KZfA,i-tDq8zC7ZmSqSbg_7oddA,5,11,5,8,What an AMAZING occurrence that we ended up he...,2016-11-21 20:04:45,686,3,6.861856,2022-01-19 19:48:45,1884,0,11,115,0.095652
1,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,13,6,5,After a long hiatus from reviewing I have awak...,2010-08-20 19:16:04,21,293,5.3,2022-01-19 19:48:45,4170,4,8,302,0.043046
2,Mb5k1Gllt2_x8l_0WM3lUg,Xw7ZjaGfr0WNVt6s_5KZfA,Iso6F11o-W4g6d7OUEaMng,4,11,4,9,My dad and I were here for lunch and after rea...,2017-10-02 18:22:13,771,5,6.861856,2022-01-19 19:48:45,1570,0,10,41,0.268293
3,zj212mBMgYXhxS0srtlPew,oSeE_p_gYxI82APPWJhJjg,Oun4NN-u5yiHIxDqtJnxgA,4,10,6,5,I'd heard the hype. Seen the NY Times article....,2012-03-01 02:07:58,206,35,2.907407,2022-01-19 19:48:45,3611,3,3,1118,0.008945
4,rzrBiijeQh7ubjfRCr-UtA,Kj-u8Yq1d3mLKitWsDAxpg,YtSqYv1Q_pOltsVPSx54SA,4,12,11,11,"A very upscale and respected chain, with a sho...",2008-04-30 15:26:12,5,0,20.510373,2022-01-19 19:48:45,5012,2,4,392,0.030612


In [42]:
df_restaurant_reviews.shape

(46623, 18)

#### Computing text based features

- **Polarity Score:** Sentiment score between -1.0 (negative sentiment) and 1.0 (positive sentiment)
- **Subjectivity Score:** Subjectivity score between 0.0 (very objective) and 1.0 (very subjective)
- **No. of Nouns:** Number of nouns in the text
- **No. of words:** Number of words in the text
- **No. of sentences:** Number of sentences in the text
- **Words per sentence:** Average words per sentence
- **Automated Readability Index:** Measures text understandability and represents US grade level
- **Coleman-liau Index:** Readability score that relies on characters instead of syllables per word
- **No. of difficult words:**  Number of difficult words in the text

In [43]:
rfm_scores = []

for index in tqdm(range(df_restaurant_reviews.shape[0])):
    text = df_restaurant_reviews.loc[index]['text']
    text_blob = TextBlob(text)

    rfm_scores.append({
        'review_id': df_restaurant_reviews.loc[index]['review_id'],
        'polarity_score': text_blob.sentiment.polarity,
        'subjectivity_score': text_blob.sentiment.subjectivity,
        'no_of_nouns': len(text_blob.noun_phrases),
        'no_of_words': len(text_blob.words),
        'no_of_sentences': len(text_blob.sentences),
        'words_per_sentence': len(text_blob.words) / len(text_blob.sentences),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'no_difficult_words': textstat.difficult_words(text)
    })

df_review_text_scores = pd.DataFrame(rfm_scores, columns=[
    'review_id', 'polarity_score', 'subjectivity_score', 'no_of_nouns',
    'no_of_words', 'no_of_sentences', 'words_per_sentence',
    'automated_readability_index', 'coleman_liau_index', 'no_difficult_words'
])
df_review_text_scores.set_index('review_id', inplace=True)
df_review_text_scores.head()

100%|██████████| 46623/46623 [15:22<00:00, 50.51it/s] 


Unnamed: 0_level_0,polarity_score,subjectivity_score,no_of_nouns,no_of_words,no_of_sentences,words_per_sentence,automated_readability_index,coleman_liau_index,no_difficult_words
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MWUGmzyXVIlG2NwvzX0q4Q,0.372619,0.714365,25,327,16,20.4375,9.0,7.14,31
EJWyA5wpdVMji1j4TwSZqQ,0.143225,0.551821,78,687,39,17.615385,9.0,8.18,108
Mb5k1Gllt2_x8l_0WM3lUg,0.413043,0.61256,8,253,15,16.866667,6.5,5.86,26
zj212mBMgYXhxS0srtlPew,0.166018,0.508773,31,349,29,12.034483,6.2,7.4,39
rzrBiijeQh7ubjfRCr-UtA,0.415891,0.532681,19,292,16,18.25,8.3,7.37,41


In [44]:
df_review_text_scores.shape

(46623, 9)

In [45]:
# Join df_review_text_scores with df_restaurant_reviews on 'review_id'
df_restaurant_reviews = df_restaurant_reviews.join(df_review_text_scores, on='review_id', how='left')
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,...,helpfulness_score,polarity_score,subjectivity_score,no_of_nouns,no_of_words,no_of_sentences,words_per_sentence,automated_readability_index,coleman_liau_index,no_difficult_words
0,MWUGmzyXVIlG2NwvzX0q4Q,Xw7ZjaGfr0WNVt6s_5KZfA,i-tDq8zC7ZmSqSbg_7oddA,5,11,5,8,What an AMAZING occurrence that we ended up he...,2016-11-21 20:04:45,686,...,0.095652,0.372619,0.714365,25,327,16,20.4375,9.0,7.14,31
1,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,13,6,5,After a long hiatus from reviewing I have awak...,2010-08-20 19:16:04,21,...,0.043046,0.143225,0.551821,78,687,39,17.615385,9.0,8.18,108
2,Mb5k1Gllt2_x8l_0WM3lUg,Xw7ZjaGfr0WNVt6s_5KZfA,Iso6F11o-W4g6d7OUEaMng,4,11,4,9,My dad and I were here for lunch and after rea...,2017-10-02 18:22:13,771,...,0.268293,0.413043,0.61256,8,253,15,16.866667,6.5,5.86,26
3,zj212mBMgYXhxS0srtlPew,oSeE_p_gYxI82APPWJhJjg,Oun4NN-u5yiHIxDqtJnxgA,4,10,6,5,I'd heard the hype. Seen the NY Times article....,2012-03-01 02:07:58,206,...,0.008945,0.166018,0.508773,31,349,29,12.034483,6.2,7.4,39
4,rzrBiijeQh7ubjfRCr-UtA,Kj-u8Yq1d3mLKitWsDAxpg,YtSqYv1Q_pOltsVPSx54SA,4,12,11,11,"A very upscale and respected chain, with a sho...",2008-04-30 15:26:12,5,...,0.030612,0.415891,0.532681,19,292,16,18.25,8.3,7.37,41


## 4. Creating final dataset

In [47]:
df_restaurants.set_index('business_id', inplace=True)

# Join df_restaurants with df_restaurant_reviews on 'business_id'
df_restaurant_reviews = df_restaurant_reviews.join(df_restaurants, on='business_id', how='left', rsuffix='_restaurant')
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,...,no_difficult_words,state,stars_restaurant,total_reviews,elapsed_days_restaurant,positive_reviews,ratio_positive_reviews,negative_reviews,ratio_negative_reviews,immediacy_index
0,MWUGmzyXVIlG2NwvzX0q4Q,Xw7ZjaGfr0WNVt6s_5KZfA,i-tDq8zC7ZmSqSbg_7oddA,5,11,5,8,What an AMAZING occurrence that we ended up he...,2016-11-21 20:04:45,686,...,31,LA,4.0,80,2125,61.0,0.7625,8.0,0.1,0.05
1,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,13,6,5,After a long hiatus from reviewing I have awak...,2010-08-20 19:16:04,21,...,108,PA,4.0,190,4170,152.0,0.8,23.0,0.121053,0.021053
2,Mb5k1Gllt2_x8l_0WM3lUg,Xw7ZjaGfr0WNVt6s_5KZfA,Iso6F11o-W4g6d7OUEaMng,4,11,4,9,My dad and I were here for lunch and after rea...,2017-10-02 18:22:13,771,...,26,LA,4.5,34,1627,27.0,0.794118,3.0,0.088235,0.132353
3,zj212mBMgYXhxS0srtlPew,oSeE_p_gYxI82APPWJhJjg,Oun4NN-u5yiHIxDqtJnxgA,4,10,6,5,I'd heard the hype. Seen the NY Times article....,2012-03-01 02:07:58,206,...,39,PA,4.0,556,3747,451.0,0.811151,48.0,0.086331,0.007194
4,rzrBiijeQh7ubjfRCr-UtA,Kj-u8Yq1d3mLKitWsDAxpg,YtSqYv1Q_pOltsVPSx54SA,4,12,11,11,"A very upscale and respected chain, with a sho...",2008-04-30 15:26:12,5,...,41,PA,3.5,305,5962,196.0,0.642623,57.0,0.186885,0.011475


In [48]:
# Join df_restaurant_users with df_restaurant_reviews on 'user_id'
df_restaurant_reviews = df_restaurant_reviews.join(df_restaurant_users, how='left', on='user_id', rsuffix='_user')
df_restaurant_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,frequency,...,ratio_negative_reviews,immediacy_index,review_count,useful_user,fans,average_stars,compliment_photos,activity_length,elite_count,no_friends
0,MWUGmzyXVIlG2NwvzX0q4Q,Xw7ZjaGfr0WNVt6s_5KZfA,i-tDq8zC7ZmSqSbg_7oddA,5,11,5,8,What an AMAZING occurrence that we ended up he...,2016-11-21 20:04:45,686,...,0.1,0.05,2272,15105,379,4.1,294,4002,6,1257
1,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,13,6,5,After a long hiatus from reviewing I have awak...,2010-08-20 19:16:04,21,...,0.121053,0.021053,38,217,4,3.69,0,3275,0,60
2,Mb5k1Gllt2_x8l_0WM3lUg,Xw7ZjaGfr0WNVt6s_5KZfA,Iso6F11o-W4g6d7OUEaMng,4,11,4,9,My dad and I were here for lunch and after rea...,2017-10-02 18:22:13,771,...,0.088235,0.132353,2272,15105,379,4.1,294,4002,6,1257
3,zj212mBMgYXhxS0srtlPew,oSeE_p_gYxI82APPWJhJjg,Oun4NN-u5yiHIxDqtJnxgA,4,10,6,5,I'd heard the hype. Seen the NY Times article....,2012-03-01 02:07:58,206,...,0.086331,0.007194,324,999,20,2.7,2,1799,4,100
4,rzrBiijeQh7ubjfRCr-UtA,Kj-u8Yq1d3mLKitWsDAxpg,YtSqYv1Q_pOltsVPSx54SA,4,12,11,11,"A very upscale and respected chain, with a sho...",2008-04-30 15:26:12,5,...,0.186885,0.011475,793,19181,290,3.58,2296,1816,3,758


In [49]:
# listing all columns
df_restaurant_reviews.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date', 'frequency', 'recency', 'monetary_value',
       'max_date', 'elapsed_days', 'day_of_week', 'month',
       'total_business_useful', 'helpfulness_score', 'polarity_score',
       'subjectivity_score', 'no_of_nouns', 'no_of_words', 'no_of_sentences',
       'words_per_sentence', 'automated_readability_index',
       'coleman_liau_index', 'no_difficult_words', 'state', 'stars_restaurant',
       'total_reviews', 'elapsed_days_restaurant', 'positive_reviews',
       'ratio_positive_reviews', 'negative_reviews', 'ratio_negative_reviews',
       'immediacy_index', 'review_count', 'useful_user', 'fans',
       'average_stars', 'compliment_photos', 'activity_length', 'elite_count',
       'no_friends'],
      dtype='object')

In [50]:
# creating final dataset with the needed features
df_restaurant_reviews_final = df_restaurant_reviews[['stars', 'useful', 'funny', 'cool', 'recency', 'frequency', 'monetary_value', 'elapsed_days', 'day_of_week', 'month', 'total_business_useful', 'polarity_score', 'subjectivity_score', 'no_of_nouns', 'no_of_words', 'no_of_sentences', 'words_per_sentence', 'automated_readability_index', 'coleman_liau_index', 'no_difficult_words', 'stars_restaurant', 'total_reviews', 'elapsed_days_restaurant', 'positive_reviews', 'ratio_positive_reviews', 'negative_reviews', 'ratio_negative_reviews', 'immediacy_index', 'review_count', 'useful_user', 'activity_length', 'fans', 'average_stars', 'elite_count', 'no_friends', 'helpfulness_score']]
df_restaurant_reviews_final.shape

(46623, 36)

In [51]:
df_restaurant_reviews_final.head()

Unnamed: 0,stars,useful,funny,cool,recency,frequency,monetary_value,elapsed_days,day_of_week,month,...,ratio_negative_reviews,immediacy_index,review_count,useful_user,activity_length,fans,average_stars,elite_count,no_friends,helpfulness_score
0,5,11,5,8,3,686,6.861856,1884,0,11,...,0.1,0.05,2272,15105,4002,379,4.1,6,1257,0.095652
1,5,13,6,5,293,21,5.3,4170,4,8,...,0.121053,0.021053,38,217,3275,4,3.69,0,60,0.043046
2,4,11,4,9,5,771,6.861856,1570,0,10,...,0.088235,0.132353,2272,15105,4002,379,4.1,6,1257,0.268293
3,4,10,6,5,35,206,2.907407,3611,3,3,...,0.086331,0.007194,324,999,1799,20,2.7,4,100,0.008945
4,4,12,11,11,0,5,20.510373,5012,2,4,...,0.186885,0.011475,793,19181,1816,290,3.58,3,758,0.030612


# 5. Saving the datasets

In [52]:
df_restaurant_reviews.to_csv('./data/processed_review_data.csv', index=False)
df_restaurant_reviews_final.to_csv('./data/final_data_useful_gte_10.csv', index=False)