In [1]:
# Import Dependencies
import pandas as pd
import os

In [2]:
# Load CSV File 
vine_df = pd.read_csv('./resources/vine_table.csv') 
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R1QX6706ZWJ1P5,5,0,0,N,Y
1,R3QWMLJHIW6P37,5,0,0,N,Y
2,R14Z1VR1N0Z9G6,5,1,1,N,Y
3,R25ZRJL0GH0U0,2,0,0,N,Y
4,R3837KYH7AZNIY,4,0,1,N,Y


In [3]:
vine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85981 entries, 0 to 85980
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_id          85981 non-null  object
 1   star_rating        85981 non-null  int64 
 2   helpful_votes      85981 non-null  int64 
 3   total_votes        85981 non-null  int64 
 4   vine               85981 non-null  object
 5   verified_purchase  85981 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.9+ MB


In [4]:
# # of vine reviews
vine_reviews = vine_df[vine_df['vine']=='Y']
vine_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 17581 to 32557
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_id          32 non-null     object
 1   star_rating        32 non-null     int64 
 2   helpful_votes      32 non-null     int64 
 3   total_votes        32 non-null     int64 
 4   vine               32 non-null     object
 5   verified_purchase  32 non-null     object
dtypes: int64(3), object(3)
memory usage: 1.8+ KB


In [5]:
# # of nonvine reviews
nonvine_reviews = vine_df[vine_df['vine']=='N']
nonvine_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85949 entries, 0 to 85980
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_id          85949 non-null  object
 1   star_rating        85949 non-null  int64 
 2   helpful_votes      85949 non-null  int64 
 3   total_votes        85949 non-null  int64 
 4   vine               85949 non-null  object
 5   verified_purchase  85949 non-null  object
dtypes: int64(3), object(3)
memory usage: 4.6+ MB


In [6]:
vine_df['star_rating'].value_counts()

5    48897
4    13657
1    11036
3     7050
2     5341
Name: star_rating, dtype: int64

In [7]:
# Filter the data and create a new DataFrame or table to retrieve all the rows where 
# the total_votes count is equal to or greater than 20 to pick reviews that are more 
# likely to be helpful and to avoid having division by zero errors later on.
over_20_votes_df = vine_df.loc[vine_df["total_votes"] >= 20]
over_20_votes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3414 entries, 884 to 85980
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_id          3414 non-null   object
 1   star_rating        3414 non-null   int64 
 2   helpful_votes      3414 non-null   int64 
 3   total_votes        3414 non-null   int64 
 4   vine               3414 non-null   object
 5   verified_purchase  3414 non-null   object
dtypes: int64(3), object(3)
memory usage: 186.7+ KB


In [8]:
# Filter the new DataFrame or table created in Step 1 and create a new DataFrame 
# or table to retrieve all the rows where the number of helpful_votes divided by 
# total_votes is equal to or greater than 50%.
Helpful_Votes_Over_50_df = over_20_votes_df[(over_20_votes_df['helpful_votes']/over_20_votes_df['total_votes']) >= .50]
Helpful_Votes_Over_50_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
884,R2PT2X8FMYFCV3,5,39,41,N,Y
917,RGCUQSH80SYY7,1,36,47,N,N
939,R3E6EI073KP0AL,2,21,36,N,Y
947,R3MYTXW7B4Z6YW,4,687,711,N,Y
1211,R2WMF1IWN5ZYO4,5,18,24,N,Y


In [9]:
Helpful_Votes_Over_50_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
884,R2PT2X8FMYFCV3,5,39,41,N,Y
917,RGCUQSH80SYY7,1,36,47,N,N
939,R3E6EI073KP0AL,2,21,36,N,Y
947,R3MYTXW7B4Z6YW,4,687,711,N,Y
1211,R2WMF1IWN5ZYO4,5,18,24,N,Y
...,...,...,...,...,...,...
85976,R1NUYN39WEVD9X,5,96,98,N,N
85977,RD17SQQ58L34O,5,46,48,N,N
85978,R30DX2RCMIKQ90,5,31,34,N,N
85979,R1O8C9XEYHQUIH,2,38,41,N,N


In [10]:
# Filter the DataFrame or table created in Step 2, and create a new DataFrame 
# or table that retrieves all the rows where a review was written as part of 
# the Vine program (paid), vine == 'Y'.
paid_over_50_df = Helpful_Votes_Over_50_df[Helpful_Votes_Over_50_df['vine']=='Y']
paid_over_50_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
32351,R3R2RHQWEKX5NL,5,81,89,Y,N
32362,R1RZ4JZO8DAP7H,3,200,227,Y,N
32557,RQ94HI0WM8KIG,5,174,188,Y,N


In [11]:
# Repeat Step 3, but this time retrieve all the rows where the review was not
# part of the Vine program (unpaid), vine == 'N'.
unpaid_over_50_df = Helpful_Votes_Over_50_df[Helpful_Votes_Over_50_df['vine']=='N']
unpaid_over_50_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
884,R2PT2X8FMYFCV3,5,39,41,N,Y
917,RGCUQSH80SYY7,1,36,47,N,N
939,R3E6EI073KP0AL,2,21,36,N,Y
947,R3MYTXW7B4Z6YW,4,687,711,N,Y
1211,R2WMF1IWN5ZYO4,5,18,24,N,Y


In [12]:
unpaid_over_50_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
884,R2PT2X8FMYFCV3,5,39,41,N,Y
917,RGCUQSH80SYY7,1,36,47,N,N
939,R3E6EI073KP0AL,2,21,36,N,Y
947,R3MYTXW7B4Z6YW,4,687,711,N,Y
1211,R2WMF1IWN5ZYO4,5,18,24,N,Y
...,...,...,...,...,...,...
85976,R1NUYN39WEVD9X,5,96,98,N,N
85977,RD17SQQ58L34O,5,46,48,N,N
85978,R30DX2RCMIKQ90,5,31,34,N,N
85979,R1O8C9XEYHQUIH,2,38,41,N,N


In [13]:
# Determine the total number of reviews, 
# the percentage of 5-star reviews for the two types of review 
reviews = len(pd.unique(vine_df['review_id']))
print("# of reviews:", reviews)

# of reviews: 85981


In [14]:
# the number of 5-star reviews,
total_5_star_reviews = vine_df['star_rating'].value_counts()[5]
print("# of 5 star reviews:", total_5_star_reviews)

# of 5 star reviews: 48897


In [15]:
# the count of 5-star reviews paid
total_paid_5_star_reviews_df = vine_df[vine_df['vine']=='Y']
total_paid_5_star_reviews = total_paid_5_star_reviews_df['star_rating'].value_counts()[5]
print("# of 5 star paid reviews:", total_paid_5_star_reviews)

# of 5 star paid reviews: 24


In [16]:
# the percentage of 5-star reviews paid
total_paid_5_star_reviews_percent = total_paid_5_star_reviews_df['star_rating'].value_counts()[5]/total_5_star_reviews*100
print("% of 5 star paid reviews:",total_paid_5_star_reviews_percent)

% of 5 star paid reviews: 0.04908276581385361


In [17]:
# the count of 5-star reviews unpaid
total_unpaid_5_star_reviews_df = vine_df[vine_df['vine']=='N']
total_unpaid_5_star_reviews = total_unpaid_5_star_reviews_df['star_rating'].value_counts()[5]
print("# of 5 star unpaid reviews:",total_unpaid_5_star_reviews)

# of 5 star unpaid reviews: 48873


In [18]:
# the percentage of 5-star reviews unpaid
total_unpaid_5_star_reviews_percent = total_unpaid_5_star_reviews_df['star_rating'].value_counts()[5]/total_5_star_reviews*100
print("% of 5 star paid reviews:",total_unpaid_5_star_reviews_percent)

% of 5 star paid reviews: 99.95091723418614
