### I have already validated the process of transforming the raw data into the form I will use for my project by running it on a small subset of the data; this notebook now runs the same process on my entire dataset.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the simplified review table (one row per review: item ID and star rating).
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
# Item IDs are read as strings so that IDs with leading zeros (e.g. '0000000078')
# can never be parsed as numbers.
amazon_df = pd.read_csv(
    'rough_data/amazon_dataset/amzn_all_items_simplified.csv',
    dtype={'item': str},
)

In [3]:
# Preview the raw review table: one row per review, with its item ID and rating.
amazon_df

Unnamed: 0,item,rating
0,0000000078,5.0
1,0000000116,4.0
2,0000000116,1.0
3,0000000868,4.0
4,0000013714,4.0
...,...,...
82677126,BT00IU6O8K,3.0
82677127,BT00IU6O8K,5.0
82677128,dp-g310/do,5.0
82677129,SMLRBIMX03,5.0


There are 82,677,131 reviews in my dataset.

In [4]:
# Count how many reviews fall on each distinct rating value.
amazon_df['rating'].value_counts()

5.0    49169663
4.0    15480820
3.0     7049301
1.0     6712117
2.0     4265230
Name: rating, dtype: int64

In [5]:
# Total number of reviews across all rating values. value_counts() skips
# missing values by default, so this total counts only non-null ratings;
# comparing it with the number of rows shows whether any rating is missing.
amazon_df['rating'].value_counts().sum()

82677131

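The above two calculations confirm that every rating is a whole-number value between 1 and 5: the only values that occur are 1.0 through 5.0, and their counts sum to 82,677,131, the total number of reviews, so no rating is missing.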

In [6]:
# Number of reviews per item, most-reviewed first; the length of the result
# is the number of unique items.
amazon_df['item'].value_counts()

B0054JZC6E    25368
B00FAPF5U0    24024
B009UX2YAC    23956
0439023483    21398
030758836X    19867
              ...  
B0018EI6I0        1
B002CGSYRQ        1
B000MUSTCE        1
B001PFZFLS        1
B004N3BQDU        1
Name: item, Length: 9874211, dtype: int64

There are 9,874,211 unique items in my dataset

In [4]:
# Build the per-item frame: one row per item holding the raw number of reviews
# at each star level (1-5), the total number of reviews, and the mean rating.
star_levels = list(range(1, 6))
star_columns = [f'{star}_star_ratings' for star in star_levels]

# Count reviews per (item, rating) pair in a single pass and pivot the rating
# values into columns. This replaces five separate filter + groupby + merge
# passes over the full review table.
rating_counts = (
    amazon_df.groupby(['item', 'rating'], sort=False)
    .size()
    .unstack('rating', fill_value=0)
)

# Put items back in order of first appearance in amazon_df, keep exactly the
# 1-5 star columns, and use 0 where an item has no reviews at a star level.
rating_counts = rating_counts.reindex(
    index=amazon_df['item'].unique(), columns=star_levels, fill_value=0
)
rating_counts.columns = star_columns

# Counts are stored as float64 because these columns are later divided by
# total_ratings. reset_index() turns the item IDs back into an 'item' column
# and gives the frame a fresh 0..n-1 index.
grouped_df = rating_counts.astype('float64').rename_axis('item').reset_index()

# Form a total number of ratings column (still raw counts at this point).
grouped_df['total_ratings'] = grouped_df[star_columns].sum(axis=1)

# Form an average rating column: star-weighted sum of the counts divided by
# the total number of ratings.
weighted_sum = sum(
    star * grouped_df[column] for star, column in zip(star_levels, star_columns)
)
grouped_df['average_rating'] = weighted_sum / grouped_df['total_ratings']

grouped_df

Unnamed: 0,item,1_star_ratings,2_star_ratings,3_star_ratings,4_star_ratings,5_star_ratings,total_ratings,average_rating
0,0000000078,0.0,0.0,0.0,0.0,1.0,1.0,5.000000
1,0000000116,1.0,0.0,0.0,1.0,0.0,2.0,2.500000
2,0000000868,0.0,0.0,0.0,1.0,0.0,1.0,4.000000
3,0000013714,1.0,0.0,1.0,4.0,8.0,14.0,4.285714
4,0000015393,0.0,0.0,0.0,1.0,0.0,1.0,4.000000
...,...,...,...,...,...,...,...,...
9874206,BT00DDZD6G,0.0,0.0,0.0,0.0,3.0,3.0,5.000000
9874207,BT00E0U25U,0.0,0.0,0.0,1.0,0.0,1.0,4.000000
9874208,BT00IU6O8K,0.0,0.0,2.0,0.0,2.0,4.0,4.000000
9874209,dp-g310/do,0.0,0.0,0.0,0.0,1.0,1.0,5.000000


## Check to make sure all the data came through correctly

In [5]:
# Test that all the values came through and match the original table.
# The per-rating totals of the raw table are computed once and reused for every
# check instead of re-scanning the full review table for each printed line.
rating_totals = amazon_df['rating'].value_counts()

# Each star level: reviews with that rating in the raw table must equal the
# sum of that star level's count column across all items.
for star in range(1, 6):
    print(f'{star}-star test:',
          rating_totals[float(star)] == grouped_df[f'{star}_star_ratings'].sum())

# Overall: the total number of rated reviews must equal the sum of the
# per-item totals.
print('total test:', rating_totals.sum() == grouped_df['total_ratings'].sum())

1-star test: True
2-star test: True
3-star test: True
4-star test: True
5-star test: True
total test: True


### Get the data in the final format I want

In [6]:
# Convert each star-level column from a raw count into that item's share of
# its total ratings.
# NOTE: the count columns are overwritten in place, so re-running this cell
# without rebuilding grouped_df would divide by total_ratings a second time.
share_columns = [f'{stars}_star_ratings' for stars in range(5, 0, -1)]
for share_column in share_columns:
    grouped_df[share_column] = grouped_df[share_column] / grouped_df['total_ratings']

# Reorder columns for maximum usefulness: identifier, summary statistics, then
# the rating shares from 5 stars down to 1 star.
final_column_order = ['item', 'average_rating', 'total_ratings'] + share_columns
grouped_df = grouped_df[final_column_order]

# This is my complete data set.
grouped_df

Unnamed: 0,item,average_rating,total_ratings,5_star_ratings,4_star_ratings,3_star_ratings,2_star_ratings,1_star_ratings
0,0000000078,5.000000,1.0,1.000000,0.000000,0.000000,0.0,0.000000
1,0000000116,2.500000,2.0,0.000000,0.500000,0.000000,0.0,0.500000
2,0000000868,4.000000,1.0,0.000000,1.000000,0.000000,0.0,0.000000
3,0000013714,4.285714,14.0,0.571429,0.285714,0.071429,0.0,0.071429
4,0000015393,4.000000,1.0,0.000000,1.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...
9874206,BT00DDZD6G,5.000000,3.0,1.000000,0.000000,0.000000,0.0,0.000000
9874207,BT00E0U25U,4.000000,1.0,0.000000,1.000000,0.000000,0.0,0.000000
9874208,BT00IU6O8K,4.000000,4.0,0.500000,0.000000,0.500000,0.0,0.000000
9874209,dp-g310/do,5.000000,1.0,1.000000,0.000000,0.000000,0.0,0.000000


I will save this as the fully transformed data that I will use in the analysis for my project.

In [7]:
# Commented out this line to avoid accidental overwriting of the file
# grouped_df.to_csv(r'rough_data\amazon_dataset\amzn_final_format.csv', index=False)

I will also save a version with only the items that have 25+ reviews. I expect the cutoff in my project to be closer to 50-100 reviews, but I will keep everything down to 25 reviews per product to give myself room to test whether the rating distributions break down at those low review counts. This still cuts off roughly 95% of the items and makes the dataset much easier to work with.

In [8]:
# Keep only items with at least this many reviews. Filtering on the threshold
# itself (rather than slicing a hard-coded number of rows) keeps the result
# correct if the underlying data ever changes.
min_total_ratings = 25

# Sort from most- to least-reviewed, then drop every item below the threshold.
sorted_by_reviews = grouped_df.sort_values('total_ratings', ascending=False)
truncated_df = sorted_by_reviews[sorted_by_reviews['total_ratings'] >= min_total_ratings]
truncated_df

Unnamed: 0,item,average_rating,total_ratings,5_star_ratings,4_star_ratings,3_star_ratings,2_star_ratings,1_star_ratings
6170732,B0054JZC6E,4.329313,25368.0,0.523810,0.330377,0.114436,0.014073,0.017305
9281145,B00FAPF5U0,4.369381,24024.0,0.637696,0.209832,0.079878,0.029346,0.043248
8040103,B009UX2YAC,4.674445,23956.0,0.788529,0.144056,0.038654,0.010853,0.017908
328335,0439023483,4.644406,21398.0,0.770726,0.155529,0.039957,0.015001,0.018787
176975,030758836X,3.794433,19867.0,0.388081,0.278351,0.160266,0.086525,0.086777
...,...,...,...,...,...,...,...,...
1988712,9881998328,4.600000,25.0,0.680000,0.240000,0.080000,0.000000,0.000000
9580346,B00HGFILLM,4.720000,25.0,0.800000,0.120000,0.080000,0.000000,0.000000
8145751,B00A7ZXBE0,4.800000,25.0,0.800000,0.200000,0.000000,0.000000,0.000000
2077167,B00000632P,4.360000,25.0,0.680000,0.160000,0.040000,0.080000,0.040000


In [9]:
# Commented out this line to avoid accidental overwriting of the file
# truncated_df.to_csv(r'rough_data\amazon_dataset\amzn_25plusreviews_final_format.csv', index=False)