In [0]:
# APPENDIX A: DATA ACQUISITION CODE

# Configure AWS CLI.
# `aws configure` prompts for four values in turn; the four lines that follow
# it are the answers typed at those prompts, not separate shell commands.
# The underscores are placeholders: enter the real credentials at the prompt
# and never store them in this file.
aws configure
_____		# AWS Access Key ID
_____		# Secret Access Key
us-east-2	# Default region name
json		# Default output format

In [0]:
# Install pip.
sudo yum -y install python3-pip
# Install Kaggle CLI.
pip3 install Kaggle

In [0]:
# Make the directory for the Kaggle credentials.
# -p makes this a no-op when the directory already exists, and the explicit
# ~ keeps these steps correct even when they are not run from the home directory.
mkdir -p ~/.kaggle
# Create the credentials file.
nano ~/.kaggle/kaggle.json
# [paste username and key]
# Secure the file: it holds an API key, so restrict it to the owner.
chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download each file from Kaggle.
kaggle datasets download -d cynthiarempel/amazon-us-customer-reviews-dataset -f amazon_reviews_us_Apparel_v1_00.tsv
# Unzip file.
unzip amazon_reviews_us_Apparel_v1_00.tsv.zip
# Put file in landing folder of S3 bucket.
# The destination must be one argument: a space between the prefix and the
# file name hands `aws s3 cp` a third positional argument and the copy fails.
aws s3 cp amazon_reviews_us_Apparel_v1_00.tsv s3://my-bigdata-project-sf/landing/amazon_reviews_us_Apparel_v1_00.tsv

In [0]:
# Remove the extracted file and the downloaded archive to save space in EC2 instance.
# unzip leaves the .zip in place, so deleting only the .tsv frees part of the space.
rm amazon_reviews_us_Apparel_v1_00.tsv amazon_reviews_us_Apparel_v1_00.tsv.zip

In [0]:
# APPENDIX B: EXPLORATORY DATA ANALYSIS CODE

# Start the Python 3 interpreter.
# NOTE(review): `python3` is a shell command rather than a Python statement;
# presumably the cells below were typed at the interpreter prompt it opens — confirm.
python3
# Import the necessary libraries.
# NOTE(review): boto3 is not referenced by name in the cells visible here.
import boto3
import pandas as pd

In [0]:
# Load the Amazon Review TSV from the S3 landing folder into a data frame.
# The file is tab-separated; malformed lines are skipped instead of raising.
reviews_df = pd.read_csv(
    "s3://my-bigdata-project-sf/landing/amazon_reviews_us_Gift_Card_v1_00.tsv",
    sep='\t',
    on_bad_lines='skip',
)

In [0]:
# Get the data type of each column in the file.
# Left as a bare expression so the interactive session echoes the result.
reviews_df.dtypes

In [0]:
# Preview the first five rows of the data frame.
first_rows = reviews_df.head(n=5)
print(first_rows)

In [0]:
# Look at the basic information about the data frame, such as non-null values in each column.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
reviews_df.info()

In [0]:
# Summary statistics (count, mean, standard deviation, minimum, maximum) for the numerical data.
numeric_summary = reviews_df.describe()
print(numeric_summary)

In [0]:
# Build the null mask once and reuse it for both reports below.
null_mask = reviews_df.isnull()
# Columns that contain at least one null value.
columns_with_nulls = reviews_df.columns[null_mask.any()].tolist()
print(columns_with_nulls)
# Number of records that contain at least one null value.
rows_with_nulls = null_mask.any(axis=1).sum()
print("Rows with null values:", rows_with_nulls)

In [0]:
# Summarize total_votes (count, min, max, mean) within each star_rating group.
votes_by_rating = reviews_df.groupby('star_rating')['total_votes']
results = votes_by_rating.agg(['count', 'min', 'max', 'mean'])
print(results)

In [0]:
# Count the words in each review_body entry by splitting the text on whitespace.
review_tokens = reviews_df["review_body"].str.split()
num_words = review_tokens.str.len()
print(num_words)

In [0]:
# Count how many reviews each product_title has to find the most popular products.
title_counts = reviews_df['product_title'].value_counts()
print(title_counts)

In [0]:
# Count how often each star_rating occurs to find the most popular rating.
rating_counts = reviews_df['star_rating'].value_counts()
print(rating_counts)

In [0]:
# Find the maximum and minimum review_date in the file.
# Convert the column of the already-loaded frame instead of downloading and
# parsing the whole file from S3 a second time. Values that cannot be parsed
# become NaT, which max() and min() skip.
reviews_df = reviews_df.assign(
    review_date=pd.to_datetime(reviews_df['review_date'], errors='coerce')
)
print(reviews_df['review_date'].max())
print(reviews_df['review_date'].min())

In [0]:
# Create a histogram of the star_rating data.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(reviews_df['star_rating'])
# Label the figure so it can be read on its own.
ax.set(title='Distribution of star_rating', xlabel='star_rating', ylabel='Number of reviews')
# Render explicitly: outside an inline notebook backend nothing is drawn
# until show() is called.
plt.show()