#**Section 1: Prepare Data**

##**Section 1.05: Import Data and Prepare Filepaths**

In [10]:
import json
import pandas as pd

In [11]:
file_paths = [
    '/content/BeatsPillPlus.json',
    '/content/JBLFlip5.json',
    '/content/JBLFlip6.json',
    '/content/BoseSoundlink.json',
    '/content/AnkerSoundCore2.json',
    '/content/OontZAngle3.json',
    '/content/SonySRSXB13.json',
    '/content/MusiBaby.json',
    '/content/Tribaby.json',
    '/content/MarshallEmbertonII.json'
]

loaded_data = {
    'BeatsPillPlus': '/content/BeatsPillPlus.json',
    'JBLFlip5': '/content/JBLFlip5.json',
    'JBLFlip6': '/content/JBLFlip6.json',
    'BoseSoundlink': '/content/BoseSoundlink.json',
    'AnkerSoundCore2': '/content/AnkerSoundCore2.json',
    'OontZAngle3': '/content/OontZAngle3.json',
    'SonySRSXB13': '/content/SonySRSXB13.json',
    'MusiBaby': '/content/MusiBaby.json',
    'Tribaby': '/content/Tribaby.json',
    'MarshallEmbertonII': '/content/MarshallEmbertonII.json'
}

##**Section 1.1 Load Data**

In [12]:
for file_path in file_paths:
  with open(file_path, 'r') as file:
    loaded_data[file_path.split('/')[-1].split('.')[0]] = json.load(file)

##**Section 1.2 Convert JSON To Pandas DataFrame**

In [13]:
# Initialize an empty list to store review data
reviews_data = []

# Initialize a counter to keep track of the total number of reviews
count = 0

def extract_reviews(product_name, data):
    global count  # Use the global count variable
    # Iterate over each result block in the results list
    for result in data['results']:
        asin = result['content']['asin']  # Get the ASIN of the product
        reviews = result['content']['reviews']  # Access the reviews list

        # Iterate over each review
        for review in reviews:
            count += 1
            # Extract review information and store it in a dictionary
            review_info = {
                'product_name': product_name,
                'review_id': review['id'],
                'asin': asin,
                'review_title': review['title'],
                'author': review['author'],
                'rating': review['rating'],
                'review_text': review['content'],
                'timestamp': review['timestamp'],
                'profile_id': review['profile_id'],
                'is_verified': review['is_verified'],
                'helpful_count': review.get('helpful_count', 0),
                'product_attributes': review.get('product_attributes', "")
            }

            reviews_data.append(review_info)

# Iterate through the loaded_data dictionary
for product_name, data in loaded_data.items():
    print(f"Processing {product_name}...")
    if isinstance(data, dict):
        extract_reviews(product_name, data)

# Print the total number of reviews processed
print(f"\n\nTotal number of reviews processed: {count}")

# Create a DataFrame from the reviews_data list
reviews_df = pd.DataFrame(reviews_data)

# Display the DataFrame
reviews_df.head()

Processing BeatsPillPlus...
Processing JBLFlip5...
Processing JBLFlip6...
Processing BoseSoundlink...
Processing AnkerSoundCore2...
Processing OontZAngle3...
Processing SonySRSXB13...
Processing MusiBaby...
Processing Tribaby...
Processing MarshallEmbertonII...


Total number of reviews processed: 740


Unnamed: 0,product_name,review_id,asin,review_title,author,rating,review_text,timestamp,profile_id,is_verified,helpful_count,product_attributes
0,BeatsPillPlus,R1CSGCR2DCPPS2,B016QXV2QK,5.0 out of 5 stars The best speaker I own to d...,Nicole lopez,5,I already have one that my ex bought me 13 yea...,"Reviewed in the United States April 18, 2023",AEHTY62RUJBKU7VY34I3UYQDN7IA,True,2,Color: Black
1,BeatsPillPlus,RPK8LMOBQ33B5,B016QXV2QK,5.0 out of 5 stars Yes! It is worth it!!!! Buy...,NYC WDSTK JENNIFER,5,Wow! That could be my whole review but just to...,"Reviewed in the United States October 18, 2020",AHIK4P52QYCJEI3C7ZKDEP5E6QSA,True,2,Color: White
2,BeatsPillPlus,R2DQAQ3RL2II2V,B016QXV2QK,5.0 out of 5 stars Even better than our old ja...,Chris,5,We have a jambox original that we’ve had forev...,"Reviewed in the United States July 24, 2020",AEVQN5XRWKZRQAYNJNHKACH5EOMQ,True,0,Color: Black
3,BeatsPillPlus,RQJQ6PKGWO9PV,B016QXV2QK,4.0 out of 5 stars Amazing sound,Amazon Customer,4,I was looking for a good speaker that not only...,"Reviewed in the United States November 27, 2016",AHLS3IUUEKAIK42VFWMWZN2GHQAA,True,18,Color: White
4,BeatsPillPlus,R3430B3F39GCEC,B016QXV2QK,4.0 out of 5 stars Works great!,BSP,4,Part of my job includes giving presentations o...,"Reviewed in the United States January 2, 2020",AHISN46CYDOATATCIHYVMSZLCZHA,True,2,Color: White


In [14]:
len(reviews_df)

740

#**Section 2: Clean Dataset**

##**Section 2.1: Import Nessecary Packages**

In [15]:
import numpy as np

##**Section 2.2: Drop Null (Empty) Values**

In [16]:
# Detect Null Values
reviews_df.isnull().sum()

# Drop Null
reviews_df.dropna(inplace=True)

print(len(reviews_df))

740


##**Section 2.2: Identify And Handle Outliers**

In [17]:
# Ensure ratings are within the expected range
reviews_df = reviews_df[(reviews_df['rating'] >= 1) & (reviews_df['rating'] <= 5)]

# Identify outliers in helpful_count using the IQR (Interquartile Range) method
Q1 = reviews_df['helpful_count'].quantile(0.25)
Q3 = reviews_df['helpful_count'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers in helpful_count
reviews_df = reviews_df[(reviews_df['helpful_count'] >= lower_bound) & (reviews_df['helpful_count'] <= upper_bound)]

In [18]:
# Check for categorical columns that need conversion
print("Data types before conversion:\n", reviews_df.dtypes)

# Convert categorical column 'product_attributes' to numerical using one-hot encoding
reviews_df = pd.concat([reviews_df, pd.get_dummies(reviews_df['product_attributes'], prefix='attr')], axis=1)

# Optionally, you may drop the original 'product_attributes' column
reviews_df.drop(columns=['product_attributes'], inplace=True)

Data types before conversion:
 product_name          object
review_id             object
asin                  object
review_title          object
author                object
rating                 int64
review_text           object
timestamp             object
profile_id            object
is_verified             bool
helpful_count          int64
product_attributes    object
dtype: object


In [22]:
# Review the cleaned dataset
print("What Product would you like to review?")
print(reviews_df['product_name'].unique())

inp = input()

reviews_df[reviews_df['product_name'] == inp]

What Product would you like to review?
['BeatsPillPlus' 'JBLFlip5' 'JBLFlip6' 'BoseSoundlink' 'AnkerSoundCore2'
 'OontZAngle3' 'SonySRSXB13' 'MusiBaby' 'Tribaby' 'MarshallEmbertonII']
BeatsPillPlus


Unnamed: 0,product_name,review_id,asin,review_title,author,rating,review_text,timestamp,profile_id,is_verified,...,attr_Color: White Smoke,attr_Color: WhiteStyle: Flip 6,attr_Style: Flip 5Color: Black,attr_Style: Flip 5Color: Eco Green,attr_Style: Flip 5Color: Gray,attr_Style: Flip 5Color: Green,attr_Style: Flip 5Color: Pink,attr_Style: Flip 5Color: Red,attr_Style: Flip 5Color: Teal,attr_Style: Flip 5Color: White
2,BeatsPillPlus,R2DQAQ3RL2II2V,B016QXV2QK,5.0 out of 5 stars Even better than our old ja...,Chris,5,We have a jambox original that we’ve had forev...,"Reviewed in the United States July 24, 2020",AEVQN5XRWKZRQAYNJNHKACH5EOMQ,True,...,False,False,False,False,False,False,False,False,False,False
7,BeatsPillPlus,R3V4RSDVPBJJ2S,B016QXV2QK,4.0 out of 5 stars Good quality for the price,Brian E.,4,My amazon order got lost in shipping but I did...,"Reviewed in the United States May 18, 2020",AHADPON2ZSWLZQVKNVSLQT4CTIVQ,True,...,False,False,False,False,False,False,False,False,False,False
8,BeatsPillPlus,R1W74YFDQTCB6T,B016QXV2QK,5.0 out of 5 stars Big sound in a small package.,JW,5,I gave this to my wife as a gift. She uses it ...,"Reviewed in the United States May 25, 2021",AEIMAOS463R6YKKYW4BRJN6QHOPA,True,...,False,False,False,False,False,False,False,False,False,False
9,BeatsPillPlus,R2GUWI7S3U827X,B016QXV2QK,5.0 out of 5 stars Great product will never no...,Nathaniel Pratz,5,I adore the speaker I missed the boat on the o...,"Reviewed in the United States January 13, 2021",AFMGLCE2HGVWXCBV4FZPTSWC5DBA,True,...,False,False,False,False,False,False,False,False,False,False
12,BeatsPillPlus,R2MC4WA517L588,B016QXV2QK,4.0 out of 5 stars Volume control was frustrating,Vanessa Miller,4,"As far as sound quality goes, it was awesome. ...","Reviewed in the United States March 18, 2021",AHPHSHJCFQLGFMPZL4FTBPDD2DPA,True,...,False,False,False,False,False,False,False,False,False,False
15,BeatsPillPlus,R17SRT7SDJ0N39,B016QXV2QK,5.0 out of 5 stars Just a great sound.,C2v,5,This is such an improvement over the first gen...,"Reviewed in the United States May 23, 2020",AF3IWMQNRNM6FM7YWB34ICA4CLAQ,True,...,False,False,False,False,False,False,False,False,False,False
18,BeatsPillPlus,R304J1D1XF5RHD,B016QXV2QK,4.0 out of 5 stars Good portable speaker\charg...,Pedram Salamati,4,I had a BoseMini that stoped working after 3 y...,"Reviewed in the United States October 17, 2020",AHR7CLNVBOXGOE7JC4DIWLQIU63Q,True,...,False,False,False,False,False,False,False,False,False,False
19,BeatsPillPlus,R37049Q2735C1J,B016QXV2QK,5.0 out of 5 stars Works as Advertised,Tom Waldron,5,This portable Bluetooth speaker is a dream. I ...,"Reviewed in the United States April 29, 2021",AFT6BN5ANN4NJHMFOW6IVAD2CDRA,True,...,False,False,False,False,False,False,False,False,False,False
20,BeatsPillPlus,RBTD3MTBM5K4H,B016QXV2QK,4.0 out of 5 stars Great speaker!,abraham,4,I use this EVERYWHERE from camping to car wash...,"Reviewed in the United States May 7, 2022",AFDT3MFHWQKUPTMGB26GE2BNIJVA,True,...,False,False,False,False,False,False,False,False,False,False
21,BeatsPillPlus,R3HFA8FJHYBJMI,B016QXV2QK,"5.0 out of 5 stars Loud, clear, no lag",kimberly,5,I like LOUD and this is the smallest speaker I...,"Reviewed in the United States June 21, 2020",AEP4JK6UCHUJHJCPUPFBKBON4C6A,True,...,False,False,False,False,False,False,False,False,False,False


In [23]:
reviews_df.describe()

Unnamed: 0,rating,helpful_count
count,593.0,593.0
mean,4.809444,0.0
std,0.44164,0.0
min,1.0,0.0
25%,5.0,0.0
50%,5.0,0.0
75%,5.0,0.0
max,5.0,0.0


In [24]:
reviews_df.to_csv('cleaned_amazon_reviews.csv', index = False)