# Data preprocessing notebook
* load raw json format
* clean dataset
* output to csv

In [1]:
import json
from tqdm import tqdm
import pandas as pd
import os

import html
import re 
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
os.path.getsize("Electronics.json")

11508470738

In [3]:
os.path.getsize("meta_Electronics.json")

10995631365

## metadata dataset (product data)

In [4]:
def get_row_counts(file_path):
    with open(file_path, "r") as file:
        row_counts = 0
        for rows in file:
            row_counts += 1
    return row_counts

def load_json_to_df(file_path, partition = 1):
    row_counts = get_row_counts(file_path)
    with open(file_path, "r") as file:
        # start from first partition
        rows_rolling = 0
        json_object = []
        for rows in tqdm(file):
            if rows_rolling > row_counts / partition:
                print(f"Total rows processed {rows_rolling}/{row_counts}")
                break
            else:
                json_obj = json.loads(rows.strip())
                json_object.append(json_obj)
                rows_rolling += 1
                if rows_rolling % 100000 == 0:
                    print(f"Processing row: {rows_rolling}/{row_counts}")
                if rows_rolling == row_counts:
                    print(f"All rows loaded to json objects: {rows_rolling}/{row_counts}")
                    
    print("pushing json objects to pd.df")
    return pd.DataFrame(json_object)

In [6]:
df_product = load_json_to_df(file_path="meta_Electronics.json", partition = 1)

105339it [00:03, 36742.15it/s]

Processing row: 100000/786445


205085it [00:06, 36533.10it/s]

Processing row: 200000/786445


306979it [00:10, 29539.64it/s]

Processing row: 300000/786445


405943it [00:14, 32039.04it/s]

Processing row: 400000/786445


505660it [00:19, 30119.01it/s]

Processing row: 500000/786445


603418it [00:25, 24126.34it/s]

Processing row: 600000/786445


703363it [00:29, 26663.08it/s]

Processing row: 700000/786445


786445it [00:40, 19235.35it/s]


All rows loaded to json objects: 786445/786445
pushing json objects to pd.df


In [7]:
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786445 entries, 0 to 786444
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   category         786445 non-null  object
 1   tech1            786445 non-null  object
 2   description      786445 non-null  object
 3   fit              786445 non-null  object
 4   title            786445 non-null  object
 5   also_buy         786445 non-null  object
 6   tech2            786445 non-null  object
 7   brand            786445 non-null  object
 8   feature          786445 non-null  object
 9   rank             786445 non-null  object
 10  also_view        786445 non-null  object
 11  main_cat         786445 non-null  object
 12  similar_item     786445 non-null  object
 13  date             786445 non-null  object
 14  price            786445 non-null  object
 15  asin             786445 non-null  object
 16  imageURL         786445 non-null  object
 17  imageURLHi

### df_product: data definition and keep/drop rule
* category: product category and sub-catecory information - *REDUNDANT, DROP*
* tech1: primary product specification informtion in HTML format - *REDUNDANT, DROP*
* description: product description - *IMPORTANT, MUST KEEP*
* fit: this column contains wearable electronic product information in HTML format - *REDUNDANT, DROP*
* title: product title - *IMPORTANT, MUST KEEP*
* also_buy: list of asin of product which customer who bought this also bought - *REDUNDANT, CONSIDER DROPPING*
* tech2: secondary product specification informtion in HTML format - *REDUNDANT, DROP*
* brand: brand of product - *IMPORTANT, MUST KEEP*
* feature: product features - *REDUNDANT, CONSIDER DROPPING*
* rank: product rank, need to be clearned as formating is not unified - *IMPORTANT, TO BE CLEANED, MUST KEEP*
* also_view: list of asin of product which customer who viewed this also viewed - *REDUNDANT, DROP*
* main_cat: main product category - *IMPORTANT, MUST KEEP*
* similar_item: similar item, html content - *REDUNDANT, DROP*
* date: product date - *IMPORTANT, MUST KEEP*
* price: price information - *IMPORTANT, MUST KEEP*
* asin: product id - *IMPORTANT, MUST KEEP*
* imageURL: URL of product image - *REDUNDANT, DROP*
* imageURLHighRes: URL of product image - *REDUNDANT, DROP*
* details: - *REDUNDANT, DROP*


In [8]:
columns_to_drop = [
    'category', 'tech1', 'fit', 'also_buy', 'tech2', 
    'feature', 'also_view', 'similar_item', 'imageURL', 
    'imageURLHighRes', 'details'
    ]
# Dropping the redundant columns
df_product = df_product.drop(columns=columns_to_drop)

In [9]:
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786445 entries, 0 to 786444
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   description  786445 non-null  object
 1   title        786445 non-null  object
 2   brand        786445 non-null  object
 3   rank         786445 non-null  object
 4   main_cat     786445 non-null  object
 5   date         786445 non-null  object
 6   price        786445 non-null  object
 7   asin         786445 non-null  object
dtypes: object(8)
memory usage: 48.0+ MB


## Data Clearning

In [10]:
## Define function for description and value 

def clean_description(description):
    # Check if the input is a list, and convert it to a string if so
    if isinstance(description, list):
        # Join the list elements into a single string separated by spaces
        description = ' '.join(description)
    
    description = html.unescape(description)
    # Remove HTML tags
    description = re.sub('<.*?>', ' ', description)
    # Replace non-alphanumeric characters with spaces
    description = re.sub('[^0-9a-zA-Z]+', ' ', description)
    # Normalise whitespace to single space
    description = re.sub('\s+', ' ', description)
    # Trim leading and trailing spaces
    description = description.strip()
    # Convert to lowercase
    description = description.lower()
    
    return description


def clean_main_cat(main_cat):
    # Convert any HTML entities to their corresponding characters
    main_cat = html.unescape(main_cat)
    # amazon fashion category is compromised contain url so normalise it to just amazon fashion
    main_cat = main_cat.replace('AMAZON FASHION', 'Amazon Fashion')
    
    return main_cat

In [11]:
## Start cleaning

# Removing all $ signs from the 'price' column
df_product['price'] = df_product['price'].str.replace('$', '', regex=False)

# Convert price column to correct dtype to save memory
df_product['price'] = pd.to_numeric(df_product['price'], errors='coerce')

# Clean description and value
df_product['description'] = df_product['description'].apply(clean_description)
df_product['main_cat'] = df_product['main_cat'].apply(clean_main_cat)



In [12]:
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786445 entries, 0 to 786444
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   description  786445 non-null  object 
 1   title        786445 non-null  object 
 2   brand        786445 non-null  object 
 3   rank         786445 non-null  object 
 4   main_cat     786445 non-null  object 
 5   date         786445 non-null  object 
 6   price        303289 non-null  float64
 7   asin         786445 non-null  object 
dtypes: float64(1), object(7)
memory usage: 48.0+ MB


In [13]:
# drop duplicated rows
df_product = df_product.drop_duplicates(subset=['asin'])


In [14]:
df_product.info()


<class 'pandas.core.frame.DataFrame'>
Index: 756077 entries, 0 to 786444
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   description  756077 non-null  object 
 1   title        756077 non-null  object 
 2   brand        756077 non-null  object 
 3   rank         756077 non-null  object 
 4   main_cat     756077 non-null  object 
 5   date         756077 non-null  object 
 6   price        293928 non-null  float64
 7   asin         756077 non-null  object 
dtypes: float64(1), object(7)
memory usage: 51.9+ MB


Some columns were delibertly left out to maintain the integrity of column asin
* description - cleaned to just alphanumeric text
* title - cleaned
* brand - cleaned
* rank - untouched, contain multiple category rankings
* main_cat - cleaned
* date - untouched, some dates are still compromise
* price - cleaned
* asin - cleaned

In [15]:
# output to CSV
df_product.to_csv('product_cleaned.csv', index=False)

# main dataset

In [5]:
df_main = load_json_to_df(file_path="Electronics.json", partition = 1)

138423it [00:00, 290788.60it/s]

Processing row: 100000/20994353


245800it [00:00, 287487.79it/s]

Processing row: 200000/20994353


348533it [00:01, 282360.77it/s]

Processing row: 300000/20994353


382894it [00:01, 298406.71it/s]

Processing row: 400000/20994353


557139it [00:02, 308385.45it/s]

Processing row: 500000/20994353


664951it [00:02, 263670.67it/s]

Processing row: 600000/20994353


768923it [00:02, 311955.32it/s]

Processing row: 700000/20994353


836613it [00:03, 206702.86it/s]

Processing row: 800000/20994353


939537it [00:03, 279721.98it/s]

Processing row: 900000/20994353


1009359it [00:03, 312725.05it/s]

Processing row: 1000000/20994353


1142627it [00:04, 238751.46it/s]

Processing row: 1100000/20994353


1248787it [00:04, 305691.36it/s]

Processing row: 1200000/20994353


1321992it [00:05, 334309.69it/s]

Processing row: 1300000/20994353


1463049it [00:05, 232998.45it/s]

Processing row: 1400000/20994353


1567617it [00:06, 297759.43it/s]

Processing row: 1500000/20994353


1638751it [00:06, 325202.49it/s]

Processing row: 1600000/20994353


1744628it [00:06, 340785.41it/s]

Processing row: 1700000/20994353


1849789it [00:07, 150951.74it/s]

Processing row: 1800000/20994353


1951117it [00:08, 233417.07it/s]

Processing row: 1900000/20994353


2051292it [00:08, 290786.50it/s]

Processing row: 2000000/20994353


2153597it [00:08, 322018.17it/s]

Processing row: 2100000/20994353


2256060it [00:08, 334531.55it/s]

Processing row: 2200000/20994353


2290946it [00:09, 338758.97it/s]

Processing row: 2300000/20994353


2457306it [00:10, 190017.25it/s]

Processing row: 2400000/20994353


2558170it [00:10, 268060.71it/s]

Processing row: 2500000/20994353


2664820it [00:11, 323517.40it/s]

Processing row: 2600000/20994353


2734778it [00:11, 331855.82it/s]

Processing row: 2700000/20994353


2839753it [00:11, 338380.35it/s]

Processing row: 2800000/20994353


2908256it [00:11, 335477.48it/s]

Processing row: 2900000/20994353


3052055it [00:13, 156435.27it/s]

Processing row: 3000000/20994353


3152695it [00:13, 233570.36it/s]

Processing row: 3100000/20994353


3252552it [00:14, 284909.12it/s]

Processing row: 3200000/20994353


3353305it [00:14, 309698.91it/s]

Processing row: 3300000/20994353


3452969it [00:14, 324244.55it/s]

Processing row: 3400000/20994353


3551986it [00:15, 322997.78it/s]

Processing row: 3500000/20994353


3646135it [00:15, 305845.96it/s]

Processing row: 3600000/20994353


3741691it [00:17, 76516.37it/s] 

Processing row: 3700000/20994353


3833546it [00:17, 149823.09it/s]

Processing row: 3800000/20994353


3956789it [00:17, 245321.36it/s]

Processing row: 3900000/20994353


4053680it [00:18, 288988.92it/s]

Processing row: 4000000/20994353


4153077it [00:18, 316313.18it/s]

Processing row: 4100000/20994353


4249457it [00:18, 310609.32it/s]

Processing row: 4200000/20994353


4349563it [00:19, 318082.48it/s]

Processing row: 4300000/20994353


4449272it [00:19, 327528.65it/s]

Processing row: 4400000/20994353


4554835it [00:19, 338805.60it/s]

Processing row: 4500000/20994353


4588771it [00:19, 332859.20it/s]

Processing row: 4600000/20994353


4726802it [00:27, 46939.30it/s] 

Processing row: 4700000/20994353


4844911it [00:27, 137010.94it/s]

Processing row: 4800000/20994353


4945348it [00:27, 228628.11it/s]

Processing row: 4900000/20994353


5048423it [00:28, 292546.64it/s]

Processing row: 5000000/20994353


5148157it [00:28, 313150.71it/s]

Processing row: 5100000/20994353


5250897it [00:28, 331437.46it/s]

Processing row: 5200000/20994353


5353391it [00:29, 329520.33it/s]

Processing row: 5300000/20994353


5458694it [00:29, 337195.88it/s]

Processing row: 5400000/20994353


5559472it [00:29, 331913.91it/s]

Processing row: 5500000/20994353


5658253it [00:29, 324835.31it/s]

Processing row: 5600000/20994353


5694037it [00:43, 334189.22it/s]

Processing row: 5700000/20994353


5838981it [00:44, 29880.93it/s] 

Processing row: 5800000/20994353


5937756it [00:44, 74512.33it/s]

Processing row: 5900000/20994353


6035150it [00:44, 150115.51it/s]

Processing row: 6000000/20994353


6141238it [00:45, 245196.75it/s]

Processing row: 6100000/20994353


6243059it [00:45, 291848.74it/s]

Processing row: 6200000/20994353


6346081it [00:45, 324913.74it/s]

Processing row: 6300000/20994353


6447809it [00:45, 323862.89it/s]

Processing row: 6400000/20994353


6549737it [00:46, 333952.06it/s]

Processing row: 6500000/20994353


6656071it [00:46, 340324.08it/s]

Processing row: 6600000/20994353


6762370it [00:46, 347816.04it/s]

Processing row: 6700000/20994353


6873619it [00:47, 363245.64it/s]

Processing row: 6800000/20994353


6945603it [00:47, 341721.86it/s]

Processing row: 6900000/20994353


7047286it [00:47, 330663.91it/s]

Processing row: 7000000/20994353


7114753it [00:47, 332090.31it/s]

Processing row: 7100000/20994353


7260177it [01:09, 19509.96it/s] 

Processing row: 7200000/20994353


7366759it [01:09, 55547.73it/s]

Processing row: 7300000/20994353


7433830it [01:09, 95806.38it/s]

Processing row: 7400000/20994353


7540820it [01:09, 186840.19it/s]

Processing row: 7500000/20994353


7650569it [01:10, 273322.37it/s]

Processing row: 7600000/20994353


7759820it [01:10, 322933.92it/s]

Processing row: 7700000/20994353


7865211it [01:10, 334038.04it/s]

Processing row: 7800000/20994353


7936703it [01:11, 341199.71it/s]

Processing row: 7900000/20994353


8043521it [01:11, 351961.84it/s]

Processing row: 8000000/20994353


8151168it [01:11, 355014.08it/s]

Processing row: 8100000/20994353


8257176it [01:12, 344106.32it/s]

Processing row: 8200000/20994353


8363904it [01:12, 351819.58it/s]

Processing row: 8300000/20994353


8434178it [01:12, 346006.29it/s]

Processing row: 8400000/20994353


8542317it [01:12, 351089.50it/s]

Processing row: 8500000/20994353


8652149it [01:13, 362829.70it/s]

Processing row: 8600000/20994353


8761728it [01:13, 362796.06it/s]

Processing row: 8700000/20994353


8798018it [01:44, 353890.07it/s]

Processing row: 8800000/20994353


8951626it [01:44, 19327.12it/s] 

Processing row: 8900000/20994353


9052524it [01:44, 52635.51it/s]

Processing row: 9000000/20994353


9162225it [01:45, 124402.88it/s]

Processing row: 9100000/20994353


9266604it [01:45, 213666.39it/s]

Processing row: 9200000/20994353


9337165it [01:45, 266017.89it/s]

Processing row: 9300000/20994353


9448198it [01:46, 327164.08it/s]

Processing row: 9400000/20994353


9557004it [01:46, 348737.38it/s]

Processing row: 9500000/20994353


9669506it [01:46, 358117.95it/s]

Processing row: 9600000/20994353


9741153it [01:46, 345653.13it/s]

Processing row: 9700000/20994353


9849091it [01:47, 353371.70it/s]

Processing row: 9800000/20994353


9957226it [01:47, 354427.95it/s]

Processing row: 9900000/20994353


10067955it [01:47, 361431.24it/s]

Processing row: 10000000/20994353


10141844it [01:48, 365516.54it/s]

Processing row: 10100000/20994353


10258493it [01:48, 381398.08it/s]

Processing row: 10200000/20994353


10373738it [01:48, 377946.69it/s]

Processing row: 10300000/20994353


10449183it [01:48, 372037.75it/s]

Processing row: 10400000/20994353


10559967it [01:49, 361213.93it/s]

Processing row: 10500000/20994353


10674827it [01:49, 376340.85it/s]

Processing row: 10600000/20994353


10749363it [01:49, 359195.43it/s]

Processing row: 10700000/20994353


10856980it [01:50, 350197.46it/s]

Processing row: 10800000/20994353


10929184it [01:50, 356331.78it/s]

Processing row: 10900000/20994353


11039184it [03:31, 2926.59it/s]  

Processing row: 11000000/20994353


11165062it [03:31, 12424.02it/s]

Processing row: 11100000/20994353


11270566it [03:31, 36111.45it/s]

Processing row: 11200000/20994353


11340321it [03:32, 66928.16it/s]

Processing row: 11300000/20994353


11451091it [03:32, 148929.99it/s]

Processing row: 11400000/20994353


11559732it [03:32, 241756.27it/s]

Processing row: 11500000/20994353


11672370it [03:33, 318059.15it/s]

Processing row: 11600000/20994353


11746550it [03:33, 337997.80it/s]

Processing row: 11700000/20994353


11861765it [03:33, 367218.64it/s]

Processing row: 11800000/20994353


11976010it [03:33, 375552.70it/s]

Processing row: 11900000/20994353


12051613it [03:34, 358833.49it/s]

Processing row: 12000000/20994353


12167117it [03:34, 379135.76it/s]

Processing row: 12100000/20994353


12242514it [03:34, 367060.22it/s]

Processing row: 12200000/20994353


12351708it [03:34, 358546.51it/s]

Processing row: 12300000/20994353


12462985it [03:35, 366642.95it/s]

Processing row: 12400000/20994353


12574480it [03:35, 365781.70it/s]

Processing row: 12500000/20994353


12647100it [03:35, 354989.90it/s]

Processing row: 12600000/20994353


12756596it [03:35, 363529.29it/s]

Processing row: 12700000/20994353


12867544it [03:36, 366717.68it/s]

Processing row: 12800000/20994353


12941172it [03:36, 362930.17it/s]

Processing row: 12900000/20994353


13055853it [03:36, 354719.01it/s]

Processing row: 13000000/20994353


13168839it [03:37, 361094.01it/s]

Processing row: 13100000/20994353


13240623it [03:37, 353982.37it/s]

Processing row: 13200000/20994353


13356888it [03:37, 361018.91it/s]

Processing row: 13300000/20994353


13468117it [03:37, 368144.73it/s]

Processing row: 13400000/20994353


13504974it [03:38, 355237.80it/s]

Processing row: 13500000/20994353


13631886it [06:14, 2412.15it/s]  

Processing row: 13600000/20994353


13763078it [06:14, 11367.97it/s]

Processing row: 13700000/20994353


13864411it [06:15, 32414.58it/s]

Processing row: 13800000/20994353


13967896it [06:15, 82035.55it/s]

Processing row: 13900000/20994353


14037640it [06:15, 135481.30it/s]

Processing row: 14000000/20994353


14145187it [06:15, 231213.58it/s]

Processing row: 14100000/20994353


14254866it [06:16, 306515.92it/s]

Processing row: 14200000/20994353


14365995it [06:16, 335706.57it/s]

Processing row: 14300000/20994353


14439300it [06:16, 349759.73it/s]

Processing row: 14400000/20994353


14544882it [06:17, 342149.30it/s]

Processing row: 14500000/20994353


14653801it [06:17, 357644.51it/s]

Processing row: 14600000/20994353


14764636it [06:17, 360993.94it/s]

Processing row: 14700000/20994353


14839764it [06:17, 359975.25it/s]

Processing row: 14800000/20994353


14951414it [06:18, 368685.22it/s]

Processing row: 14900000/20994353


15061423it [06:18, 357235.86it/s]

Processing row: 15000000/20994353


15169944it [06:18, 357801.02it/s]

Processing row: 15100000/20994353


15241394it [06:19, 352814.15it/s]

Processing row: 15200000/20994353


15350645it [06:19, 360856.28it/s]

Processing row: 15300000/20994353


15458772it [06:19, 353529.52it/s]

Processing row: 15400000/20994353


15566688it [06:19, 357451.62it/s]

Processing row: 15500000/20994353


15642671it [06:20, 368414.87it/s]

Processing row: 15600000/20994353


15751380it [06:20, 358540.51it/s]

Processing row: 15700000/20994353


15860454it [06:20, 359414.87it/s]

Processing row: 15800000/20994353


15967837it [06:21, 354887.59it/s]

Processing row: 15900000/20994353


16040292it [06:21, 358557.48it/s]

Processing row: 16000000/20994353


16150094it [06:21, 361062.70it/s]

Processing row: 16100000/20994353


16257507it [06:21, 349369.13it/s]

Processing row: 16200000/20994353


16361940it [06:22, 342147.15it/s]

Processing row: 16300000/20994353


16469417it [06:22, 354205.77it/s]

Processing row: 16400000/20994353


16532958it [10:23, 763.44it/s]   

Processing row: 16500000/20994353


16658548it [10:23, 3594.74it/s]

Processing row: 16600000/20994353


16762334it [10:24, 11221.58it/s]

Processing row: 16700000/20994353


16865967it [10:24, 31174.36it/s]

Processing row: 16800000/20994353


16967300it [10:24, 76343.89it/s]

Processing row: 16900000/20994353


17070886it [10:25, 157458.35it/s]

Processing row: 17000000/20994353


17142274it [10:25, 221883.64it/s]

Processing row: 17100000/20994353


17247391it [10:25, 290221.07it/s]

Processing row: 17200000/20994353


17353239it [10:25, 326337.14it/s]

Processing row: 17300000/20994353


17457826it [10:26, 337747.76it/s]

Processing row: 17400000/20994353


17566728it [10:26, 354111.31it/s]

Processing row: 17500000/20994353


17637944it [10:26, 344280.94it/s]

Processing row: 17600000/20994353


17741381it [10:27, 335871.42it/s]

Processing row: 17700000/20994353


17845891it [10:27, 341159.25it/s]

Processing row: 17800000/20994353


17954983it [10:27, 349571.60it/s]

Processing row: 17900000/20994353


18060930it [10:27, 344264.45it/s]

Processing row: 18000000/20994353


18168026it [10:28, 352150.18it/s]

Processing row: 18100000/20994353


18240565it [10:28, 358306.65it/s]

Processing row: 18200000/20994353


18349780it [10:28, 361378.53it/s]

Processing row: 18300000/20994353


18461577it [10:29, 367651.75it/s]

Processing row: 18400000/20994353


18539867it [10:29, 379434.51it/s]

Processing row: 18500000/20994353


18655192it [10:29, 378193.04it/s]

Processing row: 18600000/20994353


18770603it [10:29, 380948.67it/s]

Processing row: 18700000/20994353


18849062it [10:30, 386207.74it/s]

Processing row: 18800000/20994353


18965782it [10:30, 386258.01it/s]

Processing row: 18900000/20994353


19044056it [10:30, 389122.41it/s]

Processing row: 19000000/20994353


19159800it [10:30, 380962.68it/s]

Processing row: 19100000/20994353


19280754it [10:31, 396164.18it/s]

Processing row: 19200000/20994353


19361142it [10:31, 398095.69it/s]

Processing row: 19300000/20994353


19442423it [10:31, 402676.42it/s]

Processing row: 19400000/20994353


19564326it [10:31, 401457.94it/s]

Processing row: 19500000/20994353


19646233it [10:32, 405793.24it/s]

Processing row: 19600000/20994353


19770564it [10:32, 411759.31it/s]

Processing row: 19700000/20994353


19853406it [10:32, 413062.31it/s]

Processing row: 19800000/20994353


19975456it [10:32, 398514.21it/s]

Processing row: 19900000/20994353


20055188it [10:33, 393661.08it/s]

Processing row: 20000000/20994353


20173993it [10:33, 389652.55it/s]

Processing row: 20100000/20994353


20251607it [10:33, 382806.14it/s]

Processing row: 20200000/20994353


20370150it [10:33, 391660.84it/s]

Processing row: 20300000/20994353


20447837it [10:34, 382055.76it/s]

Processing row: 20400000/20994353


20563614it [10:34, 380969.99it/s]

Processing row: 20500000/20994353


20678345it [10:34, 380099.86it/s]

Processing row: 20600000/20994353


20757280it [10:34, 387990.99it/s]

Processing row: 20700000/20994353


20875376it [10:35, 388382.20it/s]

Processing row: 20800000/20994353


20952701it [10:35, 378412.63it/s]

Processing row: 20900000/20994353


20994353it [10:35, 33033.31it/s] 


All rows loaded to json objects: 20994353/20994353
pushing json objects to pd.df


In [6]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20994353 entries, 0 to 20994352
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   overall         float64
 1   verified        bool   
 2   reviewTime      object 
 3   reviewerID      object 
 4   asin            object 
 5   style           object 
 6   reviewerName    object 
 7   reviewText      object 
 8   summary         object 
 9   unixReviewTime  int64  
 10  vote            object 
 11  image           object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 1.7+ GB


In [7]:
df_main.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"07 17, 2002",A1N070NS9CJQ2I,60009810,{'Format:': ' Hardcover'},Teri Adams,This was the first time I read Garcia-Aguilera...,Hit The Spot!,1026864000,,
1,5.0,False,"07 6, 2002",A3P0KRKOBQK1KN,60009810,{'Format:': ' Hardcover'},Willa C.,"As with all of Ms. Garcia-Aguilera's books, I ...",one hot summer is HOT HOT HOT!,1025913600,,
2,5.0,False,"07 3, 2002",A192HO2ICJ75VU,60009810,{'Format:': ' Hardcover'},Kit,I've not read any of Ms Aguilera's works befor...,One Hot Summer,1025654400,2.0,
3,4.0,False,"06 30, 2002",A2T278FKFL3BLT,60009810,{'Format:': ' Hardcover'},Andres,This romance novel is right up there with the ...,I love this book!,1025395200,3.0,
4,5.0,False,"06 28, 2002",A2ZUXVTW8RXBXW,60009810,{'Format:': ' Hardcover'},John,Carolina Garcia Aguilera has done it again. S...,One Hot Book,1025222400,,


### df_main: data definition and keep/drop rule
* overall: rating of the product                                               - `*IMPORTANT, MUST KEEP*`
* verified: boolean                                                            - `*IMPORTANT, MUST KEEP*`
* reviewTime: time of the review (raw)                                         - `*IMPORTANT, MUST KEEP*`
* reviewerID: ID of the reviewer, e.g. A2SUAM1J3GNN3B                          - `*REDUNDANT, DROP*`
* asin: ID of the product, e.g. 0000013714                                     - `*IMPORTANT, MUST KEEP*`
* style: a dictionary of the product metadata, e.g., "Format" is "Hardcover"   - `*REDUNDANT, DROP*`
* reviewerName: name of the reviewer                                           - `*REDUNDANT, DROP*`
* reviewText: text of the review                                               - `*IMPORTANT, MUST KEEP*`
* summary: summary of the review                                               - `*DROP*`
* unixReviewTime: time of the review (Unix time format), e.g. 1026864000	   - `*DROP*`
* vote: helpful votes of the review, e.g. NaN or int                           - `*KEEP*`
* image: images that users post after they have received the product           - `*KEEP*`

In [8]:
columns_to_drop_main = [
    'reviewerID', 'style', 'reviewerName', 'summary', 'unixReviewTime'
    ]
# Dropping the redundant columns
df_main = df_main.drop(columns=columns_to_drop_main)
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20994353 entries, 0 to 20994352
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   overall     float64
 1   verified    bool   
 2   reviewTime  object 
 3   asin        object 
 4   reviewText  object 
 5   vote        object 
 6   image       object 
dtypes: bool(1), float64(1), object(5)
memory usage: 981.1+ MB


In [9]:
# convert image url to boolean
df_main['image'] = df_main['image'].notna() & (df_main['image'] != '')
df_main['image'].value_counts()

image
False    20645630
True       348723
Name: count, dtype: int64

In [10]:
# convert data type
df_main['reviewTime'] = pd.to_datetime(df_main['reviewTime'], format='%m %d, %Y')

In [11]:
# drop duplicated rows
df_main = df_main.drop_duplicates()

In [12]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20576637 entries, 0 to 20994352
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   overall     float64       
 1   verified    bool          
 2   reviewTime  datetime64[ns]
 3   asin        object        
 4   reviewText  object        
 5   vote        object        
 6   image       bool          
dtypes: bool(2), datetime64[ns](1), float64(1), object(3)
memory usage: 981.2+ MB


In [13]:
df_main.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewText,vote,image
0,5.0,True,2002-07-17,60009810,This was the first time I read Garcia-Aguilera...,,False
1,5.0,False,2002-07-06,60009810,"As with all of Ms. Garcia-Aguilera's books, I ...",,False
2,5.0,False,2002-07-03,60009810,I've not read any of Ms Aguilera's works befor...,2.0,False
3,4.0,False,2002-06-30,60009810,This romance novel is right up there with the ...,3.0,False
4,5.0,False,2002-06-28,60009810,Carolina Garcia Aguilera has done it again. S...,,False


In [15]:
# output to CSV
df_main.to_csv('review_cleaned.csv', index=False)