In [133]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [137]:
def missing_zero_values_table(df):
    """Summarise zero and missing values for every column of ``df``.

    Prints a two-line summary (frame shape and how many columns contain
    missing values) and returns a table indexed by column name with the
    columns 'Zero Values', 'Missing Values', '% of Total Values' (share of
    rows that are missing), 'Total Zero Missing Values',
    '% Total Zero Missing Values' and 'Data Type'.

    Only columns with at least one missing value are kept. Rows are ordered
    by '% of Total Values' (descending) and numeric cells are rounded to one
    decimal place.
    """
    n_rows, n_cols = df.shape

    # Per-column counts; the null counts are reused for the percentage below.
    zero_counts = (df == 0.00).astype(int).sum(axis=0)
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / n_rows

    report = pd.concat(
        [zero_counts, null_counts, null_share],
        axis=1,
        keys=['Zero Values', 'Missing Values', '% of Total Values'],
    )
    report['Total Zero Missing Values'] = report['Zero Values'] + report['Missing Values']
    report['% Total Zero Missing Values'] = 100 * report['Total Zero Missing Values'] / n_rows
    report['Data Type'] = df.dtypes

    # Columns without any missing value are left out of the report.
    has_missing = report['Missing Values'] != 0
    report = (
        report[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("Your selected dataframe has " + str(n_cols) + " columns and " + str(n_rows) + " Rows.\n"
          "There are " + str(report.shape[0]) + " columns that have missing values.")

    return report

In [2]:
# Both files are read with lines=True, i.e. one JSON record per line.
videogames = pd.read_json(path_or_buf='Data/meta_Video_Games.json', lines=True)
reviews = pd.read_json(path_or_buf='Data/Video_Games.json', lines=True)

## Cleaning Video Games DF

In [40]:
# Preview the first two rows of the videogames frame.
videogames.head(n=2)

Unnamed: 0,also_buy,also_view,asin,brand,category,date,description,details,feature,image,main_cat,price,rank,similar_item,tech1,tech2,title
0,,,42000742,Fidelity Electronics,"[Video Games, PC, Games]",,,,,[https://images-na.ssl-images-amazon.com/image...,Toys & Games,,">#2,623,937 in Toys & Games (See Top 100 in To...",,,,Reversi Sensory Challenger
1,[B00PADROYW],"[B0050SY5BM, B072NQJCW5, B000TI836G, B002SRSQ7...",78764343,by\n \n EA Games,"[Video Games, Xbox 360, Games, ]",,[Brand new sealed!],,,[https://images-na.ssl-images-amazon.com/image...,Video Games,,">#67,231 in Video Games (See Top 100 in Video ...",,,,Medal of Honor: Warfighter - Includes Battlefi...


In [41]:
# Show each column's dtype and non-null count for the videogames frame.
videogames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84893 entries, 0 to 84892
Data columns (total 17 columns):
also_buy        35420 non-null object
also_view       45081 non-null object
asin            84893 non-null object
brand           80863 non-null object
category        82467 non-null object
date            8273 non-null object
description     74723 non-null object
details         2261 non-null object
feature         61932 non-null object
image           62004 non-null object
main_cat        84727 non-null object
price           15555 non-null object
rank            80383 non-null object
similar_item    281 non-null object
tech1           2847 non-null object
tech2           234 non-null object
title           84879 non-null object
dtypes: object(17)
memory usage: 11.0+ MB


In [45]:
# Drop also_buy, also_view, date, details, similar_item, tech1 and tech2.
# drop() already returns a new frame, so `videogames` stays untouched without
# first copying the columns that are about to be discarded.
main_videogames = videogames.drop(
    columns=['also_buy', 'also_view', 'date', 'details', 'similar_item', 'tech1', 'tech2']
)

In [53]:
# Preview the first rows that have a price.
main_videogames.dropna(subset=['price']).head()

Unnamed: 0,asin,brand,category,description,feature,image,main_cat,price,rank,title
2,0276425316,Nintendo,"[Video Games, Retro Gaming & Microconsoles, Su...",,,,Video Games,$0.72,">#134,433 in Video Games (See Top 100 in Video...",street fighter 2 II turbo super nintendo snes ...
8,043933702X,Electronic Arts,"[Video Games, PC, Games]",[video game],[video game],[https://images-na.ssl-images-amazon.com/image...,Video Games,$49.98,">#51,505 in Video Games (See Top 100 in Video ...",Need for Speed Porsche Unleashed
13,043940133X,Star Wars,"[Video Games, PC, Games]",[Product Description Your ship has crash-lande...,[Play challenging math games against Jabba the...,[https://images-na.ssl-images-amazon.com/image...,Video Games,$24.00,">#56,730 in Video Games (See Top 100 in Video ...",Star Wars Math: Jabba's Game Galaxy
38,0700026657,Ubisoft,"[Video Games, PC, Games]",[ANNO 2070BRAND NEW - IN STOCKDVD Rom Software...,[A new era: while adhering to the fundamentals...,[https://images-na.ssl-images-amazon.com/image...,Video Games,$7.95,">#30,230 in Video Games (See Top 100 in Video ...",Anno 2070
43,0763030945,,"[Video Games, PC, Games]",[Barbie Story Maker for the PC(CD-ROM)\nWindow...,,,Books,$1.90,"17,209,518inBooks(",Barbie Storymaker Cd Rom


## Cleaning Reviews DF

In [116]:
# Collapse the raw `image` column into a 0/1 flag: 1 when the value is present.
# notna() treats both NaN and None as absent and is vectorised, unlike a
# per-row isinstance(..., float) check.
main_reviews = reviews.copy()
image = main_reviews['image'].notna().astype(int)
main_reviews['image'] = image

In [128]:
# Drop reviewTime, reviewerName, style and vote.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for columns that were
# already removed.
main_reviews = main_reviews.drop(
    columns=['reviewTime', 'reviewerName', 'style', 'vote'], errors='ignore'
)

In [138]:
# Report the columns of main_reviews that contain missing values, with their zero/missing counts.
missing_zero_values_table(main_reviews)

Your selected dataframe has 8 columns and 2565349 Rows.
There are 2 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
reviewText,0,1715,0.1,1715,0.1,object
summary,0,811,0.0,811,0.0,object


### Handle Missing Values

In [144]:
# Count reviews with (True) and without (False) a missing reviewText, on the raw `reviews` frame.
np.unique(reviews['reviewText'].isna(), return_counts=True)

(array([False,  True]), array([2563634,    1715]))

In [148]:
# Preview the first rows whose reviewText is missing.
missing_review_text = main_reviews['reviewText'].isna()
main_reviews.loc[missing_review_text].head()

Unnamed: 0,asin,image,overall,reviewText,reviewerID,summary,unixReviewTime,verified
2544,B000006RGS,1,1,,A38ZF2TRJGFMAL,The game have scratches and is not working in ...,1441324800,True
9169,B00000J9J9,1,1,,AWBMGHN0VG6L,Received a Bootleg!,1510704000,True
9398,B00000J97G,1,3,,AO2JXTWOS697B,Three Stars,1475107200,True
24008,B00002DHEV,1,1,,A1XULD000L7VRT,Ravenholm Elecronics. Doesnt work.,1498521600,True
24057,B00002DHEV,1,1,,A1S3BZ3VJ4WQ8S,$adboy$ x 2001,1487894400,False


In [149]:
# Preview the first rows whose summary is missing.
missing_summary = main_reviews['summary'].isna()
main_reviews.loc[missing_summary].head()

Unnamed: 0,asin,image,overall,reviewText,reviewerID,summary,unixReviewTime,verified
4113,B00000DMAX,0,4,Another great one for 64.,A3SBZQJ2AUD2GY,,1462406400,True
12942,B00000K2R4,0,5,"First, I was saddened that Sega ended the Drea...",AQD07G9GJ61R3,,1055116800,False
19178,B00001XDTK,0,5,Ok so I reviewed this item and give nothing bu...,A3GTEIVU7JETVA,,1351555200,True
23874,B00002EIZW,0,5,"It works great, and wasn't in too bad of condi...",A3ATJZGRK3XO50,,1454630400,True
24208,B00002DHEV,0,5,"So far, worked well(just two uses in tho)",AF4YUGIEP7AXR,,1470787200,True


In [150]:
# Count reviews with (True) and without (False) a missing summary, on the raw `reviews` frame.
np.unique(reviews['summary'].isna(), return_counts=True)

(array([False,  True]), array([2564538,     811]))

numpy.bool_