In [133]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [137]:
def missing_zero_values_table(df):
    """Summarise zero and missing values for each column of ``df``.

    For every column the report holds the number of zeros, the number and
    percentage of nulls, the combined zero-plus-null count and percentage,
    and the column dtype.  Only columns containing at least one null are
    kept; they are ordered by null percentage (highest first) and numeric
    values are rounded to one decimal place.

    A two-line summary (frame shape and number of reported columns) is
    printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has missing values.
    """
    n_rows, n_cols = df.shape

    zero_counts = (df == 0.00).astype(int).sum(axis=0)
    null_counts = df.isnull().sum()

    # Name the three base columns directly while concatenating.
    report = pd.concat(
        [zero_counts, null_counts, 100 * null_counts / n_rows],
        axis=1,
        keys=['Zero Values', 'Missing Values', '% of Total Values'],
    )
    report['Total Zero Missing Values'] = report['Zero Values'] + report['Missing Values']
    report['% Total Zero Missing Values'] = 100 * report['Total Zero Missing Values'] / n_rows
    report['Data Type'] = df.dtypes

    # Columns without any null are left out, even if they contain zeros.
    has_nulls = report['Missing Values'] != 0
    report = report[has_nulls].sort_values('% of Total Values', ascending=False).round(1)

    print(f"Your selected dataframe has {n_cols} columns and {n_rows} Rows.\n"
          f"There are {report.shape[0]} columns that have missing values.")

    return report

In [2]:
# Load the raw product metadata and the reviews.  lines=True makes pandas
# read each file as one JSON object per line.  Paths are relative to the
# notebook's working directory.
videogames = pd.read_json('Data/meta_Video_Games.json', lines=True)
reviews = pd.read_json('Data/Video_Games.json', lines=True)

## Cleaning Video Games DF

In [45]:
# Work on a trimmed copy of the metadata: `drop` returns a new frame, so the
# raw `videogames` frame stays untouched and this cell can be re-run safely.
unused_videogame_columns = [
    'also_buy', 'also_view', 'date', 'details', 'similar_item', 'tech1', 'tech2',
]
main_videogames = videogames.drop(columns=unused_videogame_columns)

In [170]:
# Per-column missing-value report for the trimmed metadata frame.
missing_zero_values_table(main_videogames)

Your selected dataframe has 10 columns and 84893 Rows.
There are 9 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
price,0,69338,81.7,69338,81.7,object
feature,0,22961,27.0,22961,27.0,object
image,0,22889,27.0,22889,27.0,object
description,0,10170,12.0,10170,12.0,object
rank,0,4510,5.3,4510,5.3,object
brand,0,4030,4.7,4030,4.7,object
category,0,2426,2.9,2426,2.9,object
main_cat,0,166,0.2,166,0.2,object
title,0,14,0.0,14,0.0,object


In [174]:
# Show the rows whose title is missing so they can be filled in by hand.
main_videogames[main_videogames['title'].isna()]

Unnamed: 0,asin,brand,category,description,feature,image,main_cat,price,rank,title
467,B00000K3DK,L'Arc-En-Ciel,"[Video Games, PC, Games]",[Real time strategy. Ply on definitive battle...,,,,,"314,739inCDsVinyl(",
11873,B0001RBMG8,,,,,,Video Games,,"[>#179,566 in Video Games (See Top 100 in Vide...",
11876,B0001RBMGI,,,,,[https://images-na.ssl-images-amazon.com/image...,Video Games,$143.85,"[>#123,845 in Video Games (See Top 100 in Vide...",
11878,B0001RBMA4,SEGA,,,,[https://images-na.ssl-images-amazon.com/image...,Video Games,$38.05,"[>#141,914 in Video Games (See Top 100 in Vide...",
13376,B00000K3DK,L'Arc-En-Ciel,"[Video Games, PC, Games]",[Real time strategy. Ply on definitive battle...,,,,,"314,739inCDsVinyl(",
24782,B0001RBMG8,,,,,,Video Games,,"[>#179,566 in Video Games (See Top 100 in Vide...",
24785,B0001RBMGI,,,,,[https://images-na.ssl-images-amazon.com/image...,Video Games,$143.85,"[>#123,845 in Video Games (See Top 100 in Vide...",
24787,B0001RBMA4,SEGA,,,,[https://images-na.ssl-images-amazon.com/image...,Video Games,$38.05,"[>#141,914 in Video Games (See Top 100 in Vide...",
29674,B000BKUT2S,,,,,[https://images-na.ssl-images-amazon.com/image...,Video Games,$144.40,"[>#110,146 in Video Games (See Top 100 in Vide...",
38379,B0015RCVRM,Laurie Anderson,"[Video Games, Mac, Games]",[CD-Rom for MacIntosh.],,[https://images-na.ssl-images-amazon.com/image...,,$8.06,"773,718inCDsVinyl(",


### Filling in missing titles with the actual product titles

In [184]:
# Row label in `main_videogames` -> title to fill in for that row.
# NOTE(review): 'Sega Vertual Stick' may be a misspelling — confirm against the listing.
_title_by_row = {
    467: 'Dune 2000',
    11873: 'Virtual Stick Pro',
    11876: 'Saturn Control Pad Mist Gray',
    11878: 'Saturn control pad',
    13376: 'Dune 2000',
    24782: 'Virtual Stick Pro',
    24785: 'Saturn Control Pad Mist Gray',
    24787: 'Saturn control pad',
    29674: 'Sega Vertual Stick',
    38379: 'Puppet Motel',
    52373: 'Friendship Collection New Life',
    61916: 'Ranch Story Connected New World',
    63263: 'Kirby Triple Deluxe',
    64453: 'Detective Conan Phantom Rhapsody',
}

# Parallel lists consumed by the assignment cell below.
indexes = list(_title_by_row)
title_names = list(_title_by_row.values())

In [191]:
# Fill in the missing titles with a single vectorised assignment.
# A scalar `.loc[label, col] = value` silently appends a new row when the
# label does not exist; a list-like key raises KeyError instead, so a stale
# hard-coded row label is caught rather than corrupting the frame.
assert len(indexes) == len(title_names), "each row label needs exactly one title"
main_videogames.loc[indexes, 'title'] = title_names

# Every title should now be present.
assert main_videogames['title'].notna().all(), "some titles are still missing"

In [192]:
# Re-run the report: `title` should no longer be listed among the columns with missing values.
missing_zero_values_table(main_videogames)

Your selected dataframe has 10 columns and 84893 Rows.
There are 8 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
price,0,69338,81.7,69338,81.7,object
feature,0,22961,27.0,22961,27.0,object
image,0,22889,27.0,22889,27.0,object
description,0,10170,12.0,10170,12.0,object
rank,0,4510,5.3,4510,5.3,object
brand,0,4030,4.7,4030,4.7,object
category,0,2426,2.9,2426,2.9,object
main_cat,0,166,0.2,166,0.2,object


## Cleaning Reviews DF

In [116]:
# Replace the `image` column with a 0/1 flag: 1 when the review carries image
# data, 0 when the value is missing.
# notna() treats NaN and None alike as missing (an isinstance(value, float)
# check only recognises NaN) and runs vectorised instead of looping in Python.
main_reviews = reviews.copy()
image = main_reviews['image'].notna().astype('int64')
main_reviews['image'] = image

In [128]:
# Drop the columns not used downstream: reviewTime, reviewerName, style, vote.
unused_review_columns = ['reviewTime', 'reviewerName', 'style', 'vote']
main_reviews = main_reviews.drop(columns=unused_review_columns)

### Handle Missing Values

In [138]:
# Per-column missing-value report for the trimmed reviews frame.
missing_zero_values_table(main_reviews)

Your selected dataframe has 8 columns and 2565349 Rows.
There are 2 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
reviewText,0,1715,0.1,1715,0.1,object
summary,0,811,0.0,811,0.0,object


In [144]:
# Cross-check on the raw frame: counts of present (False) vs. missing (True) reviewText values.
np.unique(reviews.reviewText.isna(), return_counts=True)

(array([False,  True]), array([2563634,    1715]))

In [150]:
# Cross-check on the raw frame: counts of present (False) vs. missing (True) summary values.
np.unique(reviews.summary.isna(), return_counts=True)

(array([False,  True]), array([2564538,     811]))

In [162]:
# Substitute a placeholder for missing review text.  fillna() targets exactly
# the missing entries (NaN or None); a float type check would skip None and
# would overwrite any genuine float value.
reviewText = main_reviews.reviewText.fillna('no_text_was_given')

In [166]:
# Substitute a placeholder for missing summaries.  fillna() targets exactly
# the missing entries (NaN or None); a float type check would skip None and
# would overwrite any genuine float value.
summary = main_reviews.summary.fillna('no_summary_given')

In [168]:
# Write both placeholder-filled columns back in a single step.
main_reviews = main_reviews.assign(reviewText=reviewText, summary=summary)

In [169]:
# Re-run the report to confirm no column in the reviews frame has missing values left.
missing_zero_values_table(main_reviews)

Your selected dataframe has 8 columns and 2565349 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


##### Done handling missing values