In [None]:
import numpy as np
import pandas as pd
import json 
import gzip
import random
from pandas.io.json import json_normalize 
from urllib.request import urlopen

In [None]:
# Sampling configuration shared by every per-category pipeline in this notebook.
size = 5         # number of rows drawn from each group
replace = False  # draw without replacement, so no row is picked twice

# Seed both generators used in this notebook (`random.sample` picks the
# products, `np.random.choice` picks the rows) so that a fresh
# Restart & Run All reproduces the same samples.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


def fn(obj):
    """Return `size` randomly chosen rows of `obj`.

    Intended as the callable for ``DataFrame.groupby(...).apply``: it is called
    once per group and keeps a random subset of that group's rows.

    Parameters
    ----------
    obj : pandas.DataFrame
        The rows of one group.

    Returns
    -------
    pandas.DataFrame
        `size` rows of `obj`, all columns kept, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when `replace` is False and `obj` has
        fewer than `size` rows.
    """
    # `size` and `replace` are read from the module-level settings at call time.
    chosen = np.random.choice(obj.index, size, replace)
    return obj.loc[chosen, :]

In [None]:
# Read the gzipped review dump line by line (each line is parsed as one JSON
# document) and build the grocery review DataFrame from the parsed records.
# NOTE(review): unlike most inputs in this notebook this path has no
# `Amazon_Datasets/` prefix - confirm the file location.
with gzip.open('Grocery_and_Gourmet_Food.json.gz') as f:
    data = [json.loads(line.strip()) for line in f]
Groceries = pd.DataFrame(data)

In [None]:
# Read the gzipped grocery metadata dump (each line is parsed as one JSON
# document) into a DataFrame.
with gzip.open('Amazon_Datasets/meta_Grocery_and_Gourmet_Food.json.gz') as f:
    data2 = [json.loads(line.strip()) for line in f]
Groceries_meta = pd.DataFrame(data2)

In [None]:
# Save the complete grocery metadata table as CSV. With the default
# `index=True` the DataFrame index is written as the first column.
Groceries_meta.to_csv('meta_groceries.csv')

In [None]:
# Attach the product metadata to every grocery review. The left join keeps all
# review rows, including those whose `asin` has no metadata entry.
# NOTE(review): if the metadata holds several rows for one `asin`, each matching
# review is repeated once per metadata row - confirm the metadata is unique on `asin`.
Groceries_merged = pd.merge(Groceries, Groceries_meta, how='left', on='asin')

In [None]:
# Product ids (`asin`) that occur at least 5 times in the merged grocery frame.
# `b` is a boolean Series indexed by asin; `b[b]` keeps only the True entries.
b = Groceries_merged['asin'].value_counts() >= 5
Grocery_reviews = b[b].index.tolist()

In [None]:
# Draw 3000 distinct eligible products at random, then keep only their rows.
# `random.sample` raises ValueError when fewer than 3000 products are eligible.
grocery_sample = random.sample(Grocery_reviews, 3000)
grocery_sample_df = Groceries_merged.loc[Groceries_merged['asin'].isin(grocery_sample)]

In [None]:
# Subsample every sampled product: `fn` is applied once per `asin` group and
# draws `size` rows from it, which caps each product at that many reviews.
grocery_products_df = grocery_sample_df.groupby('asin', as_index=False).apply(fn)
# reset_index() turns the index produced by the grouped apply into regular
# columns; they are removed again in the cleaning cell.
grocery_products_df = grocery_products_df.reset_index()
# Sanity check on the subsampled frame itself (not on the pre-sampling
# `grocery_sample_df`): every product must still have at least 5 rows.
print(all(grocery_products_df['asin'].value_counts() >= 5))

In [None]:
# Reviews without a vote count get 0.
grocery_products_df['vote'] = grocery_products_df['vote'].fillna(0)
# Remove the `level_0`/`level_1` columns (presumably created by the earlier
# reset_index() call - verify), then the columns not kept in the export.
grocery_products_df = grocery_products_df.drop(columns=['level_0', 'level_1'])
Grocery_df = grocery_products_df.drop(
    columns=['image_x', 'image_y', 'fit', 'similar_item', 'tech1']
)
Grocery_df.to_csv('Grocery_reviews.csv')

In [None]:
# Read the gzipped Patio, Lawn & Garden metadata dump (each line is parsed as
# one JSON document) into a DataFrame.
with gzip.open('Amazon_Datasets/meta_Patio_Lawn_and_Garden.json.gz') as f:
    data3 = [json.loads(line.strip()) for line in f]
Patio_meta = pd.DataFrame(data3)

In [None]:
# Save the complete patio metadata table as CSV. With the default
# `index=True` the DataFrame index is written as the first column.
Patio_meta.to_csv('meta_patio.csv')

In [None]:
# Read the gzipped Patio, Lawn & Garden review dump (each line is parsed as one
# JSON document) into a DataFrame.
# NOTE(review): unlike the metadata file this path has no `Amazon_Datasets/`
# prefix - confirm the file location.
with gzip.open('Patio_Lawn_and_Garden.json.gz') as f:
    data4 = [json.loads(line.strip()) for line in f]
Patio = pd.DataFrame(data4)

In [None]:
# Attach the product metadata to every patio review. The left join keeps all
# review rows, including those whose `asin` has no metadata entry.
# NOTE(review): duplicate `asin` rows in the metadata would repeat reviews in
# the result - confirm the metadata is unique on `asin`.
Patio_merged = pd.merge(Patio, Patio_meta, how='left', on='asin')

In [None]:
# Product ids (`asin`) that occur at least 5 times in the merged patio frame.
# `a` is a boolean Series indexed by asin; `a[a]` keeps only the True entries.
a = Patio_merged['asin'].value_counts() >= 5
Patio_reviews = a[a].index.tolist()

In [None]:
# Draw 3000 distinct eligible products at random, then keep only their rows.
# `random.sample` raises ValueError when fewer than 3000 products are eligible.
patio_sample = random.sample(Patio_reviews, 3000)
patio_sample_df = Patio_merged.loc[Patio_merged['asin'].isin(patio_sample)]

In [None]:
# Subsample every sampled product: `fn` is applied once per `asin` group and
# draws `size` rows from it. reset_index() then turns the resulting index into
# regular columns, which the cleaning cell removes.
patio_products_df = (
    patio_sample_df
    .groupby('asin', as_index=False)
    .apply(fn)
    .reset_index()
)
# Sanity check: every product in the subsample has at least 5 rows.
print(bool((patio_products_df['asin'].value_counts() >= 5).all()))

In [None]:
# Final clean-up before export.
# Reviews without a vote count get 0.
patio_products_df['vote'] = patio_products_df['vote'].fillna(0)
# Remove the `level_0`/`level_1` columns (presumably created by the earlier
# reset_index() call - verify), then the columns not kept in the export.
patio_products_df = patio_products_df.drop(columns=['level_0', 'level_1'])
Patio_df = patio_products_df.drop(
    columns=['image_x', 'image_y', 'fit', 'similar_item', 'tech1']
)
Patio_df.to_csv('Patio_reviews.csv')

In [None]:
# Read the gzipped Digital Music review dump (each line is parsed as one JSON
# document) into a DataFrame.
# NOTE(review): unlike the metadata file this path has no `Amazon_Datasets/`
# prefix - confirm the file location.
with gzip.open('Digital_Music.json.gz') as f:
    data5 = [json.loads(line.strip()) for line in f]
Digital_music = pd.DataFrame(data5)

In [None]:
# Read the gzipped Digital Music metadata dump (each line is parsed as one
# JSON document) into a DataFrame.
with gzip.open('Amazon_Datasets/meta_Digital_Music.json.gz') as f:
    data6 = [json.loads(line.strip()) for line in f]
Digital_music_meta = pd.DataFrame(data6)

In [None]:
# Save the complete Digital Music metadata table as CSV. With the default
# `index=True` the DataFrame index is written as the first column.
Digital_music_meta.to_csv('digital_music_meta.csv')

In [None]:
# Attach the product metadata to every music review. The left join keeps all
# review rows, including those whose `asin` has no metadata entry.
# NOTE(review): duplicate `asin` rows in the metadata would repeat reviews in
# the result - confirm the metadata is unique on `asin`.
Music_merged = pd.merge(Digital_music, Digital_music_meta, how='left', on='asin')

In [None]:
# Product ids (`asin`) that occur at least 5 times in the merged Digital Music frame.
# `c` is a boolean Series indexed by asin; `c[c]` keeps only the True entries.
c = Music_merged['asin'].value_counts() >= 5
Music_reviews = c[c].index.tolist()

In [None]:
# Draw 3000 distinct eligible products at random, then keep only their rows.
# `random.sample` raises ValueError when fewer than 3000 products are eligible.
music_sample = random.sample(Music_reviews, 3000)
music_sample_df = Music_merged.loc[Music_merged['asin'].isin(music_sample)]

In [None]:
# Subsample every sampled product: `fn` is applied once per `asin` group and
# draws `size` rows from it. reset_index() then turns the resulting index into
# regular columns, which the cleaning cell removes.
music_df = (
    music_sample_df
    .groupby('asin', as_index=False)
    .apply(fn)
    .reset_index()
)
# Sanity check: every product in the subsample has at least 5 rows.
print(bool((music_df['asin'].value_counts() >= 5).all()))

In [None]:
# Remove the `level_0`/`level_1` columns (presumably created by the earlier
# reset_index() call - verify).
music_df = music_df.drop(columns=['level_0', 'level_1'])
# Reviews without a vote count get 0.
music_df['vote'] = music_df['vote'].fillna(0)
# Drop the columns not kept in the export and write the result to CSV.
Music_df = music_df.drop(
    columns=['image_x', 'image_y', 'category', 'date', 'main_cat', 'feature']
)
Music_df.to_csv('Music_reviews.csv')

In [None]:
data7 = []
with gzip.open('Amazon_Datasets/All_Beauty.json.gz') as f:
    for l in f:
        data7.append(json.loads(l.strip()))
All_Beauty = pd.DataFrame.from_dict(data7)

In [None]:
data8 = []
with gzip.open('Amazon_Datasets/meta_All_Beauty.json.gz') as f:
    for l in f:
        data8.append(json.loads(l.strip()))
All_Beauty_meta = pd.DataFrame.from_dict(data8)

In [None]:
All_Beauty_meta.to_csv('all_beauty_meta.csv')

In [None]:
#Merging All_Beauty metadata with Reviews
All_Beauty_merged = All_Beauty.merge(All_Beauty_meta, on = 'asin', how = 'left')

In [None]:
#Selecting products in All beauty dataset where there are at least 5 reviews
d = All_Beauty_merged['asin'].value_counts() > 4
All_Beauty_reviews = list(d[d].index)

In [None]:
All_Beauty_sample = random.sample(All_Beauty_reviews, k = 3000)
all_beauty_sample_df = All_Beauty_merged[All_Beauty_merged['asin'].isin(All_Beauty_sample)]

In [None]:
All_beauty_df = all_beauty_sample_df.groupby('asin', as_index = False).apply(fn)
All_beauty_df = All_beauty_df.reset_index()
print(all(All_beauty_df['asin'].value_counts() >=5))

In [None]:
All_beauty_df = All_beauty_df.drop(['level_0','level_1'], axis = 1)
All_beauty_df['vote'] = All_beauty_df['vote'].fillna(0)
All_beauty_df = All_beauty_df.drop(['image_x', 'image_y','date','main_cat','feature'], axis = 1)
All_beauty_df.to_csv('All_beauty.csv')

In [None]:
data9 = [] 
with gzip.open('Amazon_Datasets/AMAZON_FASHION.json.gz') as f:
    for l in f:
        data9.append(json.loads(l.strip()))
Amazon_Fashion = pd.DataFrame.from_dict(data9)

In [None]:
data10 = [] 
with gzip.open('Amazon_Datasets/meta_AMAZON_FASHION.json.gz') as f:
    for l in f:
        data10.append(json.loads(l.strip()))
meta_Amazon_Fashion = pd.DataFrame.from_dict(data10)

In [None]:
meta_Amazon_Fashion.to_csv('meta_Amazon_Fashion.csv')

In [None]:
#Merging Amazon fashion with its respective metadata 
Amazon_fashion_merged = Amazon_Fashion.merge(meta_Amazon_Fashion, on = 'asin', how = 'left')

In [None]:
#Selecting products in Amazon Fashion dataset where there are at least 5 reviews
e = Amazon_fashion_merged['asin'].value_counts() > 4
Amazon_Fashion_reviews = list(e[e].index)

In [None]:
Amazon_fashion_sample = random.sample(Amazon_Fashion_reviews, k = 3000)
amazon_fashion_sample_df = Amazon_fashion_merged[Amazon_fashion_merged['asin'].isin(Amazon_fashion_sample)]

In [None]:
amazon_fashion_df = amazon_fashion_sample_df.groupby('asin', as_index = False).apply(fn)
amazon_fashion_df = amazon_fashion_df.reset_index()
print(all(amazon_fashion_df['asin'].value_counts() >=5))

In [None]:
amazon_fashion_df = amazon_fashion_df.drop(['level_0','level_1'], axis = 1)
amazon_fashion_df['vote'] = amazon_fashion_df['vote'].fillna(0)
amazon_fashion_df = amazon_fashion_df.drop(['image_x', 'image_y','date','feature'], axis = 1)
amazon_fashion_df.to_csv('amazon_fashion.csv')

In [None]:
data11 = []
with gzip.open('Amazon_Datasets/Appliances.json.gz') as f:
    for l in f:
        data11.append(json.loads(l.strip()))
Appliances = pd.DataFrame.from_dict(data11)

In [None]:
data12 = []
with gzip.open('Amazon_Datasets/meta_Appliances.json.gz') as f:
    for l in f:
        data12.append(json.loads(l.strip()))
meta_Appliances = pd.DataFrame.from_dict(data12)

In [None]:
meta_Appliances.to_csv('meta_Appliances.csv')

In [None]:
#Merging Amazon appliances with respective metadata
Appliances_merged = Appliances.merge(meta_Appliances, on = 'asin', how = 'left')

In [None]:
#Selecting products in Appliances dataset where there are at least 5 reviews
f = Appliances_merged['asin'].value_counts() > 4
Appliances_reviews = list(f[f].index)

In [None]:
appliances_sample = random.sample(Appliances_reviews, k = 3000)
appliances_sample_df = Appliances_merged[Appliances_merged['asin'].isin(appliances_sample)]

In [None]:
amazon_appliances_df = appliances_sample_df.groupby('asin', as_index = False).apply(fn)
amazon_appliances_df = amazon_appliances_df.reset_index()
print(all(amazon_appliances_df['asin'].value_counts() >=5))

In [None]:
amazon_appliances_df = amazon_appliances_df.drop(['level_0','level_1'], axis = 1)
amazon_appliances_df['vote'] = amazon_appliances_df['vote'].fillna(0)
amazon_appliances_df = amazon_appliances_df.drop(['image_x', 'image_y','date','feature'], axis = 1)
amazon_appliances_df.to_csv('Appliances.csv')

In [None]:
data13 = []
with gzip.open('Amazon_Datasets/Arts_Crafts_and_Sewing.json.gz') as f:
    for l in f:
        data13.append(json.loads(l.strip()))
Arts_Crafts= pd.DataFrame.from_dict(data13)

In [None]:
data14 = []
with gzip.open('Amazon_Datasets/meta_Arts_Crafts_and_Sewing.json.gz') as f:
    for l in f:
        data14.append(json.loads(l.strip()))
meta_Arts_Crafts= pd.DataFrame.from_dict(data14)

In [None]:
meta_Arts_Crafts.to_csv('meta_Arts_Crafts.csv')

In [None]:
g = Arts_Crafts['asin'].value_counts() > 4
Arts_Crafts_reviews = list(g[g].index)

In [None]:
Arts_Crafts_sample = random.sample(Arts_Crafts_reviews, k = 3000) 

In [None]:
Arts_Crafts_sample_df = Arts_Crafts[Arts_Crafts['asin'].isin(Arts_Crafts_sample)]

In [None]:
#Merging Arts and crafts metadata
Arts_crafts_merge = Arts_Crafts_sample_df.merge(meta_Arts_Crafts, on = 'asin', how = 'left')

In [None]:
Arts_crafts_df = Arts_crafts_merge.groupby('asin', as_index = False).apply(fn)
Arts_crafts_df = Arts_crafts_df.reset_index()
print(all(Arts_crafts_df['asin'].value_counts() >=5))

In [None]:
# Remove the `level_0`/`level_1` columns (presumably created by the earlier
# reset_index() call - verify).
Arts_crafts_df = Arts_crafts_df.drop(['level_0','level_1'], axis = 1)
# The vote count is read from `vote_x`; missing values become 0. `vote_x`
# itself stays in the frame and therefore in the CSV.
# NOTE(review): the `_x`/`_y` suffixes used in this cell suggest the metadata
# frame carries the same review columns as the review frame - confirm that the
# intended metadata file was loaded for this category.
Arts_crafts_df['vote'] = Arts_crafts_df['vote_x'].fillna(0)
# Columns not kept in the export; each label is listed exactly once.
Arts_crafts_df = Arts_crafts_df.drop(
    [
        'image_x', 'image_y', 'date', 'main_cat', 'feature', 'details', 'fit',
        'reviewText_y', 'reviewTime_y', 'reviewerID_y', 'style_y',
        'tech1', 'tech2', 'unixReviewTime_y', 'verified_y', 'vote_y',
    ],
    axis = 1,
)
Arts_crafts_df.to_csv('Arts_crafts.csv')

In [None]:
data15 = []
with gzip.open('Amazon_Datasets/Automotive.json.gz') as f:
    for l in f:
        data15.append(json.loads(l.strip()))
Automotive = pd.DataFrame.from_dict(data15)

In [None]:
data16 = []
with gzip.open('Amazon_Datasets/meta_Automotive.json.gz') as f:
    for l in f:
        data16.append(json.loads(l.strip()))
meta_Automotive = pd.DataFrame.from_dict(data16)

In [None]:
meta_Automotive.to_csv('meta_Automotive.csv')

In [None]:
h = Automotive['asin'].value_counts() > 4
Automotive_reviews = list(h[h].index)

In [None]:
Automotive_sample = random.sample(Automotive_reviews, k = 3000)

In [None]:
Automotive_sample_df = Automotive[Automotive['asin'].isin(Automotive_sample)]

In [None]:
#Merging Auto metadata
Automotive_merge = Automotive_sample_df.merge(meta_Automotive, on = 'asin', how = 'left')

In [None]:
Automotive_df = Automotive_merge.groupby('asin', as_index = False).apply(fn)
Automotive_df = Automotive_df.reset_index()
print(all(Automotive_df['asin'].value_counts() >=5))

In [None]:
# Remove the `level_0`/`level_1` columns (presumably created by the earlier
# reset_index() call - verify).
Automotive_df = Automotive_df.drop(['level_0','level_1'], axis = 1)
# Reviews without a vote count get 0.
Automotive_df['vote'] = Automotive_df['vote'].fillna(0)
# Columns not kept in the export; each label is listed exactly once.
Automotive_df = Automotive_df.drop(
    ['image_x', 'image_y', 'date', 'main_cat', 'feature', 'details', 'tech1', 'tech2'],
    axis = 1,
)
Automotive_df.to_csv('Automotive.csv')

In [None]:
data17 = []
with gzip.open('Amazon_Datasets/CDs_and_Vinyl.json.gz') as f:
    for l in f:
        data17.append(json.loads(l.strip()))
CDs_and_Vinyl = pd.DataFrame.from_dict(data17)

In [None]:
data18 = []
with gzip.open('Amazon_Datasets/meta_CDs_and_Vinyl.json.gz') as f:
    for l in f:
        data18.append(json.loads(l.strip()))
meta_CDs_and_Vinyl = pd.DataFrame.from_dict(data18)

In [None]:
meta_CDs_and_Vinyl.to_csv('meta_CDs_and_Vinyl.csv')

In [None]:
# Product ids (`asin`) with at least 5 reviews. The counts come from the raw
# review frame: `CDs_and_Vinyl_merge` is only created in a later cell, so it
# is not available here when the notebook runs top to bottom on a fresh kernel.
# `j` is a boolean Series indexed by asin; `j[j]` keeps only the True entries.
j = CDs_and_Vinyl['asin'].value_counts() > 4
CDs_and_Vinyl_reviews = list(j[j].index)

In [None]:
#Merging CD's & Vinyl metadata
CDs_and_Vinyl_merge = CDs_and_Vinyl.merge(meta_CDs_and_Vinyl, on = 'asin', how = 'left')

In [None]:
CDs_and_Vinyl_sample = random.sample(CDs_and_Vinyl_reviews, k = 3000)
CDs_and_Vinyl_sample_df = CDs_and_Vinyl_merge[CDs_and_Vinyl_merge['asin'].isin(CDs_and_Vinyl_sample)]

In [None]:
CDs_and_Vinyl_df = CDs_and_Vinyl_sample_df.groupby('asin', as_index = False).apply(fn)
CDs_and_Vinyl_df = CDs_and_Vinyl_df.reset_index()
print(all(CDs_and_Vinyl_df['asin'].value_counts() >=5))

In [None]:
CDs_and_Vinyl_df = CDs_and_Vinyl_df.drop(['level_0','level_1'], axis = 1)
CDs_and_Vinyl_df['vote'] = CDs_and_Vinyl_df['vote'].fillna(0)
CDs_and_Vinyl_df= CDs_and_Vinyl_df.drop(['image_x', 'image_y','date','feature'], axis = 1)
CDs_and_Vinyl_df.to_csv('CDs_and_Vinyl.csv')

In [None]:
data19 = []
with gzip.open('Amazon_Datasets/Industrial_and_Scientific.json.gz') as f:
    for l in f:
        data19.append(json.loads(l.strip()))
Industrial_and_Scientific = pd.DataFrame.from_dict(data19)

In [None]:
data20 = []
with gzip.open('Amazon_Datasets/meta_Industrial_and_Scientific.json.gz') as f:
    for l in f:
        data20.append(json.loads(l.strip()))
meta_Industrial_and_Scientific = pd.DataFrame.from_dict(data20)

In [None]:
meta_Industrial_and_Scientific.to_csv('meta_Industrial_and_Scientific.csv')

In [None]:
k = Industrial_and_Scientific['asin'].value_counts() > 4
Industrial_reviews = list(k[k].index)

In [None]:
Industrial_sample = random.sample(Industrial_reviews, k = 3000)
Industrial_sample_df = Industrial_and_Scientific[Industrial_and_Scientific['asin'].isin(Industrial_sample)]

In [None]:
#Merging Industrial and scientific metadata
Industrial_merge = Industrial_sample_df.merge(meta_Industrial_and_Scientific, on = 'asin', how = 'left')

In [None]:
Industrial_df = Industrial_merge.groupby('asin', as_index = False).apply(fn)
Industrial_df = Industrial_df.reset_index()
print(all(Industrial_df['asin'].value_counts() >= 5))

In [None]:
Industrial_df = Industrial_df.drop(['level_0','level_1'], axis = 1)
Industrial_df['vote'] = Industrial_df['vote'].fillna(0)
Industrial_df= Industrial_df.drop(['image_x', 'image_y','date','feature', 'tech1', 'tech2'], axis = 1)
Industrial_df.to_csv('Industrial_and_Scientific.csv')

In [None]:
data21 = []
with gzip.open('Amazon_Datasets/Kindle_Store.json.gz') as f:
    for l in f:
        data21.append(json.loads(l.strip()))
Kindle_Store = pd.DataFrame.from_dict(data21)

In [None]:
data22 = []
with gzip.open('Amazon_Datasets/meta_Kindle_Store.json.gz') as f:
    for l in f:
        data22.append(json.loads(l.strip()))
meta_Kindle_Store = pd.DataFrame.from_dict(data22)

In [None]:
meta_Kindle_Store.to_csv('meta_Kindle_Store.csv')

In [None]:
L = Kindle_Store['asin'].value_counts() > 4
Kindle_reviews = list(L[L].index)

In [None]:
Kindle_sample = random.sample(Kindle_reviews, k = 3000)
Kindle_sample_df = Kindle_Store[Kindle_Store['asin'].isin(Kindle_sample)]

In [None]:
# Attach the Kindle Store metadata to the sampled Kindle reviews. The left join
# keeps all review rows, including those whose `asin` has no metadata entry.
Kindle_merge = pd.merge(Kindle_sample_df, meta_Kindle_Store, how='left', on='asin')

In [None]:
Kindle_df = Kindle_merge.groupby('asin', as_index = False).apply(fn)
Kindle_df = Kindle_df.reset_index()
print(all(Kindle_df['asin'].value_counts() >= 5))

In [None]:
Kindle_df.head()

In [None]:
Kindle_df = Kindle_df.drop(['level_0','level_1'], axis = 1)
Kindle_df['vote'] = Kindle_df['vote'].fillna(0)
Kindle_df= Kindle_df.drop(['image_x', 'image_y','date','feature', 'tech1', 'tech2'], axis = 1)
Kindle_df.to_csv('Kindle.csv')

In [None]:
data23 = []
with gzip.open('Amazon_Datasets/Luxury_Beauty.json.gz') as f:
    for l in f:
        data23.append(json.loads(l.strip()))
Luxury_Beauty= pd.DataFrame.from_dict(data23)

In [None]:
data24 = []
with gzip.open('Amazon_Datasets/meta_Luxury_Beauty.json.gz') as f:
    for l in f:
        data24.append(json.loads(l.strip()))
meta_Luxury_Beauty = pd.DataFrame.from_dict(data24)

In [None]:
m = Luxury_Beauty['asin'].value_counts() > 4
Luxury_Beauty_reviews = list(m[m].index)

In [None]:
Luxury_Beauty_sample = random.sample(Luxury_Beauty_reviews, k = 3000)
Luxury_Beauty_sample_df = Luxury_Beauty[Luxury_Beauty['asin'].isin(Luxury_Beauty_sample)]

In [None]:
Luxury_merge = Luxury_Beauty_sample_df.merge(meta_Luxury_Beauty, on = 'asin', how = 'left')

In [None]:
Luxury_Beauty_df = Luxury_merge.groupby('asin', as_index = False).apply(fn)
Luxury_Beauty_df = Luxury_Beauty_df.reset_index()
print(all(Luxury_Beauty_df['asin'].value_counts() >= 5))

In [None]:
Luxury_df = Luxury_Beauty_df.drop(['level_0','level_1'], axis = 1)
Luxury_df['vote'] = Luxury_df['vote'].fillna(0)
Luxury_df= Luxury_df.drop(['image_x', 'image_y','feature','brand'], axis = 1)
Luxury_df.to_csv('Luxury_Beauty.csv')

In [None]:
data25 = []
with gzip.open('Amazon_Datasets/Movies_and_TV.json.gz') as f:
    for l in f:
        data25.append(json.loads(l.strip()))
Movies_and_TV= pd.DataFrame.from_dict(data25)

In [None]:
data26 = []
with gzip.open('Amazon_Datasets/meta_Movies_and_TV.json.gz') as f:
    for l in f:
        data26.append(json.loads(l.strip()))
meta_Movies_and_TV = pd.DataFrame.from_dict(data26)

In [None]:
n = Movies_and_TV['asin'].value_counts() > 4
Movies_and_TV_reviews = list(n[n].index)

In [None]:
Movies_and_TV_sample = random.sample(Movies_and_TV_reviews, k = 3000)
Movies_and_TV_sample_df = Movies_and_TV[Movies_and_TV['asin'].isin(Movies_and_TV_sample)]

In [None]:
Movies_and_TV_merge = Movies_and_TV_sample_df.merge(meta_Movies_and_TV, on = 'asin', how = 'left')

In [None]:
Movies_and_TV_df = Movies_and_TV_merge.groupby('asin', as_index = False).apply(fn)
Movies_and_TV_df = Movies_and_TV_df.reset_index()
print(all(Movies_and_TV_df['asin'].value_counts() >= 5))

In [None]:
Movies_and_TV_df = Movies_and_TV_df.drop(['level_0','level_1'], axis = 1)
Movies_and_TV_df['vote'] = Movies_and_TV_df['vote'].fillna(0)
Movies_and_TV_df= Movies_and_TV_df.drop(['image_x', 'image_y','feature','brand'], axis = 1)
Movies_and_TV_df.to_csv('Movies_and_TV.csv')

In [None]:
data27 = []
with gzip.open('Amazon_Datasets/Musical_Instruments.json.gz') as f:
    for l in f:
        data27.append(json.loads(l.strip()))
Musical_Instruments= pd.DataFrame.from_dict(data27)

In [None]:
data28 = []
with gzip.open('Amazon_Datasets/meta_Musical_Instruments.json.gz') as f:
    for l in f:
        data28.append(json.loads(l.strip()))
meta_Musical_Instruments = pd.DataFrame.from_dict(data28)

In [None]:
meta_Musical_Instruments.to_csv('meta_Musical_Instruments.csv')

In [None]:
o = Musical_Instruments['asin'].value_counts() > 4
Musical_instruments = list(o[o].index)

In [None]:
Musical_instruments_sample = random.sample(Musical_instruments, k = 3000)
Musical_instruments_sample_df = Musical_Instruments[Musical_Instruments['asin'].isin(Musical_instruments_sample)]

In [None]:
Musical_instruments_merge = Musical_instruments_sample_df.merge(meta_Musical_Instruments, on = 'asin', how = 'left')

In [None]:
Musical_instruments_df = Musical_instruments_merge.groupby('asin', as_index = False).apply(fn)
Musical_instruments_df = Musical_instruments_df.reset_index()
print(all(Musical_instruments_df['asin'].value_counts() >= 5))

In [None]:
Musical_instruments_df = Musical_instruments_df.drop(['level_0','level_1'], axis = 1)
Musical_instruments_df['vote'] = Musical_instruments_df['vote'].fillna(0)
Musical_instruments_df= Musical_instruments_df.drop(['image_x', 'image_y','feature','brand','fit','main_cat','tech1','tech2'], axis = 1)
Musical_instruments_df.to_csv('Musical_instruments.csv')

In [None]:
data29 = []
with gzip.open('Amazon_Datasets/Office_Products.json.gz') as f:
    for l in f:
        data29.append(json.loads(l.strip()))
Office_Products = pd.DataFrame.from_dict(data29)

In [None]:
data30 = []
with gzip.open('Amazon_Datasets/meta_Office_Products.json.gz') as f:
    for l in f:
        data30.append(json.loads(l.strip()))
meta_Office_Products = pd.DataFrame.from_dict(data30)

In [None]:
meta_Office_Products.to_csv('meta_Office_Products.csv')

In [None]:
p = Office_Products['asin'].value_counts() > 4
Office_products = list(p[p].index)

In [None]:
Office_products_sample = random.sample(Office_products, k = 3000)
Office_products_sample_df = Office_Products[Office_Products['asin'].isin(Office_products_sample)]

In [None]:
Office_products_merge = Office_products_sample_df.merge(meta_Office_Products, on = 'asin', how = 'left')

In [None]:
Office_products_df = Office_products_merge.groupby('asin', as_index = False).apply(fn)
Office_products_df = Office_products_df.reset_index()
print(all(Office_products_df['asin'].value_counts() >= 5))

In [None]:
Office_products_df = Office_products_df.drop(['level_0','level_1'], axis = 1)
Office_products_df['vote'] = Office_products_df['vote'].fillna(0)
Office_products_df= Office_products_df.drop(['image_x', 'image_y','feature','brand','fit','main_cat','tech1','tech2'], axis = 1)
Office_products_df.to_csv('Office_products.csv')

In [None]:
data31 = []
with gzip.open('Amazon_Datasets/Pet_Supplies.json.gz') as f:
    for l in f:
        data31.append(json.loads(l.strip()))
Pet_Supplies = pd.DataFrame.from_dict(data31)

In [None]:
data32 = []
with gzip.open('Amazon_Datasets/meta_Pet_Supplies.json.gz') as f:
    for l in f:
        data32.append(json.loads(l.strip()))
meta_Pet_Supplies = pd.DataFrame.from_dict(data32)

In [None]:
q = Pet_Supplies['asin'].value_counts() > 4
Pet_supplies = list(q[q].index)

In [None]:
Pet_supplies_sample = random.sample(Pet_supplies, k = 3000)
Pet_supplies_sample_df = Pet_Supplies[Pet_Supplies['asin'].isin(Pet_supplies_sample)]

In [None]:
Pet_supplies_merge = Pet_supplies_sample_df.merge(meta_Pet_Supplies, how = 'left', on = 'asin')

In [None]:
Pet_supplies_df = Pet_supplies_merge.groupby('asin', as_index = False).apply(fn)
Pet_supplies_df = Pet_supplies_df.reset_index()
print(all(Pet_supplies_df['asin'].value_counts() >= 5))

In [None]:
Pet_supplies_df = Pet_supplies_df.drop(['level_0','level_1'], axis = 1)
Pet_supplies_df['vote'] = Pet_supplies_df['vote'].fillna(0)
Pet_supplies_df= Pet_supplies_df.drop(['image_x', 'image_y','feature','fit','tech1','tech2'], axis = 1)
Pet_supplies_df.to_csv('Pet_supplies.csv')

In [None]:
data33 = []
with gzip.open('Amazon_Datasets/Prime_Pantry.json.gz') as f:
    for l in f:
        data33.append(json.loads(l.strip()))
Prime_Pantry = pd.DataFrame.from_dict(data33)

In [None]:
data34 = []
with gzip.open('Amazon_Datasets/meta_Prime_Pantry.json.gz') as f:
    for l in f:
        data34.append(json.loads(l.strip()))
meta_Prime_Pantry = pd.DataFrame.from_dict(data34)

In [None]:
meta_Prime_Pantry.to_csv('meta_Prime_Pantry.csv')

In [None]:
r = Prime_Pantry['asin'].value_counts() > 4
Prime_pantry = list(r[r].index)

In [None]:
Prime_pantry_sample = random.sample(Prime_pantry, k = 3000)
Prime_pantry_sample_df = Prime_Pantry[Prime_Pantry['asin'].isin(Prime_pantry_sample)]

In [None]:
Prime_pantry_merge = Prime_pantry_sample_df.merge(meta_Prime_Pantry, how = 'left', on = 'asin')

In [None]:
Prime_pantry_df = Prime_pantry_merge.groupby('asin', as_index = False).apply(fn)
Prime_pantry_df = Prime_pantry_df.reset_index()
print(all(Prime_pantry_df['asin'].value_counts() >= 5))

In [None]:
Prime_pantry_df = Prime_pantry_df.drop(['level_0','level_1'], axis = 1)
Prime_pantry_df['vote'] = Prime_pantry_df['vote'].fillna(0)
Prime_pantry_df= Prime_pantry_df.drop(['image_x', 'image_y','feature'], axis = 1)
Prime_pantry_df.to_csv('Prime_pantry.csv')

In [None]:
data35 = []
with gzip.open('Amazon_Datasets/Software.json.gz') as f:
    for l in f:
        data35.append(json.loads(l.strip()))
Software = pd.DataFrame.from_dict(data35)

In [None]:
data36 = []
with gzip.open('Amazon_Datasets/meta_Software.json.gz') as f:
    for l in f:
        data36.append(json.loads(l.strip()))
meta_Software = pd.DataFrame.from_dict(data36)

In [None]:
s = Software['asin'].value_counts() > 4
software = list(s[s].index)

In [None]:
software_sample = random.sample(software, k = 3000)
software_sample_df = Software[Software['asin'].isin(software_sample)]

In [None]:
software_merge = software_sample_df.merge(meta_Software, how = 'left', on = 'asin')

In [None]:
software_df = software_merge.groupby('asin', as_index = False).apply(fn)
software_df = software_df.reset_index()
print(all(software_df['asin'].value_counts() >= 5))

In [None]:
software_df = software_df.drop(['level_0','level_1'], axis = 1)
software_df['vote'] = software_df['vote'].fillna(0)
software_df= software_df.drop(['image_x', 'image_y','feature'], axis = 1)
software_df.to_csv('Software.csv')

In [None]:
data37 = []
with gzip.open('Amazon_Datasets/Tools_and_Home_Improvement.json.gz') as f:
    for l in f:
        data37.append(json.loads(l.strip()))
Tools = pd.DataFrame.from_dict(data37)

In [None]:
data38 = []
with gzip.open('Amazon_Datasets/meta_Tools_and_Home_Improvement.json.gz') as f:
    for l in f:
        data38.append(json.loads(l.strip()))
meta_Tools = pd.DataFrame.from_dict(data38)

In [None]:
t = Tools['asin'].value_counts() > 4
tools = list(t[t].index)

In [None]:
tools_sample = random.sample(tools, k = 3000)
tools_sample_df = Tools[Tools['asin'].isin(tools_sample)]

In [None]:
tools_merge = tools_sample_df.merge(meta_Tools, how = 'left', on = 'asin')

In [None]:
tools_df = tools_merge.groupby('asin', as_index = False).apply(fn)
tools_df = tools_df.reset_index()
print(all(tools_df['asin'].value_counts() >= 5))

In [None]:
tools_df = tools_df.drop(['level_0','level_1'], axis = 1)
tools_df['vote'] = tools_df['vote'].fillna(0)
tools_df= tools_df.drop(['image_x', 'image_y','feature', 'tech1', 'tech2'], axis = 1)
tools_df.to_csv('Tools.csv')

In [None]:
data39 = []
with gzip.open('Amazon_Datasets/Toys_and_Games.json.gz') as f:
    for l in f:
        data39.append(json.loads(l.strip()))
Toys_and_Games = pd.DataFrame.from_dict(data39)

In [None]:
data40 = []
with gzip.open('Amazon_Datasets/meta_Toys_and_Games.json.gz') as f:
    for l in f:
        data40.append(json.loads(l.strip()))
meta_Toys_and_Games = pd.DataFrame.from_dict(data40)

In [None]:
u = Toys_and_Games['asin'].value_counts() > 4
toys = list(u[u].index)

In [None]:
toys_sample = random.sample(toys, k = 3000)
toys_sample_df = Toys_and_Games[Toys_and_Games['asin'].isin(toys_sample)]

In [None]:
toys_merge = toys_sample_df.merge(meta_Toys_and_Games, how = 'left', on = 'asin')

In [None]:
toys_df = toys_merge.groupby('asin', as_index = False).apply(fn)
toys_df = toys_df.reset_index()
print(all(toys_df['asin'].value_counts() >= 5))

In [None]:
toys_df = toys_df.drop(['level_0','level_1'], axis = 1)
toys_df['vote'] = toys_df['vote'].fillna(0)
toys_df= toys_df.drop(['image_x', 'image_y', 'tech1', 'tech2'], axis = 1)
toys_df.to_csv('Toys.csv')

In [None]:
data42 = []
with gzip.open('Amazon_Datasets/Video_Games.json.gz') as f:
    for l in f:
        data42.append(json.loads(l.strip()))
Video_Games = pd.DataFrame.from_dict(data42)

In [None]:
data43 = []
with gzip.open('Amazon_Datasets/meta_Video_Games.json.gz') as f:
    for l in f:
        data43.append(json.loads(l.strip()))
meta_Video_Games = pd.DataFrame.from_dict(data43)

In [None]:
v = Video_Games['asin'].value_counts() > 4
video_games = list(v[v].index)

In [None]:
video_games_sample = random.sample(video_games, k = 3000)
video_games_sample_df = Video_Games[Video_Games['asin'].isin(video_games_sample)]

In [None]:
video_games_merge = video_games_sample_df.merge(meta_Video_Games, how = 'left', on = 'asin')

In [None]:
video_games_df = video_games_merge.groupby('asin', as_index = False).apply(fn)
video_games_df = video_games_df.reset_index()
print(all(video_games_df['asin'].value_counts() >= 5))

In [None]:
video_games_df = video_games_df.drop(['level_0','level_1'], axis = 1)
video_games_df['vote'] = video_games_df['vote'].fillna(0)
video_games_df= video_games_df.drop(['image_x', 'image_y', 'tech1', 'tech2'], axis = 1)
video_games_df.to_csv('Video_Games.csv')