In [None]:
import pandas as pd
import gzip
import json

# Helper function to load JSON data from a .gz file
def load_json_gz(filename):
    with gzip.open(filename, 'r') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

from concurrent.futures import ThreadPoolExecutor

# Function to process a single line of JSON data
def process_line(line):
    return json.loads(line)

# Updated helper function to load JSON data from a .gz file using multiple threads
def load_json_gz_multithreaded(filename, max_workers=16):
    # Create a thread pool executor with the specified number of workers
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Open the gzip file
        with gzip.open(filename, 'r') as f:
            # Submit each line to the executor to process in parallel
            # The map function returns a generator, so we use list() to wait for all tasks to complete
            data = list(executor.map(process_line, (line for line in f)))
    # Create a DataFrame from the processed data
    return pd.DataFrame(data)
    
# Load reviews and details data from .json.gz
reviews_data_domain1 = load_json_gz_multithreaded('./datasets/Sports_and_Outdoors_5.json.gz')
print("load1 success")

# reviews_data_domain1.columns = ['asin' ,'reviewerID', 'overall', 'timesteamp']
reviews_data_domain1 = reviews_data_domain1[['reviewerID', 'asin', 'reviewText', 'overall']]

# Filter out reviews with 'overall' score less than 3.0
reviews_data_domain1 = reviews_data_domain1[reviews_data_domain1['overall'] >= 3.0]

# Filter users with fewer than 5 interactions
reviews_data_domain1 = reviews_data_domain1.groupby('reviewerID').filter(lambda x: len(x) >= 5)

# Filter items with fewer than 5 interactions
reviews_data_domain1 = reviews_data_domain1.groupby('asin').filter(lambda x: len(x) >= 20)

print("process1 success")


# Load details data from .json.gz
details_data_domain1 = load_json_gz_multithreaded('./datasets/meta_Sports_and_Outdoors.json.gz')
print("load2 success")

# Selecting required columns from the details data
details_data_domain1 = details_data_domain1[['asin', 'title']]

print("process2 success")
# Merge the details with the filtered reviews data on 'asin' for each domain
merged_data_domain1 = pd.merge(reviews_data_domain1, details_data_domain1, on='asin', how='left')

print(merged_data_domain1)

In [2]:
df = merged_data_domain1[['reviewerID', 'asin', 'title']]
df

Unnamed: 0,reviewerID,asin,title
0,A5N0QU8JBRVQQ,0899332757,Delorme New York State Atlas &amp; Gazetteer
1,ANGLIGXJHXC6D,0899332757,Delorme New York State Atlas &amp; Gazetteer
2,ASAE9FBGAJQAH,0899332757,Delorme New York State Atlas &amp; Gazetteer
3,A1TAW91SQSU2AH,0899332757,Delorme New York State Atlas &amp; Gazetteer
4,A1SVGO4GMEOXQ0,0899332757,Delorme New York State Atlas &amp; Gazetteer
...,...,...,...
1876451,A1RYJI5X3F0FOC,B01HJ4DN08,Savage Gear 3D Topwater Duck
1876452,A1I1H15HSMP987,B01HJ4DN08,Savage Gear 3D Topwater Duck
1876453,A21PFT0LWXTUA2,B01HJ4DN08,Savage Gear 3D Topwater Duck
1876454,A1HCLZ57P6O0YM,B01HJDGJ1E,"Soft Cooling Towel for Instant Relief, 48inch ..."


In [3]:
df_new = df.groupby('reviewerID').agg(list).reset_index()
df_new

Unnamed: 0,reviewerID,asin,title
0,A0001528BGUBOEVR6T5U,"[B0009PUQ8M, B000E7SY02, B001QBJT5Y, B0031YS5U...","[Coleman Tent Kit, ROLA 59400 TX Hitch Mount 2..."
1,A0024836TBQJ1WBE4VDY,"[B005HSZG5E, B00AU67NEA, B00E2217I6, B00PRVLMK...",[uxcell&reg; Silvery Cool Grenade Shaped Car T...
2,A0029274J35Q1MYNKUWO,"[B000E59A8E, B000LC5S18, B000N8LMWY, B000N8MUR...","[Otis M-16 Chamber Brush, Long Charging Die, L..."
3,A0036244LFNO78OM4135,"[B000FELYVA, B001N3MKT2, B003BLP0I2, B0059ESJR...","[TYR Sport Men's Solid Jammer Swim Suit, Nalge..."
4,A0065811S4ANKB4TYHII,"[B0002IMP94, B0002IMP94, B00DB6ZD84, B00LR454Y...",[LimbSaver Classic Precision-Fit Recoil Pad fo...
...,...,...,...
291337,AZZYW4YOE1B6E,"[B00095M5DQ, B000HZGTUS, B005YPK9SY, B006HFBQ0...",[Ultra Pro 9-Pocket Trading Card Pages - Plati...
291338,AZZZJJPHNUQ1T,"[B000ITX474, B00N35HLG0, B004LSBBS2]",[Texsport 01113 Hide-A-Way Camouflage Hexagon...
291339,AZZZOVIBXHGDR,"[B000F7P292, B000F7V6KG, B001454768, B001454768]",[Radians Revelation Protective Shooting Glasse...
291340,AZZZPDCSVJ9X2,"[B0007TAJ0K, B001C1UGVO, B004V5XTOE, B0051F8PS...","[Allen Universal Archery Broadhead Wrench, Es..."


In [4]:
df_new = df_new[df_new['asin'].apply(lambda x: len(x) >= 5)]
df_new

Unnamed: 0,reviewerID,asin,title
0,A0001528BGUBOEVR6T5U,"[B0009PUQ8M, B000E7SY02, B001QBJT5Y, B0031YS5U...","[Coleman Tent Kit, ROLA 59400 TX Hitch Mount 2..."
1,A0024836TBQJ1WBE4VDY,"[B005HSZG5E, B00AU67NEA, B00E2217I6, B00PRVLMK...",[uxcell&reg; Silvery Cool Grenade Shaped Car T...
2,A0029274J35Q1MYNKUWO,"[B000E59A8E, B000LC5S18, B000N8LMWY, B000N8MUR...","[Otis M-16 Chamber Brush, Long Charging Die, L..."
3,A0036244LFNO78OM4135,"[B000FELYVA, B001N3MKT2, B003BLP0I2, B0059ESJR...","[TYR Sport Men's Solid Jammer Swim Suit, Nalge..."
4,A0065811S4ANKB4TYHII,"[B0002IMP94, B0002IMP94, B00DB6ZD84, B00LR454Y...",[LimbSaver Classic Precision-Fit Recoil Pad fo...
...,...,...,...
291334,AZZW780H8VJ8N,"[B001O2PKGM, B00XJ5IUGE, B01AAGVWJG, B01E4YVCO...",[F W Klever GmbH Ballistol Multi-Purpose Aeros...
291335,AZZYI1EW1WY3M,"[B000YXBC5U, B000YXBC5U, B001PNPK5G, B00CPJ546...","[A&amp;R Sports Blade Cover, A&amp;R Sports Bl..."
291337,AZZYW4YOE1B6E,"[B00095M5DQ, B000HZGTUS, B005YPK9SY, B006HFBQ0...",[Ultra Pro 9-Pocket Trading Card Pages - Plati...
291340,AZZZPDCSVJ9X2,"[B0007TAJ0K, B001C1UGVO, B004V5XTOE, B0051F8PS...","[Allen Universal Archery Broadhead Wrench, Es..."


In [5]:
# id hash 
from itertools import chain

item_unique = list(df_new['asin'])
item_set = set(list(chain(*item_unique)))
item_hash_dict = dict()
i = 0 
for j in item_set:
    item_hash_dict[j] = i
    i += 1
    
# print(dict(list(item_hash_dict.items())[:10]))
def get_item_hash(item):
    return [item_hash_dict[i] for i in item]
        
df_new['asin'] = df_new['asin'].apply(get_item_hash)
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['asin'] = df_new['asin'].apply(get_item_hash)


Unnamed: 0,reviewerID,asin,title
0,A0001528BGUBOEVR6T5U,"[20488, 8343, 3162, 14281, 2627, 24595, 25489,...","[Coleman Tent Kit, ROLA 59400 TX Hitch Mount 2..."
1,A0024836TBQJ1WBE4VDY,"[10435, 2252, 15804, 13489, 13172]",[uxcell&reg; Silvery Cool Grenade Shaped Car T...
2,A0029274J35Q1MYNKUWO,"[13702, 5876, 20973, 22197, 4098, 16520, 18926...","[Otis M-16 Chamber Brush, Long Charging Die, L..."
3,A0036244LFNO78OM4135,"[9168, 5335, 21747, 14506, 2089, 17012]","[TYR Sport Men's Solid Jammer Swim Suit, Nalge..."
4,A0065811S4ANKB4TYHII,"[12843, 12843, 13539, 759, 9714, 7102]",[LimbSaver Classic Precision-Fit Recoil Pad fo...
...,...,...,...
291334,AZZW780H8VJ8N,"[10215, 15753, 5195, 14657, 6985, 2709]",[F W Klever GmbH Ballistol Multi-Purpose Aeros...
291335,AZZYI1EW1WY3M,"[14007, 14007, 13864, 21752, 19444, 6894, 5292...","[A&amp;R Sports Blade Cover, A&amp;R Sports Bl..."
291337,AZZYW4YOE1B6E,"[9339, 25843, 11190, 9669, 16581, 23910]",[Ultra Pro 9-Pocket Trading Card Pages - Plati...
291340,AZZZPDCSVJ9X2,"[9578, 26248, 3016, 7719, 6427, 9036, 22803, 5...","[Allen Universal Archery Broadhead Wrench, Es..."


In [6]:
# print(df_new['title'].tolist()[-2])
def remove_duplicates(row):
    seen = set()
    new_asin = []
    new_title = []
    for a, t in zip(row['asin'], row['title']):
        if a not in seen:
            seen.add(a)
            new_asin.append(a)
            new_title.append(t)
    return pd.Series([new_asin, new_title], index=['asin', 'title'])

# 应用函数并更新 DataFrame
df_new[['asin', 'title']] = df_new.apply(remove_duplicates, axis=1)
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,reviewerID,asin,title
0,A0001528BGUBOEVR6T5U,"[20488, 8343, 3162, 14281, 2627, 24595, 25489,...","[Coleman Tent Kit, ROLA 59400 TX Hitch Mount 2..."
1,A0024836TBQJ1WBE4VDY,"[10435, 2252, 15804, 13489, 13172]",[uxcell&reg; Silvery Cool Grenade Shaped Car T...
2,A0029274J35Q1MYNKUWO,"[13702, 5876, 20973, 22197, 4098, 16520, 18926...","[Otis M-16 Chamber Brush, Long Charging Die, L..."
3,A0036244LFNO78OM4135,"[9168, 5335, 21747, 14506, 2089, 17012]","[TYR Sport Men's Solid Jammer Swim Suit, Nalge..."
4,A0065811S4ANKB4TYHII,"[12843, 13539, 759, 9714, 7102]",[LimbSaver Classic Precision-Fit Recoil Pad fo...
...,...,...,...
291334,AZZW780H8VJ8N,"[10215, 15753, 5195, 14657, 6985, 2709]",[F W Klever GmbH Ballistol Multi-Purpose Aeros...
291335,AZZYI1EW1WY3M,"[14007, 13864, 21752, 19444, 6894, 5292, 6703,...","[A&amp;R Sports Blade Cover, FuelBelt Reflecti..."
291337,AZZYW4YOE1B6E,"[9339, 25843, 11190, 9669, 16581, 23910]",[Ultra Pro 9-Pocket Trading Card Pages - Plati...
291340,AZZZPDCSVJ9X2,"[9578, 26248, 3016, 7719, 6427, 9036, 22803, 5...","[Allen Universal Archery Broadhead Wrench, Es..."


In [7]:
df_new = df_new[df_new['asin'].apply(lambda x: len(x) >= 5)] #129035

In [8]:
df_new.to_csv("./datasets/sport.csv")