### 1. Get the universal CSV files:
### ui_df (user_id,item_id,time) sorted by time,
### u_df (reviewerID,user_id) sorted by reviewerID,
### i_df (asin,item_id) sorted by asin

In [1]:
import random

import pandas as pd
import gzip
import os
import subprocess

# NOTE(review): hard-coded absolute working directory. Every relative path used
# below ('./<DATASET>', model folders, temp folders) resolves against it, so the
# notebook only runs where this directory exists.
os.chdir("/data/Chester/preprocessing")
os.getcwd()

'/data/Chester/preprocessing'

In [2]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object obtained by evaluating each line with ``eval``.
    """
    # The context manager closes the handle once the generator is exhausted or
    # closed; the previous bare gzip.open() never released it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() executes whatever expression the line
            # contains. Only run this on dumps from a trusted source.
            yield eval(l)

def get_df(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position and assembled with
    ``orient='index'``, so each record becomes one row.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [3]:
# Dataset selector; the directory and both file names below are derived from it.
DATASET="Beauty"
RAW_PATH=os.path.join('./',DATASET)  # per-dataset folder, relative to the cwd
DATA_FILE="reviews_{}_5.json.gz".format(DATASET)  # review dump file name
META_FILE="meta_{}.json.gz".format(DATASET)  # item metadata dump file name


In [4]:
# Download the raw dumps unless they are already on disk.
_SNAP_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'

# Create the dataset folder directly instead of spawning a shell for `mkdir`.
os.makedirs(RAW_PATH, exist_ok=True)

for _what, _fname in (('interaction data', DATA_FILE), ('item metadata', META_FILE)):
    _target = os.path.join(RAW_PATH, _fname)
    if os.path.exists(_target):
        continue
    print('Downloading ' + _what + ' into ' + RAW_PATH)
    try:
        # Argument list + cwd: nothing is interpolated into a shell command line.
        # --fail makes curl exit non-zero on HTTP errors and check=True turns
        # that into an exception instead of silently continuing.
        subprocess.run(['curl', '--fail', '-O', _SNAP_URL + _fname],
                       cwd=RAW_PATH, check=True)
    except subprocess.CalledProcessError:
        # Remove a partial file so the existence check above retries next run.
        if os.path.exists(_target):
            os.remove(_target)
        raise

Downloading interaction data into ./Beauty


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 42.7M  100 42.7M    0     0  3215k      0  0:00:13  0:00:13 --:--:-- 7991k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Downloading item metadata into ./Beauty


100 94.5M  100 94.5M    0     0  5132k      0  0:00:18  0:00:18 --:--:-- 9161k


In [5]:
# Load the raw reviews, then keep only the three interaction columns under
# their universal names.
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
_universal_names = {'asin':"item_id",'reviewerID':"user_id",'unixReviewTime':'time'}
ui_df = data_df.rename(columns=_universal_names)[['user_id','item_id','time']]

In [6]:
print(data_df.shape,ui_df.shape)

(198502, 9) (198502, 3)


In [7]:
ui_df=ui_df.sort_values(by=['time','user_id'],kind='mergesort').reset_index(drop=True)
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013"


In [8]:
ui_df.head()

Unnamed: 0,user_id,item_id,time
0,A281NPSIMI1C2R,B0000535UX,1023840000
1,A281NPSIMI1C2R,B0000535UM,1024185600
2,A281NPSIMI1C2R,B0000535UN,1024185600
3,AWIF8AR75LL9L,B000065DK4,1036627200
4,A281NPSIMI1C2R,B000052Y33,1052611200


### Check for NaN and duplicate data

In [9]:
# Column names shared by the rest of the notebook.
userID, itemID, timestamp = "user_id", "item_id", "time"

# Drop rows with a missing key field, then exact repeats of the same
# (user, item, time) triple. The index is left untouched.
_key_cols = [userID, itemID, timestamp]
ui_df = ui_df.dropna(subset=_key_cols).drop_duplicates(subset=_key_cols)
print(f'After dropped:{ui_df.shape}')
ui_df.head(5)

After dropped:(198502, 3)


Unnamed: 0,user_id,item_id,time
0,A281NPSIMI1C2R,B0000535UX,1023840000
1,A281NPSIMI1C2R,B0000535UM,1024185600
2,A281NPSIMI1C2R,B0000535UN,1024185600
3,AWIF8AR75LL9L,B000065DK4,1036627200
4,A281NPSIMI1C2R,B000052Y33,1052611200


### Check K-core

In [10]:
from collections import Counter
import numpy as np

min_u_num, min_i_num = 5, 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    """Return the values of ``df[field]`` whose row count is out of bounds.

    Args:
        df: DataFrame whose rows are counted per value of ``field``.
        field: Column to count on; ``None`` disables the check.
        max_num: Largest allowed count, or ``None`` for no upper bound.
        min_num: Smallest allowed count, or ``None`` for no lower bound.

    Returns:
        set: values occurring fewer than ``min_num`` or more than ``max_num``
        times. Empty when ``field`` is ``None`` or both bounds are ``None``.
    """
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    # Test against None explicitly: `max_num or np.inf` treated an explicit
    # bound of 0 as "no bound".
    upper = np.inf if max_num is None else max_num
    lower = -1 if min_num is None else min_num

    inter_num = Counter(df[field].values)
    ids = {id_ for id_, cnt in inter_num.items() if cnt < lower or cnt > upper}
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):
    """Drop rows of under-represented users and items until none remain.

    Each pass collects the users with fewer than ``min_u_num`` rows and the
    items with fewer than ``min_i_num`` rows, then removes every row that
    references one of them. Passes repeat because a removal can push other
    ids below their threshold.

    Args:
        df: Interaction DataFrame containing the ``userID`` and ``itemID``
            columns. It is modified in place.

    Returns:
        None.
    """
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=userID, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=itemID, max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return

        dropped_inter = pd.Series(False, index=df.index)
        if userID:
            dropped_inter |= df[userID].isin(ban_users)
        if itemID:
            dropped_inter |= df[itemID].isin(ban_items)
        # Report the number of flagged rows; len(dropped_inter) is the total
        # row count of df, not the number dropped.
        print(f'{int(dropped_inter.sum())} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)
filter_by_k_core(ui_df)
print(f'k-core shape: {ui_df.shape}')
print(f'shape after k-core: {ui_df.shape}')
ui_df[:5]

0 illegal_ids_by_inter_num, field=user_id
0 illegal_ids_by_inter_num, field=item_id
k-core shape: (198502, 3)
shape after k-core: (198502, 3)


Unnamed: 0,user_id,item_id,time
0,A281NPSIMI1C2R,B0000535UX,1023840000
1,A281NPSIMI1C2R,B0000535UM,1024185600
2,A281NPSIMI1C2R,B0000535UN,1024185600
3,AWIF8AR75LL9L,B000065DK4,1036627200
4,A281NPSIMI1C2R,B000052Y33,1052611200


In [11]:
# File names for the id-mapping tables.
u_mapping_file='u_id_mapping.csv'
i_mapping_file='i_id_mapping.csv'

# Distinct raw ids in sorted order.
uni_users=sorted(ui_df[userID].unique())
uni_items=sorted(ui_df[itemID].unique())
n_users, n_items = len(uni_users), len(uni_items)
_n_inter = ui_df.shape[0]

print(f'n_users:{n_users}')
print(f'n_items:{n_items}')
print(f'iteractions:{_n_inter}')
# Share of empty cells in the user x item matrix.
print('sparsity percent:{:.2%}'.format(1-_n_inter/(n_users*n_items)))

n_users:22363
n_items:12101
iteractions:198502
sparsity percent:99.93%


In [12]:
# Dense ids start from 0 and follow the order of uni_users / uni_items.
u_id_map=dict(zip(uni_users, range(len(uni_users))))
i_id_map=dict(zip(uni_items, range(len(uni_items))))

# Replace the raw ids in the interaction table with their dense integer ids.
for _col, _mapping in ((userID, u_id_map), (itemID, i_id_map)):
    ui_df[_col]=ui_df[_col].map(_mapping).astype(int)

# Two-column lookup tables: raw id -> dense id.
u_df = pd.DataFrame({'reviewerID': list(u_id_map.keys()), userID: list(u_id_map.values())})
i_df = pd.DataFrame({'asin': list(i_id_map.keys()), itemID: list(i_id_map.values())})

for _table, _fname in ((u_df, u_mapping_file), (i_df, i_mapping_file)):
    _table.to_csv(os.path.join(RAW_PATH, _fname), sep='\t', index=False)

print('Load mapping')

Load mapping


In [13]:
ui_df.head()

Unnamed: 0,user_id,item_id,time
0,7187,103,1023840000
1,7187,100,1024185600
2,7187,101,1024185600
3,21814,151,1036627200
4,7187,22,1052611200


In [14]:
u_df.head()

Unnamed: 0,reviewerID,user_id
0,A00414041RD0BXM6WK0GX,0
1,A00473363TJ8YSZ3YAGG9,1
2,A00700212KB3K0MVESPIY,2
3,A0078719IR14X3NNUG0F,3
4,A01198201H0E3GHV2Z17I,4


In [15]:
i_df.head()

Unnamed: 0,asin,item_id
0,7806397051,0
1,9759091062,1
2,9788072216,2
3,9790790961,3
4,9790794231,4


In [16]:
ui_df.to_csv(os.path.join(RAW_PATH, "ui_interaction.csv"), sep='\t', index=False)

### ui_df (user_id,item_id,time) sorted by time,
### u_df (reviewerID,user_id) sorted by reviewerID,
### i_df (asin,item_id) sorted by asin

### Universal settings completed


### 2.reindex meta features (only item data)

In [17]:
import os
import pandas as pd
import gzip


# NOTE(review): hard-coded absolute working directory; the relative paths below
# resolve against it.
os.chdir("/data/Chester/preprocessing")
os.getcwd()

# Dataset selector and the file names this section reads.
DATASET="Beauty"
RAW_PATH=os.path.join('./',DATASET)  # per-dataset folder, relative to the cwd
META_FILE="meta_{}.json.gz".format(DATASET)  # item metadata dump file name
i_id_mapping = 'i_id_mapping.csv'  # asin -> item_id table, read in the next cell

In [18]:
# i_df is the item mapping table (asin, item_id), sorted by asin.
# Read asin as text. Some ASINs are purely numeric (e.g. 7806397051); if the
# column were inferred as integers, the lookup against the string ASINs of the
# metadata would find no match.
i_df=pd.read_csv(os.path.join(RAW_PATH,i_id_mapping),sep='\t',dtype={'asin':str})
print(f'{i_id_mapping} {i_df.shape}')
i_df.head()

i_id_mapping.csv (12101, 2)


Unnamed: 0,asin,item_id
0,7806397051,0
1,9759091062,1
2,9788072216,2
3,9790790961,3
4,9790794231,4


In [19]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object obtained by evaluating each line with ``eval``.
    """
    # The context manager closes the handle once the generator is exhausted or
    # closed; the previous bare gzip.open() never released it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() executes whatever expression the line
            # contains. Only run this on dumps from a trusted source.
            yield eval(l)

def get_df(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position and assembled with
    ``orient='index'``, so each record becomes one row.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

i_meta_df=get_df(os.path.join(RAW_PATH, META_FILE))

print(f"item_meta_data:{i_meta_df.shape}")
i_meta_df.head()


item_meta_data:(259204, 9)


Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand
0,205616461,"As we age, our once youthful, healthy skin suc...",Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,http://ecx.images-amazon.com/images/I/41DecrGO...,{'Health & Personal Care': 461765},"[[Beauty, Skin Care, Face, Creams & Moisturize...",,,
1,558925278,Mineral Powder Brush--Apply powder or mineral ...,Eco Friendly Ecotools Quality Natural Bamboo C...,http://ecx.images-amazon.com/images/I/51L%2BzY...,{'Beauty': 402875},"[[Beauty, Tools & Accessories, Makeup Brushes ...",,,
2,733001998,"From the Greek island of Chios, this Mastiha b...",Mastiha Body Lotion,http://ecx.images-amazon.com/images/I/311WK5y1...,{'Beauty': 540255},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",,,
3,737104473,Limited edition Hello Kitty Lipstick featuring...,Hello Kitty Lustre Lipstick (See sellers comme...,http://ecx.images-amazon.com/images/I/31u6Hrzk...,{'Beauty': 931125},"[[Beauty, Makeup, Lips, Lipstick]]",,,
4,762451459,"The mermaid is an elusive (okay, mythical) cre...",Stephanie Johnson Mermaid Round Snap Mirror,http://ecx.images-amazon.com/images/I/41y2%2BF...,,"[[Beauty, Tools & Accessories, Mirrors, Makeup...",19.98,,


In [20]:
#remapping
map_dict=dict(zip(i_df['asin'],i_df["item_id"]))

i_meta_df['item_id']=i_meta_df['asin'].map(map_dict)
i_meta_df.shape

(259204, 10)

In [21]:
# Keep only the metadata rows that received an item_id from the mapping.
i_meta_df=i_meta_df.dropna(subset=['item_id'])
i_meta_df.shape

(12101, 10)

In [22]:
# The mapped ids are cast to int (dropna above has already removed the NaN rows).
i_meta_df['item_id']=i_meta_df['item_id'].astype(int)
# Stable sort by item_id, then renumber the rows from 0.
# NOTE(review): presumably required so that row position equals item_id for the
# position-indexed feature tensors saved later. Confirm.
i_meta_df=i_meta_df.sort_values(by=['item_id'],kind='mergesort').reset_index(drop=True)#very important
i_meta_df.head()

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand,item_id
0,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA,0
1,9759091062,Xtreme Brite Brightening gel is a highly conc...,Xtreme Brite Brightening Gel 1oz.,http://ecx.images-amazon.com/images/I/41QWW9v1...,{'Beauty': 52254},"[[Beauty, Hair Care, Styling Products, Creams,...",19.99,"{'also_bought': ['B0054GLD1U', 'B003BRZCUC', '...",Xtreme Brite,1
2,9788072216,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,http://ecx.images-amazon.com/images/I/51iT2k6L...,{'Beauty': 78916},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",65.86,"{'also_bought': ['B006C5OHSI', 'B006P14842', '...",Prada,2
3,9790790961,Versace Bright Crystal Perfume for Women 3 oz ...,Versace Bright Crystal Eau de Toilette Spray f...,http://ecx.images-amazon.com/images/I/418LYGLE...,{'Beauty': 764},"[[Beauty, Fragrance, Women's, Eau de Toilette]]",52.33,"{'also_bought': ['B007P7OPQQ', 'B0017JT658', '...",Versace,3
4,9790794231,STELLA For Women By STELLA MCCARTNEY 1.7 oz ED...,Stella McCartney Stella,http://ecx.images-amazon.com/images/I/31L2n60J...,{'Beauty': 142503},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",,"{'also_bought': ['B0019M21OQ', 'B000E7YM8K', '...",,4


In [23]:
origin_cols=i_meta_df.columns.tolist()

target_cols=[origin_cols[-1]]+origin_cols[:-1]
target_cols

['item_id',
 'asin',
 'description',
 'title',
 'imUrl',
 'salesRank',
 'categories',
 'price',
 'related',
 'brand']

In [24]:
target_i_df=i_meta_df[target_cols]
target_i_df.to_csv(os.path.join(RAW_PATH,'i_meta_{}.csv'.format(DATASET)),index=False)
target_i_df.head()

Unnamed: 0,item_id,asin,description,title,imUrl,salesRank,categories,price,related,brand
0,0,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
1,1,9759091062,Xtreme Brite Brightening gel is a highly conc...,Xtreme Brite Brightening Gel 1oz.,http://ecx.images-amazon.com/images/I/41QWW9v1...,{'Beauty': 52254},"[[Beauty, Hair Care, Styling Products, Creams,...",19.99,"{'also_bought': ['B0054GLD1U', 'B003BRZCUC', '...",Xtreme Brite
2,2,9788072216,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,http://ecx.images-amazon.com/images/I/51iT2k6L...,{'Beauty': 78916},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",65.86,"{'also_bought': ['B006C5OHSI', 'B006P14842', '...",Prada
3,3,9790790961,Versace Bright Crystal Perfume for Women 3 oz ...,Versace Bright Crystal Eau de Toilette Spray f...,http://ecx.images-amazon.com/images/I/418LYGLE...,{'Beauty': 764},"[[Beauty, Fragrance, Women's, Eau de Toilette]]",52.33,"{'also_bought': ['B007P7OPQQ', 'B0017JT658', '...",Versace
4,4,9790794231,STELLA For Women By STELLA MCCARTNEY 1.7 oz ED...,Stella McCartney Stella,http://ecx.images-amazon.com/images/I/31L2n60J...,{'Beauty': 142503},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",,"{'also_bought': ['B0019M21OQ', 'B000E7YM8K', '...",


In [25]:
uni_items=target_i_df['item_id'].unique()
print(f'unique items:{len(uni_items)}')
print(f'min/max of unique items:{min(uni_items)},{max(uni_items)}')

unique items:12101
min/max of unique items:0,12100


### 3.feature encoder

In [26]:
import os
import numpy as np
import pandas as pd
import random
import torch

# Seed every random number source used in this notebook.
seed=123

random.seed(seed)
# NOTE(review): the running interpreter fixed its hash seed at start-up, so this
# assignment can only influence processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# benchmark=True lets cuDNN auto-tune and choose kernels per run, which defeats
# the deterministic setting above. It must be off for reproducible results.
torch.backends.cudnn.benchmark = False

os.chdir("/data/Chester/preprocessing")
print(os.getcwd())

/data/Chester/preprocessing


In [27]:
DATASET="Beauty"
RAW_PATH=os.path.join('./',DATASET)
i_meta_df_name='i_meta_{}.csv'.format(DATASET)

item_meta_file=os.path.join(RAW_PATH,i_meta_df_name)
i_meta_df=pd.read_csv(item_meta_file)
i_meta_df.shape

(12101, 10)

In [28]:
i_meta_df.head()

Unnamed: 0,item_id,asin,description,title,imUrl,salesRank,categories,price,related,brand
0,0,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
1,1,9759091062,Xtreme Brite Brightening gel is a highly conc...,Xtreme Brite Brightening Gel 1oz.,http://ecx.images-amazon.com/images/I/41QWW9v1...,{'Beauty': 52254},"[['Beauty', 'Hair Care', 'Styling Products', '...",19.99,"{'also_bought': ['B0054GLD1U', 'B003BRZCUC', '...",Xtreme Brite
2,2,9788072216,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,http://ecx.images-amazon.com/images/I/51iT2k6L...,{'Beauty': 78916},"[['Beauty', 'Fragrance', ""Women's"", 'Eau de Pa...",65.86,"{'also_bought': ['B006C5OHSI', 'B006P14842', '...",Prada
3,3,9790790961,Versace Bright Crystal Perfume for Women 3 oz ...,Versace Bright Crystal Eau de Toilette Spray f...,http://ecx.images-amazon.com/images/I/418LYGLE...,{'Beauty': 764},"[['Beauty', 'Fragrance', ""Women's"", 'Eau de To...",52.33,"{'also_bought': ['B007P7OPQQ', 'B0017JT658', '...",Versace
4,4,9790794231,STELLA For Women By STELLA MCCARTNEY 1.7 oz ED...,Stella McCartney Stella,http://ecx.images-amazon.com/images/I/31L2n60J...,{'Beauty': 142503},"[['Beauty', 'Fragrance', ""Women's"", 'Eau de Pa...",,"{'also_bought': ['B0019M21OQ', 'B000E7YM8K', '...",


In [29]:
# Count the rows with a missing value in each metadata column. The sentence
# built later concatenates title + brand + category + description.
title_na_df=i_meta_df[i_meta_df['title'].isna()]
price_na_df=i_meta_df[i_meta_df['price'].isna()]
imUrl_na_df=i_meta_df[i_meta_df['imUrl'].isna()]
brand_na_df=i_meta_df[i_meta_df['brand'].isna()]
categories_na_df=i_meta_df[i_meta_df['categories'].isna()]
description_na_df=i_meta_df[i_meta_df['description'].isna()]

for _label, _frame in (
    ("title null:", title_na_df),
    ("price null:", price_na_df),
    ("imUrl null:", imUrl_na_df),
    ('brand null:', brand_na_df),
    ("categories null:", categories_na_df),
    ("description null:", description_na_df),
):
    print(_label, _frame.shape)



title null: (7, 10)
price null: (585, 10)
imUrl null: (7, 10)
brand null: (2098, 10)
categories null: (0, 10)
description null: (939, 10)


### Text feature processing

In [30]:
# Missing text fields become a single space so they can be concatenated as
# strings when the sentences are built.
for _text_col in ('title', 'brand', 'description'):
    i_meta_df[_text_col]=i_meta_df[_text_col].fillna(" ")

In [31]:
#-------------Text Features------------------
#remove part html:
import re

#---------------re remove html---------------------
# Non-greedy match of anything between '<' and '>' (also across newlines).
pattern = re.compile(r'<.*?>',re.S)


def _build_sentence(row):
    """Join title, brand, the first category path and description of one row."""
    parts=[row['title'],row['brand']]
    # SECURITY(review): eval() runs the stored categories text as Python code.
    cates=eval(row['categories'])
    if isinstance(cates,list):
        parts.extend(cates[0])
    parts.append(row["description"])
    sen=" ".join(parts)
    # Strip tag-like spans first, then flatten newlines into spaces.
    return pattern.sub('', sen).replace('\n'," ")


clean_sentences=[_build_sentence(row) for _,row in i_meta_df.iterrows()]

# Length statistics, counted in space-separated pieces.
_lengths=[len(s.split(" ")) for s in clean_sentences]
sentences_n=len(clean_sentences)
sum_sentences_len=sum(_lengths)
max_sentences_len=max([0]+_lengths)
min_sentences_len=min([9999]+_lengths)

print(len(clean_sentences),sentences_n)
print(f"max_sentences_len:{max_sentences_len}")
print(f"min_sentences_len:{min_sentences_len}")
print(f"avg_sentences_len:{sum_sentences_len/sentences_n}")
# Original note: the average sentence length exceeds CLIP's 77-token text limit,
# so BERT is used for the text features.

12101 12101
max_sentences_len:1254
min_sentences_len:8
avg_sentences_len:88.5107015949095


In [32]:
### Hugging face bert
from transformers import BertModel,BertTokenizer
import torch
import os

# Tokenizer and encoder are loaded from a 'bert-base-uncased' folder located in
# the current working directory.
model_path=os.path.join(os.getcwd(),'bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained(model_path)
model=BertModel.from_pretrained(model_path)
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.2/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /home/Chester/.conda/envs/graph_llm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda112.so...


  warn(msg)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [33]:
model.device

device(type='cuda', index=0)

In [34]:
# Encode all sentences with BERT in small batches and keep the position-0
# ([CLS]) vector of each. Every batch result is written to ./text_temp and read
# back afterwards (the original note cites CUDA OOM as the reason for batching).
if not os.path.isdir('./text_temp'):
    os.mkdir("./text_temp")

batch=32
debug_sentences=clean_sentences
n=len(debug_sentences)//batch  # number of full batches

assert len(debug_sentences)>=batch

# One extra, shorter batch when the sentence count is not a multiple of `batch`.
n_batches=n+1 if len(debug_sentences)%batch else n

for i in range(n_batches):
    lo,hi=i*batch,(i+1)*batch
    print(f"{lo}-{hi}")

    i_batch=debug_sentences[lo:hi]
    i_ids_temp = tokenizer(i_batch, max_length=512,truncation=True,padding="max_length",return_tensors='pt')
    i_ids_temp.to(device)
    with torch.no_grad():
        i_bert_output = model(**i_ids_temp)
    i_cls=i_bert_output.last_hidden_state[:,0,:].to("cpu")
    torch.save(i_cls,f"./text_temp/{i}.pt")
    print(i_cls.shape)

    # Release the batch tensors before the next iteration.
    del i_ids_temp,i_bert_output,i_cls
    torch.cuda.empty_cache()

# Read the per-batch files back in order and stack them into one tensor.
test_all=[torch.load(f"./text_temp/{i}.pt") for i in range(n_batches)]

text_features=torch.cat(test_all,dim=0)
torch.save(text_features,os.path.join(RAW_PATH,"text_feat.pt"))

import shutil
shutil.rmtree("./text_temp")

0-32
torch.Size([32, 768])
32-64
torch.Size([32, 768])
64-96
torch.Size([32, 768])
96-128
torch.Size([32, 768])
128-160
torch.Size([32, 768])
160-192
torch.Size([32, 768])
192-224
torch.Size([32, 768])
224-256
torch.Size([32, 768])
256-288
torch.Size([32, 768])
288-320
torch.Size([32, 768])
320-352
torch.Size([32, 768])
352-384
torch.Size([32, 768])
384-416
torch.Size([32, 768])
416-448
torch.Size([32, 768])
448-480
torch.Size([32, 768])
480-512
torch.Size([32, 768])
512-544
torch.Size([32, 768])
544-576
torch.Size([32, 768])
576-608
torch.Size([32, 768])
608-640
torch.Size([32, 768])
640-672
torch.Size([32, 768])
672-704
torch.Size([32, 768])
704-736
torch.Size([32, 768])
736-768
torch.Size([32, 768])
768-800
torch.Size([32, 768])
800-832
torch.Size([32, 768])
832-864
torch.Size([32, 768])
864-896
torch.Size([32, 768])
896-928
torch.Size([32, 768])
928-960
torch.Size([32, 768])
960-992
torch.Size([32, 768])
992-1024
torch.Size([32, 768])
1024-1056
torch.Size([32, 768])
1056-1088
torch

In [35]:
text_features.shape

torch.Size([12101, 768])

In [36]:
# Spot-check: re-encode the last 10 sentences on the CPU and compare them with
# the stored features.
model.to("cpu")
random_i_feature=tokenizer(clean_sentences[-10:],max_length=512,truncation=True,padding="max_length",return_tensors='pt')
# Inference only: no_grad avoids building an autograd graph for the comparison.
with torch.no_grad():
    random_i_feature=model(**random_i_feature)
random_i_feature=random_i_feature.last_hidden_state[:,0,:]

# The third positional argument of torch.allclose is rtol; name it so the
# (loose) 10% relative tolerance is explicit.
print(torch.allclose(random_i_feature,text_features[-10:],rtol=1e-1))
# Exact equality is printed for reference only; small floating-point
# differences between the two runs make it False in the captured output.
print(random_i_feature==text_features[-10:])

# torch.allclose(text_features,zz,atol=1e-5)

True
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])


In [37]:
model.to("cpu")
del model
torch.cuda.empty_cache()

### Image Feature processing

In [38]:
#-------------Image Features------------------
imUrl_na_df=i_meta_df[i_meta_df['imUrl'].isnull()]
print(f"image NaN count:{len(imUrl_na_df[['item_id','imUrl']])}")
print(imUrl_na_df[['item_id',"imUrl"]])

image NaN count:7
       item_id imUrl
1460      1460   NaN
3578      3578   NaN
4208      4208   NaN
4381      4381   NaN
4398      4398   NaN
10771    10771   NaN
10990    10990   NaN


In [39]:
# Count how many imUrl entries are lists and how many are plain strings.
_urls=i_meta_df['imUrl'].tolist()
list_n=sum(isinstance(u,list) for u in _urls)
string_n=sum(isinstance(u,str) for u in _urls)
list_n
#Some origin item has many image url like:[....jpg,....jpg,....jpg]

0

In [40]:
len(i_meta_df['imUrl'])

12101

In [41]:
i_meta_df['imUrl']

0        http://ecx.images-amazon.com/images/I/41Rn18Oe...
1        http://ecx.images-amazon.com/images/I/41QWW9v1...
2        http://ecx.images-amazon.com/images/I/51iT2k6L...
3        http://ecx.images-amazon.com/images/I/418LYGLE...
4        http://ecx.images-amazon.com/images/I/31L2n60J...
                               ...                        
12096    http://ecx.images-amazon.com/images/I/61enVb2X...
12097    http://ecx.images-amazon.com/images/I/41q7jpgt...
12098    http://ecx.images-amazon.com/images/I/412qdoPc...
12099    http://ecx.images-amazon.com/images/I/31JTTyCU...
12100    http://ecx.images-amazon.com/images/I/41up5%2B...
Name: imUrl, Length: 12101, dtype: object

In [42]:
import re

# pattern = re.compile('(\._).*(_\.)',re.S)
# suffix=pattern.findall(row['imUrl'])


img_list=[]
count_NaN=0
count_suffix=0

# Tally of the resolution marker found in each image URL.
resolution_suffix= {'._SX300_.':0,'._SY300_.':0,'no_suffix':0}
for idx,url in i_meta_df['imUrl'].items():
    if isinstance(url,float):
        # A float here is the NaN placeholder of a missing URL.
        count_NaN+=1
    elif isinstance(url,str):
        # First marker contained in the URL, or 'no_suffix' when neither is.
        tag=next((t for t in ('._SX300_.','._SY300_.') if t in url),'no_suffix')
        if tag=='no_suffix':
            print(idx,url)
        resolution_suffix[tag]+=1
        count_suffix+=1

print(f"NaN:{count_NaN},{resolution_suffix}:{count_suffix},suffix_sum:{sum(resolution_suffix.values())},item_n:{len(i_meta_df)}")

18 http://ecx.images-amazon.com/images/I/11ZEXJHC1VL.jpg
21 http://ecx.images-amazon.com/images/I/11F85AXJ9JL.jpg
30 http://ecx.images-amazon.com/images/I/21HN33EM9CL.jpg
34 http://ecx.images-amazon.com/images/I/41cO9Od7ZlL.jpg
35 http://ecx.images-amazon.com/images/I/21S55630NJL.jpg
36 http://ecx.images-amazon.com/images/I/212D22J613L.jpg
41 http://ecx.images-amazon.com/images/I/1169PW0QSKL.jpg
48 http://ecx.images-amazon.com/images/I/21FG5TC6FAL.jpg
49 http://ecx.images-amazon.com/images/I/118CZV58ZFL.jpg
54 http://ecx.images-amazon.com/images/I/21C6Y9AMWPL.jpg
59 http://ecx.images-amazon.com/images/I/11TFMG6Y9NL.jpg
60 http://ecx.images-amazon.com/images/I/21J2jTmvnsL.jpg
68 http://ecx.images-amazon.com/images/I/11AK1M6GSPL.jpg
87 http://ecx.images-amazon.com/images/I/11A4FZ4032L.jpg
109 http://ecx.images-amazon.com/images/I/21MVF6ESEEL.jpg
115 http://ecx.images-amazon.com/images/I/21DPnGiPj7L.jpg
116 http://ecx.images-amazon.com/images/I/31xUb0FGiZL.jpg
143 http://ecx.images-amazon

In [43]:
#nan unvalid ---------------------=======features Mean


from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel
import  time

# Encode every item image with CLIP. Rows whose URL is missing, or whose image
# cannot be fetched, decoded or saved, receive the mean embedding of the rows
# that were encoded successfully.
model = CLIPModel.from_pretrained("./clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("./clip-vit-base-patch32")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

DATASET="Beauty"
RAW_PATH=os.path.join('./',DATASET)
Image_PATH=os.path.join(RAW_PATH,'images')

IMG_FEAT_DIM=512     # width of the zero placeholder rows; must equal the width returned by get_image_features
REQUEST_TIMEOUT=30   # seconds; without a timeout one stalled connection blocks the whole loop

img_nan_id=[]        # item ids with a missing URL
img_bad_id=[]        # item ids whose image could not be processed
bad_img_count=0
invalid_rows=[]      # row positions (not item ids) that hold a placeholder

img_features=[]

if not os.path.isdir(Image_PATH):
    os.mkdir(Image_PATH)

start=time.time()
t1=start

for pos,(i,row) in enumerate(i_meta_df.iterrows()):
    # Progress line with the elapsed time of the last 50 rows.
    if i%50==0 and i!=0:
        print("iteration:",i,"\t\ttime:{:.2f}s".format(time.time()-t1))
        t1=time.time()
    # pd.isna detects any NaN value; `is np.nan` only matched one specific object.
    if pd.isna(row['imUrl']):
        print(f"{row['item_id']} is nan")
        img_nan_id.append(row['item_id'])
        invalid_rows.append(pos)
        img_features.append(torch.zeros(1,IMG_FEAT_DIM,device=device))
    else:
        try:
            image=Image.open(requests.get(row['imUrl'],stream=True,timeout=REQUEST_TIMEOUT).raw)
            image.save(os.path.join(Image_PATH,str(row['item_id']))+".jpg")
            inputs = processor(images=image, return_tensors="pt")
            inputs.to(device)
            with torch.no_grad():
                img_ft = model.get_image_features(**inputs)
            img_features.append(img_ft)
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
            # still stops the run instead of being counted as a bad image.
            print(f'{row["item_id"]}:{row["imUrl"]} is a invalid url!')
            img_bad_id.append(row['item_id'])
            bad_img_count+=1
            invalid_rows.append(pos)
            img_features.append(torch.zeros(1,IMG_FEAT_DIM,device=device))

print("abstract time ends.Total time:{:.2f}s".format(time.time()-start))

# Concatenating the (1, D) rows keeps the result 2-D even for a single item;
# stack().squeeze() would collapse that case to 1-D.
pt_img_features=torch.cat(img_features,dim=0)
print(pt_img_features.shape)

valid_mask=torch.ones(pt_img_features.shape[0],dtype=torch.bool,device=pt_img_features.device)
if invalid_rows:
    valid_mask[torch.tensor(invalid_rows,device=pt_img_features.device)]=False

# Average only the rows that were really encoded. Including the zero
# placeholders would shrink the mean used for imputation.
if valid_mask.any():
    pt_img_mean=pt_img_features[valid_mask].mean(dim=0)
else:
    pt_img_mean=pt_img_features.mean(dim=0)

# Replace placeholders by row position, so this does not depend on item_id
# being equal to the row number.
pt_img_features[~valid_mask]=pt_img_mean


print(f"invalid image url count:{bad_img_count}")

torch.save(pt_img_features.to("cpu"),os.path.join(RAW_PATH,"img_feat.pt"))
print("success!")

iteration: 50 		time:19.34s
iteration: 100 		time:17.55s
iteration: 150 		time:16.61s
iteration: 200 		time:23.23s
iteration: 250 		time:25.91s
iteration: 300 		time:25.11s
iteration: 350 		time:26.79s
iteration: 400 		time:25.12s
iteration: 450 		time:25.86s
iteration: 500 		time:25.64s
iteration: 550 		time:25.31s
iteration: 600 		time:24.81s
iteration: 650 		time:24.80s
iteration: 700 		time:25.08s
725:http://ecx.images-amazon.com/images/I/41jAeco8PWL._SY300_.jpg is a invalid url!
727:http://ecx.images-amazon.com/images/I/31P3XECD3FL._SY300_.jpg is a invalid url!
iteration: 750 		time:25.62s
iteration: 800 		time:24.11s
834:http://g-ecx.images-amazon.com/images/G/01/x-site/icons/no-img-sm._CB192198896_.gif is a invalid url!
iteration: 850 		time:25.04s
iteration: 900 		time:24.98s
iteration: 950 		time:27.30s
iteration: 1000 		time:24.03s
1046:http://ecx.images-amazon.com/images/I/21x%2BHtGtwyL._SY300_.jpg is a invalid url!
iteration: 1050 		time:25.78s
iteration: 1100 		time:24.87s

### Test feature match!

In [27]:
def test_image(url,n):
    """Re-encode the image at ``url`` and compare it with stored row ``n``.

    Prints the element-wise equality between ``pt_img_features[n]`` and the
    freshly computed image feature, both moved to the CPU.
    """
    response=requests.get(url,stream=True)
    image=Image.open(response.raw)

    batch = processor(images=image, return_tensors="pt")
    batch.to(device)
    with torch.no_grad():
        img_ft = model.get_image_features(**batch)
    stored=pt_img_features[n].to("cpu")
    print(stored==img_ft.to("cpu"))

In [28]:
test_image("http://ecx.images-amazon.com/images/I/21iMxsyDBRL._SX300_.jpg",0)

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, T

In [23]:
i_meta_df['imUrl'][0]

'http://ecx.images-amazon.com/images/I/21iMxsyDBRL._SX300_.jpg'

In [33]:
text_features=torch.load(os.path.join(RAW_PATH,"text_feat.pt"))
text_features.shape

torch.Size([18357, 768])

In [35]:
len(clean_sentences)

18357

In [36]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
random_i_feature=tokenizer(clean_sentences[-10:],max_length=512,truncation=True,padding="max_length",return_tensors='pt')
random_i_feature=model(**random_i_feature)
random_i_feature=random_i_feature.last_hidden_state[:,0,:]

In [60]:
def test_text(a,b=None):
    """Re-encode sentence(s) and compare them with the stored text features.

    Args:
        a: Index of a single sentence, or start of the slice when ``b`` is given.
        b: Optional end of the slice ``a:b``.

    Prints the ``allclose`` result, the element-wise equality, the recomputed
    vectors and the stored vectors, in that order.
    """
    # One selector serves both cases: a single index or the slice a:b.
    idx=a if b is None else slice(a,b)

    temp=tokenizer(clean_sentences[idx],max_length=512,truncation=True,padding="max_length",return_tensors='pt')
    temp=model(**temp)
    temp=temp.last_hidden_state[:,0,:]
    stored=text_features[idx]
    print(torch.allclose(temp,stored,1e-1))
    print(temp==stored)

    print(temp)
    print(stored)

In [65]:
test_text(-5,-1)

True
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])
tensor([[-0.5981, -0.5338,  0.0445,  ..., -0.1491,  0.0514, -0.1792],
        [-0.4502, -0.0870,  0.1855,  ..., -0.4759,  0.4727,  0.2527],
        [-0.8182, -0.4949, -0.4483,  ..., -0.6364,  0.0677, -0.0387],
        [-0.6295, -0.4640, -0.1709,  ..., -0.6600, -0.0348, -0.0604]],
       grad_fn=<SliceBackward0>)
tensor([[-0.5981, -0.5338,  0.0445,  ..., -0.1491,  0.0514, -0.1792],
        [-0.4502, -0.0870,  0.1855,  ..., -0.4759,  0.4727,  0.2527],
        [-0.8182, -0.4949, -0.4484,  ..., -0.6364,  0.0677, -0.0387],
        [-0.6294, -0.4640, -0.1709,  ..., -0.6600, -0.0348, -0.0604]],
       grad_fn=<SliceBackward0>)


In [51]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  