In [None]:
import sys
sys.path.append('../')

import collections
import os
import random
from pathlib import Path
import logging
import shutil
import time
from packaging import version
from collections import defaultdict

from tqdm import tqdm
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.backends.cudnn as cudnn


# This file contains utility functions for loading and saving data, parsing files, and other common tasks.

import pickle

def load_pickle(filename):
    """Read the file at ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded


def save_pickle(data, filename):
    """Serialize ``data`` into ``filename`` using the highest available pickle protocol.

    The target file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, "wb") as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)
        
import json

def load_json(file_path):
    """Open ``file_path`` in text mode and return its decoded JSON content."""
    with open(file_path, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
def ReadLineFromFile(path):
    """Return every line of the text file at ``path`` as a list of strings.

    Trailing newline characters are stripped from each line; other whitespace
    is left untouched.
    """
    with open(path, 'r') as fd:
        return [raw_line.rstrip('\n') for raw_line in fd]

def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    This is a generator: nothing is read until it is iterated, and records are
    produced one at a time rather than collected into a list.

    Args:
        path: Path to a gzip file in which every line is a Python literal
            expression (presumably a dict per review - verify against the data).

    Yields:
        The object obtained by evaluating each line.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or closed early; previously the handle was never closed.
    with gzip.open(path, 'r') as g:
        for l in g:
            # SECURITY: eval() executes arbitrary Python code found in the file.
            # Only use this on trusted data; ast.literal_eval is the safe
            # alternative when every line is a plain literal.
            yield eval(l)

In [None]:
# Load the pickled rating splits and bind each split to its own name.
data_splits = load_pickle('../../Data/data/beauty/rating_splits_augmented.pkl')
test_review_data, train_review_data, valid_review_data = (
    data_splits[split_key] for split_key in ('test', 'train', 'val')
)

# Show which keys the loaded object provides.
print(data_splits.keys())

In [None]:
# Report how many entries each split contains, in train / valid / test order.
for split_label, split_records in (
    ('train_review_data', train_review_data),
    ('valid_review_data', valid_review_data),
    ('test_review_data', test_review_data),
):
    print(split_label, len(split_records))

In [None]:
# Display the training entry at index 9 to inspect what a single record looks like.
train_review_data[9]

In [None]:
import pickle

pkl_path = '/data2/home/shyamsg/Final_Project/Data/data/beauty/exp_splits.pkl'

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(pkl_path, 'rb') as f:
    data = pickle.load(f)

print(type(data))
if isinstance(data, dict):
    print(data.keys())

# Only a dict supports .get(); for any other payload there is no 'test' split
# to look up, so fall back to None instead of raising AttributeError.
test_data = data.get('test', None) if isinstance(data, dict) else None
print(type(test_data))

# Preview the first five test entries. A missing split (None) previews nothing
# instead of crashing on the slice.
preview_items = test_data[:5] if test_data is not None else []
for i, item in enumerate(preview_items):
    print(f"Item {i}:\n", item, '\n')



<class 'dict'>
dict_keys(['train', 'val', 'test'])
<class 'list'>
Item 0:
 {'reviewerID': 'A2QKXW3LDQ66P5', 'asin': 'B005X2F7KI', 'reviewerName': 'stephanie', 'helpful': [5, 6], 'reviewText': 'Absolutely great product.  I bought this for my fourteen year old niece for Christmas and of course I had to try it out, then I tried another one, and another one and another one.  So much fun!  I even contemplated keeping a few for myself!', 'overall': 5.0, 'summary': 'Perfect!', 'unixReviewTime': 1352937600, 'reviewTime': '11 15, 2012', 'explanation': 'Absolutely great product', 'feature': 'product'} 

Item 1:
 {'reviewerID': 'A3R353FUOTJEL2', 'asin': 'B005X2F7KI', 'reviewerName': 'Tina Osborne', 'helpful': [1, 1], 'reviewText': "Love the colors. Didn't get any doubles. 1 bottle was not fully closed and the bottle chipped on the neck of the bottle. But being where the break was I just closed it and it is still usable. I wouldn't recommend this for painting your full nail (It is for art), but I 

In [None]:
import pickle
import pandas as pd

# Location of the pickled explanation splits.
pkl_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/Data/data/beauty/exp_splits.pkl'
with open(pkl_path, 'rb') as f:
    data = pickle.load(f)

# Show which splits the pickle provides.
print(data.keys())  # ['train', 'val', 'test']

# Write each split to its own CSV file in the current working directory.
for split_name in ('train', 'val', 'test'):
    # A split missing from the pickle falls back to an empty list,
    # which yields an empty DataFrame and an empty CSV.
    split_data = data.get(split_name, [])
    df = pd.DataFrame(split_data)

    csv_name = f'{split_name}_data1.csv'
    df.to_csv(csv_name, index=False)
    print(f"Saved {csv_name} with {len(df)} records.")


In [7]:
import pickle
import pandas as pd

# NOTE(review): this cell repeats the split-to-CSV export found in the
# preceding cell of the notebook; consider keeping only one copy.
pkl_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/Data/data/beauty/exp_splits.pkl'
with open(pkl_path, mode='rb') as f:
    data = pickle.load(f)

# List the available splits.
print(data.keys())  # ['train', 'val', 'test']

# Export every split as <split>_data1.csv, without the DataFrame index.
for split_name in ['train', 'val', 'test']:
    # Missing splits default to an empty list (and therefore an empty CSV).
    split_data = data.get(split_name, [])
    df = pd.DataFrame(data=split_data)
    record_count = len(df)

    df.to_csv(f'{split_name}_data1.csv', index=False)
    print(f"Saved {split_name}_data1.csv with {record_count} records.")


  from pandas.core.computation.check import NUMEXPR_INSTALLED


dict_keys(['train', 'val', 'test'])
Saved train_data1.csv with 106281 records.
Saved val_data1.csv with 13515 records.
Saved test_data1.csv with 13421 records.


In [13]:
train_data_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/train_data1.csv'
valid_data_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/val_data1.csv'
test_data_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/test_data1.csv'

# Read the identifier columns as strings. Some ASINs consist only of digits
# (e.g. 7806397051), so dtype inference could turn them into integers, which
# drops leading zeros and breaks later string-keyed lookups.
id_column_dtypes = {'reviewerID': str, 'asin': str}

train_data = pd.read_csv(train_data_path, dtype=id_column_dtypes)
valid_data = pd.read_csv(valid_data_path, dtype=id_column_dtypes)
test_data = pd.read_csv(test_data_path, dtype=id_column_dtypes)

from IPython import display
# Display the first few rows of the training DataFrame
display.display(train_data.head())

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,explanation,feature
0,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013",great quality,quality
1,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013",I think it does great coverage for the price I...,coverage
2,A3BTN14HIZET6Z,7806397051,"S. M. Randall ""WildHorseWoman""","[1, 2]","I was very happy to get this palette, now I wi...",5.0,Very nice palette!,1365984000,"04 15, 2013",I have normal to dry skin as I'm 59 years,skin
3,AWUO9P6PL1SY8,7806397051,TreMagnifique,"[0, 1]","Chalky,Not Pigmented,Wears off easily,Not a Co...",2.0,"Chalky, Not Pigmented, Wears off easily, Not a...",1378252800,"09 4, 2013",Does not show up on dark skinned women,skin
4,A30IP88QK3YUIO,9759091062,Amina Bint Ibraheem,"[0, 0]",I bought this product to get rid of the dark s...,3.0,Its alright,1388102400,"12 27, 2013",I bought this product to get rid of the dark s...,spots


In [14]:
# Remove the 'summary', 'helpful', 'unixReviewTime' and 'reviewTime' columns
# from every split, showing the first rows of each trimmed DataFrame.
columns_to_drop = ['summary','helpful','unixReviewTime','reviewTime']

# Training split
train_data = train_data.drop(columns=columns_to_drop)
display.display(train_data.head())

# Validation split
valid_data = valid_data.drop(columns=columns_to_drop)
display.display(valid_data.head())

# Test split
test_data = test_data.drop(columns=columns_to_drop)
display.display(test_data.head())

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,explanation,feature
0,A3G6XNM240RMWA,7806397051,Karen,The texture of this concealer pallet is fantas...,4.0,great quality,quality
1,A38FVHZTNQ271F,7806397051,Nova Amor,"It was a little smaller than I expected, but t...",3.0,I think it does great coverage for the price I...,coverage
2,A3BTN14HIZET6Z,7806397051,"S. M. Randall ""WildHorseWoman""","I was very happy to get this palette, now I wi...",5.0,I have normal to dry skin as I'm 59 years,skin
3,AWUO9P6PL1SY8,7806397051,TreMagnifique,"Chalky,Not Pigmented,Wears off easily,Not a Co...",2.0,Does not show up on dark skinned women,skin
4,A30IP88QK3YUIO,9759091062,Amina Bint Ibraheem,I bought this product to get rid of the dark s...,3.0,I bought this product to get rid of the dark s...,spots


Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,explanation,feature
0,A1CQWEPJ2GH1I6,B00478VG9U,"Kathy ""Kathy""","But as I've always said, wen products are way ...",4.0,My hair is usually really dry,hair
1,A3EZHMGGWY74HY,B000TUUGAK,MommaMia,"This oil is incredible...a beautiful scent, re...",5.0,long lasting and very high quality,quality
2,A3VT0BJTKYSSTX,B001P1ZC9M,ruthm,"Works great, doesn't weigh my hair down, and s...",5.0,doesn't weigh my hair down,hair
3,A2Q16AHKCUEJJK,B009CS493U,foreveryoung,This is really pretty but it wasnt what i expe...,4.0,This is really pretty but it wasnt what i expe...,wasnt
4,AFAYSRZKWOYO3,B002UU9Q6W,Regan,Reading customer reviews has really made my ch...,5.0,a great product as well,product


Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,explanation,feature
0,A2QKXW3LDQ66P5,B005X2F7KI,stephanie,Absolutely great product. I bought this for m...,5.0,Absolutely great product,product
1,A3R353FUOTJEL2,B005X2F7KI,Tina Osborne,Love the colors. Didn't get any doubles. 1 bot...,5.0,I wouldn't recommend this for painting your fu...,nail
2,A1MQ7QN3S90UJ3,B005X2F7KI,visione26,"Wow, this is the best deal I've seen on nail p...",5.0,this is the best deal I've seen on nail polish...,polish
3,A3R9H6OKZHHRJD,9790794231,LH422,This is a very unique scent! It's deep and mys...,4.0,This is a hard scent to describe,scent
4,A2L3LZCAU4AFZC,B005XCCE4A,Debi Cook,Gorgeous! I wear wigs every day and these hold...,5.0,To get tangles out wash and gently comb with a...,comb


In [None]:
# Rename by column NAME rather than by position: assigning a list to
# `.columns` silently mislabels the data if the column order ever differs.
# rename() ignores names that are absent, so re-running this cell is harmless.
column_renames = {
    'reviewerID': 'userid',
    'asin': 'itemid',
    'reviewerName': 'userName',
    'overall': 'rating',
}
expected_columns = ['userid', 'itemid', 'userName', 'reviewText', 'rating', 'explanation', 'feature']

train_data = train_data.rename(columns=column_renames)
valid_data = valid_data.rename(columns=column_renames)
test_data = test_data.rename(columns=column_renames)

# Fail loudly if any split does not end up with the expected schema.
for split_label, split_frame in (('train', train_data), ('valid', valid_data), ('test', test_data)):
    assert list(split_frame.columns) == expected_columns, (
        f"{split_label} split has unexpected columns: {list(split_frame.columns)}"
    )

# Display the first few rows of the DataFrame after renaming the columns
display.display(train_data.head())
train_data.shape

Unnamed: 0,userid,itemid,userName,reviewText,rating,explanation,feature
0,A3G6XNM240RMWA,7806397051,Karen,The texture of this concealer pallet is fantas...,4.0,great quality,quality
1,A38FVHZTNQ271F,7806397051,Nova Amor,"It was a little smaller than I expected, but t...",3.0,I think it does great coverage for the price I...,coverage
2,A3BTN14HIZET6Z,7806397051,"S. M. Randall ""WildHorseWoman""","I was very happy to get this palette, now I wi...",5.0,I have normal to dry skin as I'm 59 years,skin
3,AWUO9P6PL1SY8,7806397051,TreMagnifique,"Chalky,Not Pigmented,Wears off easily,Not a Co...",2.0,Does not show up on dark skinned women,skin
4,A30IP88QK3YUIO,9759091062,Amina Bint Ibraheem,I bought this product to get rid of the dark s...,3.0,I bought this product to get rid of the dark s...,spots


(106281, 7)

In [17]:
#save the csv files
train_data.to_csv('train_data1.csv', index=False)
valid_data.to_csv('val_data1.csv', index=False)
test_data.to_csv('test_data1.csv', index=False)

In [21]:
import pandas as pd
import json

# Load the user2id / item2id mappings from datamaps.json
datamaps_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/data/beauty/datamaps.json'  # Update the path if needed
with open(datamaps_path, 'r') as f:
    data_maps = json.load(f)

user2id = data_maps['user2id']
item2id = data_maps['item2id']

# Load the CSV file. JSON object keys are always strings, so the ID columns
# must be read as strings too; an all-digit item ID inferred as an integer
# would never match a key and would silently become NaN.
csv_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/test_data1.csv'  # Update the path if needed
df = pd.read_csv(csv_path, dtype={'userid': str, 'itemid': str})

# Translate the raw IDs through the mappings.
mapped_userids = df['userid'].map(user2id)
mapped_itemids = df['itemid'].map(item2id)

# The output overwrites the input file, so refuse to write when any ID could
# not be mapped. This also catches an accidental re-run on an already
# converted file, which would otherwise replace every ID with NaN.
unmapped_users = int(mapped_userids.isna().sum())
unmapped_items = int(mapped_itemids.isna().sum())
if unmapped_users or unmapped_items:
    raise ValueError(
        f"{unmapped_users} userid and {unmapped_items} itemid values in {csv_path} "
        "have no entry in datamaps.json (was this file already converted?)"
    )

df['userid'] = mapped_userids
df['itemid'] = mapped_itemids

# Save the updated CSV
output_path = '/data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/test_data1.csv'  # Update the path if needed
df.to_csv(output_path, index=False)

print(f"Updated CSV saved to {output_path}")


Updated CSV saved to /data2/home/shyamsg/Final_Project/P5-finetuning/P5/notebooks/test_data1.csv


In [None]:
import gzip
import json
import ast
file_path = "/data2/home/shyamsg/Final_Project/P5-finetuning/Data/data/beauty/meta.json.gz"


# Preview the first four lines (indices 0-3) of the gzip-compressed metadata.
# Each line is parsed as a Python literal; a line that cannot be parsed is
# reported and skipped. `data` keeps the most recently parsed record.
with gzip.open(file_path, mode='rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        try:
            data = ast.literal_eval(line)
            print(data)
        except Exception as e:
            print(f"Error parsing line {i}: {e}")
        # Stop once the line with index 3 has been handled.
        if i == 3:
            break

# TODO: add a new column to train_review_df, valid_review_df and
# test_review_df with the values from the meta.json file.



{'asin': '0205616461', 'description': 'As we age, our once youthful, healthy skin succumbs to an enzymatic imbalance that wears away the cellular network, resulting in skin thinning and aging. Combining the best of nature and cosmetic biotechnology, Bio-Active products are formulated with Enzymes that gently exfoliate the skin and stimulate regeneration for a youthful glow. Benefiting from fertile orchards in the Italian countryside, Bio-active formulas are rich in phytohormones, flavonoids and fatty acids from active extracts in Apple and Pear Seeds ,enzymatically modified and developed especially for the care of aging skin. This repairing fluid helps to nourish and firm by accelerating penetration and delivery of active principles to the skin, giving it a more youthful appearance. \n\nAdvanced "Probiotic" Complex from nourishing milk proteins regains the skin\'s natural equilibrium, boosts its immunities and protects it against environmental and biological stress.\n\nPeptides and Cer

In [14]:
# Show the (rows, columns) shape of the DataFrame currently bound to `df`.
# NOTE(review): `df` is assigned in several cells of this notebook, so the
# value shown here depends on which cell ran last - confirm the intended frame.
df.shape

(259204, 3)

In [None]:
# Flatten only the outer structure. max_level=0 keeps nested dicts such as
# `salesRank` as single dict-valued columns; with the default setting,
# json_normalize expands them into `salesRank.<category>` columns and the
# `df['salesRank']` lookup below raises KeyError.
# NOTE(review): `data` must be provided by an earlier cell - confirm it holds
# the metadata record(s) intended here.
df = pd.json_normalize(data, max_level=0)


def _first_sales_rank(entry):
    """Return the first (category, rank) pair of a salesRank dict, else (None, None)."""
    # Non-dict values (e.g. NaN for a missing field) and empty dicts carry no rank.
    if isinstance(entry, dict) and entry:
        return next(iter(entry.items()))
    return (None, None)


# Records without any salesRank information produce (None, None) pairs.
if 'salesRank' in df.columns:
    sales_rank_pairs = [_first_sales_rank(entry) for entry in df['salesRank']]
else:
    sales_rank_pairs = [(None, None)] * len(df)

# Split the pairs into two flat columns.
df['salesRank_category'] = [pair[0] for pair in sales_rank_pairs]
df['salesRank_value'] = [pair[1] for pair in sales_rank_pairs]

# Drop the original nested column now that it has been flattened.
df = df.drop(columns=['salesRank'], errors='ignore')
