In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import tqdm

import numpy as np
import pandas as pd

from dataloader import AllBeautyLoader, AllBeautyLoaderMeta

# Расширение

В этом ноутбуке мы расширим наш датасет пользовательских отзывов двумя новыми переменными:
<ul>
    <li><b>Категория товара</b> - добавим категорию товара, на который был оставлен отзыв</li>
    <li><b>Тональность отзыва</b> - бинарный признак, отвечающий на вопрос «имеет ли текст отзыва положительную окраску?»</li>
</ul>

# ✅Начнем с создания исходного датасета

Сначала создадим и сохраним исходный датасет с отзывами, который будем использовать дальше.

In [2]:
# Configure input and output locations; both are spelled relative to the project root.
main_path = '/home/roman/Документы/AmazonRecomendationSystem'
input_path = f'{main_path}/data/raw/all_beauty/All_Beauty.jsonl'
output_path = f'{main_path}/data/interim/'

In [3]:
# Build the review loader with an explicit sample size and a fixed random seed.
# The seed was chosen by looking at how ratings and the purchase flag are
# distributed across the resulting sample.
loader = AllBeautyLoader(
    input_path,
    sample_size=10000,
    seed=100,
)

# Draw the sample of reviews.
sample_df = loader.get_sample()

# Ask the loader to write the sample into the interim folder in 'csv' format.
loader.save_sample(output_path, 'csv')

Загрузка данных: 100%|████████████████| 10000/10000 [00:00<00:00, 148885.18it/s]

Данные уже существуют





In [4]:
# Load the original sample back from the interim folder.
sample_csv_path = os.path.join(output_path, 'sample_size_10000_seed_100_used_True.csv')
df = pd.read_csv(sample_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,56088,5.0,They are cute,They are very beautiful but wouldn't recommend...,B01H6QBHYI,B01H6QBHYI,AHFYD2BAJG7VV76FMGOPGPWAXN4Q,1483996385000,0,True
1,210755,1.0,None of the bundles weighted 100 grams,It sheds like crazy. None of the bundles weigh...,B08G8JRG6Z,B08G8JRG6Z,AHMPDNXTLQJXHSMDKAP6U3WF347Q,1621127781266,0,True
2,65615,1.0,One Star,Melts as one sweats<br />Didn't work for me,B00FE90M0K,B09W66MSPX,AFTDUFBO6E4WQ4FTJY2RXEQFA5NQ,1461125448000,1,True
3,212912,5.0,Great Product,"Amazing product, just a small amount will make...",B01BLQ5D1C,B01BLQ5D1C,AGYGXYL3FZI4YPQRVXPLJH5NJWYA,1492783897000,0,False
4,344414,5.0,Love it!,Two heat settings... Keeps my lashes curled al...,B0BFWBKRSG,B0C36NBBH8,AFUOQ6GPUNX24X5ZFHYVZWO434MA,1673199516626,2,True


In [5]:
# Keep only the variables we are interested in.
keep = ['user_id', 'asin', 'text', 'rating', 'helpful_vote', 'verified_purchase']

df = df.loc[:, keep]
df.head()

Unnamed: 0,user_id,asin,text,rating,helpful_vote,verified_purchase
0,AHFYD2BAJG7VV76FMGOPGPWAXN4Q,B01H6QBHYI,They are very beautiful but wouldn't recommend...,5.0,0,True
1,AHMPDNXTLQJXHSMDKAP6U3WF347Q,B08G8JRG6Z,It sheds like crazy. None of the bundles weigh...,1.0,0,True
2,AFTDUFBO6E4WQ4FTJY2RXEQFA5NQ,B00FE90M0K,Melts as one sweats<br />Didn't work for me,1.0,1,True
3,AGYGXYL3FZI4YPQRVXPLJH5NJWYA,B01BLQ5D1C,"Amazing product, just a small amount will make...",5.0,0,False
4,AFUOQ6GPUNX24X5ZFHYVZWO434MA,B0BFWBKRSG,Two heat settings... Keeps my lashes curled al...,5.0,2,True


In [6]:
# Count non-null values per column for every product id (asin);
# the user_id column of the result is shown as the per-product count.
grouped = df.groupby('asin').count()
grouped.loc[:, ['user_id']]

Unnamed: 0_level_0,user_id
asin,Unnamed: 1_level_1
069267599X,2
9533223618,1
9788077587,1
B000050FDB,1
B000050FDE,4
...,...
B0C2CLK5XS,1
B0C2Z7K2PH,1
B0C3L8XXWQ,1
B0C6SW8K9Q,1


In [7]:
# Summarise the per-product count table, including its deep memory footprint.
grouped.loc[:, ['user_id']].info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 7479 entries, 069267599X to B0CB2Y66H2
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  7479 non-null   int64
dtypes: int64(1)
memory usage: 547.8 KB


# ✅ Добавим информацию о товаре

In [8]:
# Re-point the input at the product-metadata dump; the output folder is unchanged.
input_path = (
    '/home/roman/Документы/AmazonRecomendationSystem'
    '/data/raw/all_beauty/meta_All_Beauty.jsonl'
)
output_path = (
    '/home/roman/Документы/AmazonRecomendationSystem'
    '/data/interim/'
)

In [9]:
# Collect the index labels of the grouped table (the asin values) into a plain list.
ids = [asin for asin in grouped[['user_id']].index.values]

In [10]:
# Build the metadata loader for the product dump and read everything it returns.
loader = AllBeautyLoaderMeta(input_path)
sample_df = loader.get_all()

# Rename the columns positionally.
# NOTE(review): this assumes get_all() yields exactly these five columns in this
# order, with the product id last - confirm against the loader.
sample_df.columns = [
    'title',
    'average_rating',
    'rating_number',
    'store',
    'asin',
]

# The metadata table is intentionally not saved here (no save_sample call).

Загрузка данных: 100%|███████████████| 112590/112590 [00:01<00:00, 72762.99it/s]


In [11]:
# Preview the first rows of the product-metadata table.
sample_df.head(n=5)

Unnamed: 0,title,average_rating,rating_number,store,asin
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",4.8,10,Howard Products,B01CUPMQZE
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,4.5,3,Yes To,B076WQZGPM
2,Eye Patch Black Adult with Tie Band (6 Per Pack),4.4,26,Levine Health Products,B000B658RI
3,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",3.1,102,Cherioll,B088FKY3VD
4,Precision Plunger Bars for Cartridge Grips – 9...,4.3,7,Precision,B07NGFDN6G


In [12]:
# Attach product metadata to every review that has a matching asin (inner join).
# NOTE(review): sample_df is rebound to the merged result, so re-running this cell
# without re-running the metadata cell would merge against the already-merged frame.
# NOTE(review): the shape printed further down (9134 rows) is smaller than the
# 10000-row sample, i.e. some reviews find no metadata match - confirm the join key.
sample_df = pd.merge(left=df, right=sample_df, on='asin', how='inner')

In [13]:
# Preview the reviews joined with their product metadata.
sample_df.head(n=5)

Unnamed: 0,user_id,asin,text,rating,helpful_vote,verified_purchase,title,average_rating,rating_number,store
0,AHFYD2BAJG7VV76FMGOPGPWAXN4Q,B01H6QBHYI,They are very beautiful but wouldn't recommend...,5.0,0,True,eBoot 40 Pack Wedding Bridal Pearl Flower Crys...,4.6,1998,EBOOT
1,AHMPDNXTLQJXHSMDKAP6U3WF347Q,B08G8JRG6Z,It sheds like crazy. None of the bundles weigh...,1.0,0,True,613 Blonde Bundles Human Hair 9A Brazilian Bod...,2.9,2,Guanyuwigs
2,AGYGXYL3FZI4YPQRVXPLJH5NJWYA,B01BLQ5D1C,"Amazing product, just a small amount will make...",5.0,0,False,"DHC Deep Cleansing Oil 6.7 fl. oz., includes 1...",4.4,139,
3,AHZLBGLKFCOWPPTGE3NXC6TBAWPA,B07SXTWWM5,Smells fresh,5.0,2,True,"Inspired by Creed Silver Mountain Water, 1.7oz...",4.0,317,Fragrance Club Genealogy Collection
4,AHZSYERCKEQVIQYJT6UNVUT56SHQ,B07Q7FXWCY,I love itttt!!!! Will order some more again!,5.0,0,True,"False Eyelashes 3D Lashes Pack, Fur Long Lashe...",4.2,936,ALICROWN HAIR


In [13]:
# Size of the merged table as (rows, columns).
sample_df.shape

(9134, 10)

In [14]:
# Save the intermediate results: the full merged table and its product-only columns.
# NOTE(review): to_csv is called without index=False, so the row index is written as
# an unnamed first column; the later read of reviews_with_goods.csv drops 'Unnamed: 0'.
reviews_with_goods_csv = os.path.join(output_path, 'reviews_with_goods.csv')
goods_for_reviews_csv = os.path.join(output_path, 'goods_for_reviews.csv')
goods_columns = ['title', 'average_rating', 'rating_number', 'store', 'asin']

sample_df.to_csv(reviews_with_goods_csv)
sample_df[goods_columns].to_csv(goods_for_reviews_csv)

# ✅Добавим тональность

In [8]:
# Reload the merged reviews/goods table and the two tonality files.
reviews_with_goods_csv = os.path.join(output_path, 'reviews_with_goods.csv')
tonality_csv = os.path.join(main_path, 'data/processed/reviews_tonality_dataset.csv')
tonality_idx_csv = os.path.join(main_path, 'data/processed/reviews_idx.csv')

sample_df = pd.read_csv(reviews_with_goods_csv)
tonality = pd.read_csv(tonality_csv)
tonality_idx = pd.read_csv(tonality_idx_csv)

# Place the two tonality files side by side, column-wise.
# NOTE(review): this relies on both files having their rows in the same order - confirm.
tonality = pd.concat([tonality, tonality_idx], axis=1)

# Drop the text-like columns and the saved index column, then inner-join on user_id.
# NOTE(review): joining on user_id alone would duplicate rows for any user who
# appears more than once on either side - confirm this key is unique enough.
drop_cols = ['text', 'title', 'Unnamed: 0', 'store']
extended_tonality_final = pd.merge(
    sample_df.drop(columns=drop_cols),
    tonality.drop(columns=['Unnamed: 0']),
    how='inner',
    on=['user_id'],
)

# Report the resulting size and whether any tonality value is missing.
has_missing_tonality = extended_tonality_final['tonality'].isna().any().any()
print(extended_tonality_final.shape, has_missing_tonality)
extended_tonality_final.head()

(8602, 776) False


Unnamed: 0,user_id,asin,rating,helpful_vote,verified_purchase,average_rating,rating_number,embeded_feature_0,embeded_feature_1,embeded_feature_2,...,embeded_feature_759,embeded_feature_760,embeded_feature_761,embeded_feature_762,embeded_feature_763,embeded_feature_764,embeded_feature_765,embeded_feature_766,embeded_feature_767,tonality
0,AHFYD2BAJG7VV76FMGOPGPWAXN4Q,B01H6QBHYI,5.0,0,True,4.6,1998,0.122873,-0.061664,0.236979,...,-0.513983,0.120056,-0.237146,0.258786,0.031943,-0.160633,-0.219851,0.33339,0.201015,1
1,AHMPDNXTLQJXHSMDKAP6U3WF347Q,B08G8JRG6Z,1.0,0,True,2.9,2,0.004259,-0.138209,-0.00772,...,-0.206801,0.188669,-0.025847,0.131326,0.07737,-0.192199,-0.047045,0.194727,0.207501,0
2,AHZLBGLKFCOWPPTGE3NXC6TBAWPA,B07SXTWWM5,5.0,2,True,4.0,317,-0.109525,-0.063282,0.077964,...,-0.192878,0.026932,0.130309,0.33788,-0.057307,-0.111306,-0.198737,0.24259,0.247236,1
3,AHZSYERCKEQVIQYJT6UNVUT56SHQ,B07Q7FXWCY,5.0,0,True,4.2,936,0.006592,0.034425,0.258932,...,-0.273252,0.060454,-0.054878,0.080458,0.09832,-0.121203,0.021025,0.267021,0.261116,1
4,AEQXS33NZQHYQHLFOLB7VYKYADHA,B07FYQKMYS,5.0,2,True,3.9,106,0.148196,-0.118884,0.02132,...,-0.442889,0.076545,-0.146909,0.301168,0.103775,-0.078422,-0.138381,0.410158,0.095662,1


In [9]:
# Save the final table into the interim folder.
reviews_final_csv = os.path.join(output_path, 'reviews_final.csv')
extended_tonality_final.to_csv(reviews_final_csv)