# [Amazon Reviews]

Process amazon reviews datasets

# Setup

## Library import
We import all the required Python libraries

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import os,sys
import joblib
from os import path

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Common things
# from sklearn.metrics import classification_report
# from scipy import stats

# Visualizations
# import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set_style("white") # darkgrid, whitegrid, dark, white, and ticks
# plt.figure(figsize=(7, 7))

# Autoreload extension
# if 'autoreload' not in get_ipython().extension_manager.loaded:
#     %load_ext autoreload
    
# %autoreload 2

In [4]:
# Examples seaborn
# with sns.axes_style("whitegrid"):
#     fig, axis = plt.subplots(1, 2, figsize=(20, 5), sharey=True)
#     fig.suptitle(f'Position distribution on splits')
#     sns.boxplot(ax=axis[0], data=df_prep, y='event', x='ith_pos', order=event_label_map.values())
#     sns.boxplot(ax=axis[1], data=pd.read_pickle(path.join(DATA_PATH, "stage2_test.pkl") ), y='event', x='ith_pos', order=event_label_map.values())

## Local library import
We import all the required local libraries libraries

In [5]:
# Include local library paths
import sys
# sys.path.append('path/to/local/lib') # uncomment and fill to import local libraries
# add project folders so local libraries can be imported
sys.path.insert(1, os.path.join(os.getcwd()  , '..'))
sys.path.insert(1, os.path.join(os.getcwd()  , '../src'))

# Import local libraries
# from plibs.utils import corrstats
# from src.plibs.utils import plots as myplots

In [6]:
# notebook misc functions
def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

def displayAll(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):	
        display(df)  

# import textwrap
import pprint
def wrap_print(txt):
    # print(textwrap.wrap(txt))
    print(pprint.pprint(txt, width=160))

# Parameter and functions definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [7]:
from tqdm import tqdm
from time import sleep
import traceback
import requests
from scipy import stats

In [8]:
def truncate_txt(txt, max_seq):
    ws = txt.split()
    t = ' '.join( ws[:max_seq]) if len(ws) > max_seq else txt
    return t

def hf_infer(query, txt, max_seq=480):
    try:

        output = query({
            # "inputs": [truncate_txt(t, max_seq) for t in txt] if isinstance(txt, list) else truncate_txt(txt, max_seq) ,
            "inputs": txt ,
            "parameters": {"truncation": True}
            })
        return output
    except Exception as err:
        print(f"Problem--> {err}")
        traceback.print_exc() 
    pass

def hf_infer_batch(df:pd.DataFrame, query, output_col_name, max_seq=480, chunks=None):
    chunks = chunks if chunks else max(int(len(df)/10), 1)
    rs_df = []
    for dftmp in tqdm(np.array_split(df, chunks), total=chunks):        
        try:
            dftmp[output_col_name] = hf_infer(query, txt=dftmp.reviewText.values.tolist(), max_seq=max_seq)
        except Exception as err:
            print(f"Problem--> {err}")
            # sleep(0.5)
            pass
        rs_df.append(dftmp)
    
    return rs_df


# Data import and pre-processing
Read Amazon reviews, combine meta-info and generate combined file

In [6]:
import json
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')


In [7]:
df = getDF(path.join("../data/data/amazon_reviews_v2/", 'Magazine_Subscriptions_5.json.gz'))
print(df.shape)
df.head()

(2375, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,4.0,True,"02 26, 2014",A5QQOOZJOVPSF,B00005N7P0,John L. Mehlmauer,"I'm old, and so is my computer. Any advice th...",Cheapskates guide,1393372800,,,
1,5.0,False,"03 6, 2004",A5RHZE7B8SV5Q,B00005N7PS,gorillazfan249,"There's nothing to say, but if you want a REAL...",The best mature Men's magazine.,1078531200,3.0,,
2,1.0,False,"07 15, 2003",A1RPTVW5VEOSI,B00005N7PS,Michael J. Edelman,If you're the kind of man who looks at himself...,THE Magazine for the Self-Centered Male,1058227200,17.0,,
3,1.0,True,"01 31, 2015",A1SFRBCMW8XVBW,B00005N7PS,Hoyett L. Barnett,Nothing to it. Just an advertisement. Little...,Nothing to it. Just an advertisement. Little a...,1422662400,,,
4,5.0,True,"10 5, 2010",A1IU9VPCBKZPE8,B00005N7P0,Randolph Eck,When PC Magazine ceased publication of their p...,Excellent Computer Magazine,1286236800,2.0,,


In [11]:
df_meta = getDF(path.join("../data/data/amazon_reviews_v2/", 'meta_Magazine_Subscriptions.json.gz'))
print(df_meta.shape)
df_meta.head()

(3385, 19)


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,"[Magazine Subscriptions, Professional & Educat...",,[REASON is edited for people interested in eco...,,"<span class=""a-size-medium a-color-secondary""","[B002PXVYLE, B01MCU84LB, B000UHI2LW, B01AKS14A...",,Reason Magazine,[],[],"[B002PXVYLE, B000UHI2LW, B01MCU84LB, B002PXW18...","{'Format:': 'Print Magazine', 'Shipping: ': 'C...",Magazine Subscriptions,,,,B00005N7NQ,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,"[Magazine Subscriptions, Arts, Music &amp; Pho...",,[Written by and for musicians. Covers a variet...,,"<span class=""a-size-medium a-color-secondary""","[B002PXVYGE, B0054LRNC8, B000BVEELE, B00006KC3...",,String Letter Publishers,[],742 in Magazine Subscriptions (,"[B002PXVYGE, B0054LRNC8, B00006L16A, 171906487...","{'Format:': 'Print Magazine', 'Shipping: ': 'C...",Magazine Subscriptions,,,,B00005N7OC,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
2,"[Magazine Subscriptions, Fashion &amp; Style, ...",,[Allure is the beauty expert. Every issue is f...,,"<span class=""a-size-medium a-color-secondary""","[B001THPA4O, B002PXVZWW, B001THPA1M, B001THPA1...",,Conde Nast Publications,[],[],"[B002PXVZWW, B001THPA4O, B001THPA1M, B01N819UD...","{'Format:': 'Print Magazine', 'Shipping: ': 'C...",Magazine Subscriptions,,,,B00005N7OD,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
3,"[Magazine Subscriptions, Sports, Recreation & ...",,[FLIGHT JOURNAL includes articles on aviation ...,,"<span class=""a-size-medium a-color-secondary""","[B07JVF7QW4, B00ATQ6FPY, B002G551F6, B00008CGW...",,AirAge Publishing,[],[],"[B002G551F6, B00ATQ6FPY, B00005N7PT, B001THPA2...","{'Format:': 'Print Magazine', 'Shipping: ': 'C...",Magazine Subscriptions,,,,B00005N7O9,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,"[Magazine Subscriptions, Professional & Educat...",,[RIDER is published for the road and street ri...,,"<span class=""a-size-medium a-color-secondary""","[B002PXVYD2, B01BM7TOU6, B000060MKJ, B000BNNIG...",,EPG Media & Specialty Information,[],[],"[B01BM7TOU6, B000060MKJ, B002PXVYD2, B000BNNIG...","{'Format:': 'Print Magazine', 'Shipping: ': 'C...",Magazine Subscriptions,,,,B00005N7O6,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [10]:
displayAll( df.sample(5)[["asin", "summary", "reviewText"]] )

Unnamed: 0,asin,summary,reviewText
1844,B000IOE9Y6,Love the Southern Living on my Kindle!,I love reading Southern Living on my Kindle. I have not had a problem downloading issues as other people have stated.
796,B00005NIPP,Maximum Female Fun for Young Men,"Maxim is a monthly publication aimed at the younger male population, aged 16 to 30. This magazine offers a few amusing tidbits each month, like jokes and unusual ways to make things. There is an occasional fashion article, and some monthly articles on electronic gadgets and music. But the primary reason why people buy Maxim and the main reason it sells as well as it does is the women.\nMaxim is not really an adult magazine, like Playboy or Penthouse. Its photographs do not expose so much that they should garner an X rating. But many of them come very close. From the picture on the cover (always a young woman, often with a seductive look in her eyes and much flesh exposed) to the photos in the middle, Maxim uses sex to sell. It isn't strictly a photo- op magazine for young women. There are interviews to accompany the visual effects. But the small talk is almost always geared toward sex- how to get it, how to keep getting it for a long time, how to improve your chances to get it, how to help friends get it, how to turn women on, etc.\nThe photos in Maxim are enticing, and I don't think any red- blooded American male would deny this fact. But what hurts Maxim a little bit and what keeps it from gaining as much respect as it could have is the monthly dose of stupidity that comes with each issue. Much of what you find in the ""How to"" section, for example, is idiotic and I cannot imagine anyone taking it seriously. The ""Bar Exam"" feature is also bordering on stupidity, with a set of trivia questions each month about some type of useless information.\nOverall, I think Maxim is at least an average magazine, in spite of its flaws. Yes, many of the articles are brainless and the humor is the type that would amuse a 12- year old. But the main reason people buy and read (or should I say look) at Maxim is the photographs of the women. With so much flesh exposed, without going to far, Maxim is a publication with a purely visual bend. You won't find much intelligence here, but you will find plenty of photos of scantily clad women, tempting your hormones with their youthful good looks and uninhibited personalities."
1722,B000ILY9LW,Super Easy Magazine Purchase,"This is a magazine I purchased as a present for a coworker and my mother-in-law. Recipes, home decor and health information; pretty much covers all the topics they love to read about in one magazine. Purchasing process was easy and the price was great. Plan to do it again next year."
15,B00005N7Q1,Four Stars,"I love the magazine, but the sale adds are starting to take up half the magazine."
1021,B00005R8BR,Best price through Amazon,Love this magazine


In [15]:
displayAll( df_meta.sample(5)[["category", "description", "title", "brand"]] )

Unnamed: 0,category,description,title,brand
1257,"[Magazine Subscriptions, Home & Garden, Design & Decoration]","[One of America?s most beautiful magazines featuring tours of exquisite homes and gardens, decorating and renovation ideas, entertaining and collecting.]","<span class=""a-size-medium a-color-secondary""",Meredith
2445,"[Magazine Subscriptions, Business & Investing, International]","[Provides timely and creative analyses of critical world issues such as arms control, military strategy, nuclear weapons proliferation, foreign defense policies, satellite warfare, and the peace movement.]","<span class=""a-size-medium a-color-secondary""",Mass Inst of Technology Press
376,"[Magazine Subscriptions, Professional & Educational Journals, Professional & Trade, Humanities & Social Sciences, Economics & Economic Theory]","[A trade magazine covering the business side of bowhunting and archery including trade shows, industry statistics, marketing and product trends, new products, tips for better business management, and effective retailing.]","<span class=""a-size-medium a-color-secondary""",Grand View Media Group
2704,[],"[Bon App&#xE9;tit is America's #1 food and entertaining magazine. You'll enjoy great menus, cozy dinners, great advice and much more! Each issue is filled with delicious time-saving recipes, easy and elegant entertaining ideas, world class restaurant dishes made simple, and topped off by wine reviews and recommendations., <a href=""/gp/help/customer/display.html/ref=hp_left_ac?ie=UTF8&nodeId=201293140"" target=""_blank"">Learn more</a> about auto-renewal subscriptions on Amazon.com]","<span class=""a-size-medium a-color-secondary""",Cond Nast
188,"[Magazine Subscriptions, Travel, City &amp; Regional, United States, Northeast]","[SPOTLIGHT MAGAZINE is a general interest publication for residents of Westchester and Rockland counties of New York, Northern New Jersey, Southern Connecticut, Manhattan, Long Island and the Hudson Valley. It focuses on the current issues, the innovative trends, the big people on the scene, what to do and where to go in the tri-state area, restaurants, theater and movie reviews, public television listings, new store openings, political coverage, sports, issues commentary, celebrity profiles, health updates and special features for those seeking the best in fine food, clothing, furniture and homes ., , ]","<span class=""a-size-medium a-color-secondary""",Today Media


In [109]:
dftmp = df_meta[["asin", "category", "description"]][(df_meta.description.str[0].str.len() > 50) & (df_meta.description.str[0].str.len() < 500)].sample(1000)
dftmp.description = dftmp.description.map(lambda v: v[0])
dftmp = dftmp.set_index("asin")
# displayAll(dftmp)
display(dftmp)
# df_meta.sample(5).description.str[0]

Unnamed: 0_level_0,category,description
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B00007M3M1,"[Magazine Subscriptions, Professional & Educat...",Published for horsemen who are interested in t...
B005DNA3LU,"[Magazine Subscriptions, Science, History & Na...",America's #1 bird and garden magazine brings b...
B00REZ6242,"[Magazine Subscriptions, News & Political Comm...",THE WEEK provides its readers with a comprehen...
B00PC76P6Y,"[Magazine Subscriptions, Cooking, Food & Wine,...",Dr. Oz The Good Life is a new lifestyle magazi...
B00005NSJZ,"[Magazine Subscriptions, Sports, Recreation & ...","Dedicated to America's West Coast boating, cru..."
...,...,...
B001DTVM3G,"[Magazine Subscriptions, Travel, City & Region...",This is a leading Detroit business publication...
B00006KOUM,"[Magazine Subscriptions, Literary, Sci-Fi & My...","Mystery Scene offers lively, expert coverage o..."
B00007B2D2,"[Magazine Subscriptions, Literary, Sci-Fi & My...",True stories about women. Also information on ...
B00005N7QN,"[Magazine Subscriptions, Fashion & Style, Women]","Harper's BAZAAR, the fashion authority, brings..."


In [134]:
#  dftmp.index
print(df.shape)
dftmp2 = df[(df.asin.isin(dftmp.index) & (df.reviewText.str.len() > 50))]
dftmp2 = dftmp2.set_index("asin")
dftmp2 = dftmp2[["overall", "summary", "reviewText"]]
dftmp2

(2375, 12)


Unnamed: 0_level_0,overall,summary,reviewText
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th..."
B00005N7PS,5.0,The best mature Men's magazine.,"There's nothing to say, but if you want a REAL..."
B00005N7PS,1.0,THE Magazine for the Self-Centered Male,If you're the kind of man who looks at himself...
B00005N7PS,1.0,Nothing to it. Just an advertisement. Little a...,Nothing to it. Just an advertisement. Little...
B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...
...,...,...,...
B000IJ7RQ8,5.0,Favorite magazine for entertainment coverage,This is a gift every year for my mom's birthda...
B000IJ7RQ8,2.0,"This used to be my favorite magazine, but it h...","This used to be my favorite magazine, but it h..."
B005DNB4SG,3.0,Three Stars,not my favorite magazine. Will not reorder nex...
B00X6LREH2,5.0,Ok magazine. a lot of the reciepes use ingredi...,Ok magazine.a lot of the reciepes use ingredie...


In [149]:
# display(dftmp2[["overall", "summary", "reviewText"]].sample(10))
print(dftmp.shape)
print(dftmp2.shape)
# dftmp2[["overall", "summary", "reviewText"]].join(dftmp)
# dftmp3 = pd.merge(dftmp, dftmp2, left_index=True, right_index=True)
dftmp3 = dftmp2.join(dftmp)
print(dftmp3.shape)
dftmp3.head()

(1000, 2)
(808, 3)
(898, 5)


Unnamed: 0_level_0,overall,summary,reviewText,category,description
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","[Magazine Subscriptions, Technology, Computers...",MAXIMUM PC is the ultimate upgrade for the sav...
B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","[Magazine Subscriptions, Technology, Computers...",MAXIMUM PC is the ultimate upgrade for the sav...
B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"[Magazine Subscriptions, Technology, Computers...",MAXIMUM PC is the ultimate upgrade for the sav...
B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"[Magazine Subscriptions, Technology, Computers...",MAXIMUM PC is the ultimate upgrade for the sav...
B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"[Magazine Subscriptions, Technology, Computers...",MAXIMUM PC is the ultimate upgrade for the sav...


In [151]:
dftmp3.to_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions.tsv", sep="\t")

In [154]:
displayAll(dftmp3.sample(10))

Unnamed: 0_level_0,overall,summary,reviewText,category,description
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B0001MS2D4,5.0,COUNTRY COOL!!!,"I had bought Country Sampler off the newsstand for years and now I can LOVE,LOVE it on my Kindle. I like looking at the new items created and the lovely articles on rooms and homes. I have got some great ideas though out the years. Try it I know you will love it!!!","[Magazine Subscriptions, Professional & Educational Journals, Professional & Trade, Arts, Decorative Arts]",Our unbeatable combination of country-lifestyle articles and a complete catalog of decorating products provide all the tips and tools you need to make your house a country home!
B00007BK3L,5.0,Time,Awesome magazine to learn lots about what's going on in America and other countries. I'm glad I purchased as a gift. They love it!!,"[Magazine Subscriptions, News & Political Commentary, Ideas & Commentary]","TIME reveals what today's headlines mean to you and your family -- from politics, to science, to human achievement, arts, business, and society."
B00005UMOT,5.0,Favorite.,Had a subscription for years. It's one of my favorites. Full of inspiring photos.,"[Magazine Subscriptions, Home & Garden, Design & Decoration]","One of America?s most beautiful magazines featuring tours of exquisite homes and gardens, decorating and renovation ideas, entertaining and collecting."
B00007AX0N,3.0,It used to be better,"I got ESPN the Mag in high school and it was great. Now that they try to put a theme to every issue, it gets kind of annoying. I probably won't be renewing my subscription. However, it is good bathroom or break room material.","[Magazine Subscriptions, Sports, Recreation &amp; Outdoors, Sports &amp; Leisure]","The Magazine for the NEXT generation of sports fans with emphasis on the personality, lifestyle &amp; off--the-field activities of today's newsworthy &amp; up-and-coming athletes. All delivered with insights, humor, cutting edge design and in-your-face photography."
B00005N7QG,5.0,Good magazine,"I don't take this anymore, I've cut way back on my selections to just the decorator magazines. Nothing wrong with Good Housekeeping, just don't have time to read all these magazines so I just take a few now.","[Magazine Subscriptions, Cooking, Food & Wine, Recipes & Techniques]","Good Housekeeping magazine, together with the Good Housekeeping Institute and the Good Housekeeping Seal, is an American icon of consumer protection and quality assurance. Every issue delivers a unique mix of independent investigation and trusted reporting, along with inspirational and personal stories. The magazine's rich tradition embodies a commitment to the modern home and to a woman's quality of life."
B00005N7PT,1.0,Used to be great. Now it's just hearsay pop science crap.,"It's just disappointing where this magazine has gone. Last month, I had the ""pleasure"" of reading a bombastic self-description of how a surgeon saved a patient's airway from his anesthesiologist. The article was so narcissistic and the story was so improbable for anybody who works in an operating room, that I'm 100% sure it was never fact-checked, not to speak about running it by an independent expert.\n\nSince then, I don't trust what I read there, and the magazine is too glossy for toilet paper, so I'll just cancel the subscription.","[Magazine Subscriptions, Science, History &amp; Nature, Nature &amp; Ecology]","Discover magazine provides a comprehensive look into the latest news in the world of science and the effect it has on our everyday lives. Sure to satisfy the curiosity of your most inquisitive customers, Discover is a must-have magazine for your waiting room."
B00005N7PS,4.0,In the Details.,"I enjoyed Details Magazine, similar to GQ but skewed to a younger audience.","[Magazine Subscriptions, Fashion & Style, Men]","Details sets the trends that get people talking...breaking the stories that keep you in the know, ahead of the crowd, and at the forefront of the hottest fashion, celebrities, movies, music, ideas, technology and issues of the day long before everyone else!"
B00005NIPH,4.0,Best for the hobbyist or starting writer,"If you are making your living by writing you probably won't use this magazine as much as a hobbyist or a beginning writer.\n\nThis magazine makes a nice gift subscription for friends or family who enjoy writing for a hobby. Each issue comes with 3-4 large features or how-to articles. These articles focus on different aspects of the writing life, including development, marketing and interviews with published authors, agents and editors of publishing houses.\n\nAs other reviewers mentioned, most of the writing topics in this magazine and similar resources can be found online. That doesn't mean you should not buy a subscription as a gift.\n\nThe magazine is perfect for an older person who is not comfortable with computers or someone who doesn't have access to a high-speed Internet service. The monthly assortment of articles helps to encourage and stimulate a beginning writer. There are gems for everyone that can be found in a one-year subscription to Writer's Digest.","[Magazine Subscriptions, Professional & Educational Journals, Professional & Trade, Arts]","For the writer at heart. Each issue focuses on the craft of writing, the tools and information for writing, and the markets for writing. Features examine how to write and sell magazine and newspaper articles, books, plays, poetry and scripts."
B00007AWXX,5.0,"Great price, great magazine.","I was getting very tired of my old ""Glamour"" magazine so I figured for this price why not try a new fashion magazine. It is great. I am enjoying not seeing all the same old articles and reading something new.","[Magazine Subscriptions, Fashion & Style, Women]","Elle is the world's largest fashion magazine edited for woman with a style - and a mind - of her own. Features include lifestyle, culture, entertainment, politics, music, theater and the arts."
B00007IJZT,4.0,Great Fashion Police,I previously received this magazine on a trial offer for only 4 months. I knew I wouldn't want to reorder - WRONG. I found myself missing the makeup secrets and fashion ideas. Great little magazine.,"[Magazine Subscriptions, Fashion & Style, Women]","InStyle celebrates the private side of public faces, revealing the personal style choices of the world's most fascinating people. An insider's guide to trends in beauty, fashion, home entertaining and charities, InStyle is a trusted source for inspiration and ideas."


# Adding meta-features

In [7]:
df = pd.read_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions.tsv", sep="\t")
print(df.shape)
df.head()

(898, 6)


Unnamed: 0,asin,overall,summary,reviewText,category,description
0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...
1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...
2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...
3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...
4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...


## Extract sentiment

In [173]:
import requests

API_URL = "https://api-inference.huggingface.co/models/lxyuan/distilbert-base-multilingual-cased-sentiments-student"
headers = {"Authorization": "Bearer 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"} #owlmx
# headers = {"Authorization": "Bearer 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"} #idsia-nlp

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	

In [176]:
hf_infer(query, exs[0])

[[{'label': 'neutral', 'score': 0.3533730208873749},
  {'label': 'negative', 'score': 0.3360978066921234},
  {'label': 'positive', 'score': 0.3105291426181793}]]

In [135]:
rs_df = hf_infer_batch(df, query, output_col_name="sentiment", max_seq=400, chunks=180)
print(f"{len(rs_df)}")

 22%|██▏       | 39/180 [00:23<01:25,  1.64it/s]

Problem--> Length of values (2) does not match length of index (5)


 29%|██▉       | 53/180 [00:31<01:06,  1.91it/s]

Problem--> Length of values (2) does not match length of index (5)


 34%|███▍      | 62/180 [00:36<01:04,  1.82it/s]

Problem--> Length of values (2) does not match length of index (5)


 37%|███▋      | 66/180 [00:39<01:09,  1.64it/s]

Problem--> Length of values (2) does not match length of index (5)


 43%|████▎     | 78/180 [00:47<01:07,  1.51it/s]

Problem--> Length of values (2) does not match length of index (5)


 46%|████▌     | 82/180 [00:50<01:03,  1.55it/s]

Problem--> Length of values (2) does not match length of index (5)


 47%|████▋     | 84/180 [00:51<01:10,  1.36it/s]

Problem--> Length of values (2) does not match length of index (5)


 48%|████▊     | 87/180 [00:54<01:07,  1.37it/s]

Problem--> Length of values (2) does not match length of index (5)


 51%|█████     | 92/180 [00:58<01:03,  1.39it/s]

Problem--> Length of values (2) does not match length of index (5)


 64%|██████▍   | 116/180 [01:13<00:40,  1.58it/s]

Problem--> Length of values (1) does not match length of index (5)


 65%|██████▌   | 117/180 [01:14<00:44,  1.43it/s]

Problem--> Length of values (1) does not match length of index (5)


 66%|██████▌   | 118/180 [01:15<00:46,  1.34it/s]

Problem--> Length of values (1) does not match length of index (5)


 66%|██████▌   | 119/180 [01:16<00:46,  1.31it/s]

Problem--> Length of values (1) does not match length of index (5)


 67%|██████▋   | 120/180 [01:17<00:47,  1.26it/s]

Problem--> Length of values (1) does not match length of index (5)


 67%|██████▋   | 121/180 [01:18<00:47,  1.23it/s]

Problem--> Length of values (1) does not match length of index (5)


 68%|██████▊   | 122/180 [01:18<00:47,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 68%|██████▊   | 123/180 [01:19<00:47,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 69%|██████▉   | 124/180 [01:20<00:46,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 69%|██████▉   | 125/180 [01:21<00:46,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 70%|███████   | 126/180 [01:22<00:44,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 71%|███████   | 127/180 [01:23<00:44,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 71%|███████   | 128/180 [01:23<00:43,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 72%|███████▏  | 129/180 [01:24<00:42,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 72%|███████▏  | 130/180 [01:25<00:41,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 73%|███████▎  | 131/180 [01:26<00:40,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 73%|███████▎  | 132/180 [01:27<00:40,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 74%|███████▍  | 133/180 [01:28<00:39,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 74%|███████▍  | 134/180 [01:28<00:38,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 75%|███████▌  | 135/180 [01:29<00:37,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 76%|███████▌  | 136/180 [01:30<00:36,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 76%|███████▌  | 137/180 [01:31<00:36,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 77%|███████▋  | 138/180 [01:32<00:35,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 77%|███████▋  | 139/180 [01:33<00:34,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 78%|███████▊  | 140/180 [01:33<00:33,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 78%|███████▊  | 141/180 [01:34<00:32,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 79%|███████▉  | 142/180 [01:35<00:31,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 79%|███████▉  | 143/180 [01:36<00:30,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 80%|████████  | 144/180 [01:37<00:29,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 81%|████████  | 145/180 [01:38<00:28,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 81%|████████  | 146/180 [01:38<00:27,  1.22it/s]

Problem--> Length of values (1) does not match length of index (5)


 82%|████████▏ | 147/180 [01:39<00:27,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 82%|████████▏ | 148/180 [01:40<00:26,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 83%|████████▎ | 149/180 [01:41<00:25,  1.22it/s]

Problem--> Length of values (1) does not match length of index (5)


 83%|████████▎ | 150/180 [01:42<00:24,  1.21it/s]

Problem--> Length of values (1) does not match length of index (5)


 84%|████████▍ | 151/180 [01:43<00:24,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 84%|████████▍ | 152/180 [01:43<00:23,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 85%|████████▌ | 153/180 [01:44<00:22,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 86%|████████▌ | 154/180 [01:45<00:21,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 86%|████████▌ | 155/180 [01:46<00:20,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 87%|████████▋ | 156/180 [01:47<00:20,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 87%|████████▋ | 157/180 [01:48<00:19,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 88%|████████▊ | 158/180 [01:48<00:18,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 88%|████████▊ | 159/180 [01:49<00:17,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 89%|████████▉ | 160/180 [01:50<00:16,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 89%|████████▉ | 161/180 [01:51<00:15,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 90%|█████████ | 162/180 [01:52<00:14,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 91%|█████████ | 163/180 [01:53<00:14,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 91%|█████████ | 164/180 [01:53<00:13,  1.18it/s]

Problem--> Length of values (1) does not match length of index (5)


 92%|█████████▏| 165/180 [01:54<00:12,  1.18it/s]

Problem--> Length of values (1) does not match length of index (5)


 92%|█████████▏| 166/180 [01:55<00:11,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 93%|█████████▎| 167/180 [01:56<00:11,  1.18it/s]

Problem--> Length of values (1) does not match length of index (5)


 93%|█████████▎| 168/180 [01:57<00:10,  1.16it/s]

Problem--> Length of values (1) does not match length of index (5)


 94%|█████████▍| 169/180 [01:58<00:09,  1.16it/s]

Problem--> Length of values (1) does not match length of index (5)


 94%|█████████▍| 170/180 [01:59<00:08,  1.16it/s]

Problem--> Length of values (1) does not match length of index (5)


 95%|█████████▌| 171/180 [01:59<00:07,  1.17it/s]

Problem--> Length of values (1) does not match length of index (5)


 96%|█████████▌| 172/180 [02:00<00:06,  1.18it/s]

Problem--> Length of values (1) does not match length of index (5)


 96%|█████████▌| 173/180 [02:01<00:05,  1.20it/s]

Problem--> Length of values (1) does not match length of index (5)


 97%|█████████▋| 174/180 [02:02<00:05,  1.19it/s]

Problem--> Length of values (1) does not match length of index (5)


 97%|█████████▋| 175/180 [02:03<00:04,  1.18it/s]

Problem--> Length of values (1) does not match length of index (5)


 98%|█████████▊| 176/180 [02:04<00:03,  1.17it/s]

Problem--> Length of values (1) does not match length of index (5)


 98%|█████████▊| 177/180 [02:05<00:02,  1.16it/s]

Problem--> Length of values (1) does not match length of index (5)


 99%|█████████▉| 178/180 [02:05<00:01,  1.15it/s]

Problem--> Length of values (1) does not match length of index (4)


 99%|█████████▉| 179/180 [02:06<00:00,  1.15it/s]

Problem--> Length of values (1) does not match length of index (4)


100%|██████████| 180/180 [02:07<00:00,  1.41it/s]

535 180





In [138]:
df = pd.concat(rs_df, axis=0)
df

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment
0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425..."
1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425..."
2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866..."
3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866..."
4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492..."
...,...,...,...,...,...,...,...
893,B01CF3ECNK,3.0,"I hate to admit it, but...","Against my better judgment, I picked up an iss...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,
894,B01CF3ECNK,3.0,Everyday.....,"A fun magazine to read through but for me, the...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,
895,B01CF3ECNK,5.0,Really well put together,I first saw this magazine at my dentist's offi...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,
896,B01CF3ECNK,5.0,Love this magazine,This magazine is fun and upbeat. The pictures ...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,


In [197]:
for index, row in tqdm(df.iterrows(), total=len(df)):
    if not isinstance(row['sentiment'], list) and pd.isna(row['sentiment']):
        # df.at[index, 'salary'] = 200
        # print(f"{index}, {row['reviewText']}")
        df.at[index, 'sentiment'] = hf_infer(query, row["reviewText"], max_seq=400)
        # sleep(0.5)
        pass

100%|██████████| 898/898 [02:21<00:00,  6.37it/s] 


In [199]:
df[df.sentiment.isna()]

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment


In [7]:
# df.to_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions_sentiment.tsv", sep="\t")
df = pd.read_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions_sentiment.tsv", sep="\t")
print(df.shape)
df.head()

(898, 10)


Unnamed: 0.1,Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len
0,0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32
1,1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32
2,2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136
3,3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136
4,4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...,17


In [243]:
tmp = df.sentiment.map(lambda v: len(v))
tmp

0      3
1      3
2      3
3      3
4      3
      ..
893    3
894    3
895    3
896    3
897    2
Name: sentiment, Length: 898, dtype: int64

In [282]:
# df[(df.sentiment.str.len() < 3)]
# df[[not isinstance(v, list) for v in df.sentiment]]
# fix sentiments assigned with nested array
# df[[isinstance(v, list) and len(v) <3 for v in df.sentiment]]
for index, row in df[[isinstance(v, list) and len(v) <3 for v in df.sentiment]].iterrows():
    df.at[index, "sentiment"] = row["sentiment"][0]


In [304]:
df[(df.sentiment.str.len() < 3)]
# df[[not isinstance(v, list) for v in df.sentiment]]


Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len


In [249]:
df["reviewText_tmp"] = df.reviewText.map(lambda v: ' ' .join(v.split()[:400]))
df

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp
0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha..."
1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha..."
2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...
3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...
4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...
...,...,...,...,...,...,...,...,...
893,B01CF3ECNK,3.0,"I hate to admit it, but...","Against my better judgment, I picked up an iss...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'negative', 'score': 0.576227664947...","Against my better judgment, I picked up an iss..."
894,B01CF3ECNK,3.0,Everyday.....,"A fun magazine to read through but for me, the...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'neutral', 'score': 0.4259364902973...","A fun magazine to read through but for me, the..."
895,B01CF3ECNK,5.0,Really well put together,I first saw this magazine at my dentist's offi...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.698858737945...",I first saw this magazine at my dentist's offi...
896,B01CF3ECNK,5.0,Love this magazine,This magazine is fun and upbeat. The pictures ...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.736666321754...",This magazine is fun and upbeat. The pictures ...


In [251]:
df["reviewText_tmp_len"] = [len(t.split()) for t in df.reviewText_tmp]
df

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len
0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32
1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32
2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136
3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136
4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...,17
...,...,...,...,...,...,...,...,...,...
893,B01CF3ECNK,3.0,"I hate to admit it, but...","Against my better judgment, I picked up an iss...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'negative', 'score': 0.576227664947...","Against my better judgment, I picked up an iss...",85
894,B01CF3ECNK,3.0,Everyday.....,"A fun magazine to read through but for me, the...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'neutral', 'score': 0.4259364902973...","A fun magazine to read through but for me, the...",38
895,B01CF3ECNK,5.0,Really well put together,I first saw this magazine at my dentist's offi...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.698858737945...",I first saw this magazine at my dentist's offi...,90
896,B01CF3ECNK,5.0,Love this magazine,This magazine is fun and upbeat. The pictures ...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.736666321754...",This magazine is fun and upbeat. The pictures ...,74


In [281]:
df[(df.sentiment.str.len() < 3)]

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len
464,B00005NIPP,1.0,Don't do it! -- Pick one of these magazines in...,Maxim rode the men's magazine gold rush in the...,"['Magazine Subscriptions', 'Fashion & Style', ...",Maxim is the essential guide for today's activ...,"[[{'label': 'negative', 'score': 0.62371635437...",Maxim rode the men's magazine gold rush in the...,232


In [280]:
for index, row in df[[len(v) <3 for v in df.sentiment]].iterrows():
    # df.at[index, "sentiment"] = row["sentiment"][0]
    # print(index)
    df.at[index, 'sentiment'] = hf_infer(query, row["reviewText_tmp"], max_seq=250)

In [277]:
wrap_print(df.iloc[464].reviewText[:900])

("Maxim rode the men's magazine gold rush in the late 1990s, leading Stuff, FHM and a host of other PG-13 fare that filled the convenience store racks as "
 'Playboy, Penthouse and the hard cores were either dropped completely or were tucked behind the clerk hidden behind individual black plastic wrappers.\n'
 '\n'
 "Maxim is the last Laddie still in print in the U.S., having witnessed the print burials for FHM (1996 to 2007) and Stuff (1998 to 2007). Maxim's "
 'putrefaction is evident to its subscribers, hence the heavily discounted subscriber rates available today. If you were unfortunate enough to receive a gift '
 "subscription to Maxim or, worse yet, bought one for yourself, you can no doubt bear witness to the decay. The magazine simply isn't interesting or "
 'compelling to read. You could find more of everything - women, entertainment, food and fashion - in a 10-minute Google search.\n'
 '\n'
 'Ask yourself, why are')
None


In [279]:
df.at[464,"reviewText_tmp"] = df.iloc[464].reviewText[:900]

In [301]:
# from transformers import DistilBertTokenizerFast

In [302]:
# from huggingface_hub import inference_api
# client = inference_api.InferenceApi("lxyuan/distilbert-base-multilingual-cased-sentiments-student", task="text-classification", token="...")

In [303]:
# client(df.iloc[464].reviewText, params={"truncation": True})

## Extract keywords

In [14]:
API_URL = "https://api-inference.huggingface.co/models/Voicelab/vlt5-base-keywords"
headers = {"Authorization": "Bearer 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"} #owlmx
# headers = {"Authorization": "Bearer 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"} #idsia-nlp

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	

In [309]:
hf_infer(query, df.iloc[464].reviewText)

[{'generated_text': 'Maxim, entertainment, fashion, fashion, fashion'}]

In [310]:
rs_df = hf_infer_batch(df, query, output_col_name="keywords", max_seq=400, chunks=90)
print(f"{len(rs_df)}")

 61%|██████    | 55/90 [07:19<03:24,  5.84s/it]

Problem--> Length of values (2) does not match length of index (10)


 62%|██████▏   | 56/90 [07:19<02:22,  4.20s/it]

Problem--> Length of values (2) does not match length of index (10)


 63%|██████▎   | 57/90 [07:20<01:40,  3.05s/it]

Problem--> Length of values (2) does not match length of index (10)


 64%|██████▍   | 58/90 [07:20<01:11,  2.25s/it]

Problem--> Length of values (2) does not match length of index (10)


 66%|██████▌   | 59/90 [07:21<00:52,  1.69s/it]

Problem--> Length of values (2) does not match length of index (10)


 67%|██████▋   | 60/90 [07:21<00:38,  1.28s/it]

Problem--> Length of values (2) does not match length of index (10)


 68%|██████▊   | 61/90 [07:21<00:29,  1.01s/it]

Problem--> Length of values (2) does not match length of index (10)


 69%|██████▉   | 62/90 [07:22<00:22,  1.23it/s]

Problem--> Length of values (2) does not match length of index (10)


 70%|███████   | 63/90 [07:22<00:18,  1.50it/s]

Problem--> Length of values (2) does not match length of index (10)


 71%|███████   | 64/90 [07:22<00:14,  1.74it/s]

Problem--> Length of values (2) does not match length of index (10)


 72%|███████▏  | 65/90 [07:23<00:12,  1.99it/s]

Problem--> Length of values (2) does not match length of index (10)


 73%|███████▎  | 66/90 [07:23<00:11,  2.17it/s]

Problem--> Length of values (2) does not match length of index (10)


 74%|███████▍  | 67/90 [07:23<00:09,  2.39it/s]

Problem--> Length of values (2) does not match length of index (10)


 76%|███████▌  | 68/90 [07:24<00:08,  2.52it/s]

Problem--> Length of values (2) does not match length of index (10)


 77%|███████▋  | 69/90 [07:24<00:08,  2.60it/s]

Problem--> Length of values (2) does not match length of index (10)


 78%|███████▊  | 70/90 [07:24<00:07,  2.71it/s]

Problem--> Length of values (2) does not match length of index (10)


 79%|███████▉  | 71/90 [07:25<00:06,  2.73it/s]

Problem--> Length of values (2) does not match length of index (10)


 80%|████████  | 72/90 [07:25<00:06,  2.71it/s]

Problem--> Length of values (2) does not match length of index (10)


 81%|████████  | 73/90 [07:25<00:06,  2.70it/s]

Problem--> Length of values (2) does not match length of index (10)


 82%|████████▏ | 74/90 [07:26<00:05,  2.73it/s]

Problem--> Length of values (2) does not match length of index (10)


 83%|████████▎ | 75/90 [07:26<00:05,  2.73it/s]

Problem--> Length of values (2) does not match length of index (10)


 84%|████████▍ | 76/90 [07:27<00:04,  2.80it/s]

Problem--> Length of values (2) does not match length of index (10)


 86%|████████▌ | 77/90 [07:27<00:04,  2.89it/s]

Problem--> Length of values (2) does not match length of index (10)


 87%|████████▋ | 78/90 [07:27<00:04,  2.84it/s]

Problem--> Length of values (2) does not match length of index (10)


 88%|████████▊ | 79/90 [07:28<00:03,  2.81it/s]

Problem--> Length of values (2) does not match length of index (10)


 89%|████████▉ | 80/90 [07:28<00:03,  2.86it/s]

Problem--> Length of values (2) does not match length of index (10)


 90%|█████████ | 81/90 [07:28<00:03,  2.79it/s]

Problem--> Length of values (2) does not match length of index (10)


 91%|█████████ | 82/90 [07:29<00:02,  2.86it/s]

Problem--> Length of values (2) does not match length of index (10)


 92%|█████████▏| 83/90 [07:29<00:02,  2.84it/s]

Problem--> Length of values (2) does not match length of index (10)


 93%|█████████▎| 84/90 [07:29<00:02,  2.81it/s]

Problem--> Length of values (2) does not match length of index (10)


 94%|█████████▍| 85/90 [07:30<00:01,  2.84it/s]

Problem--> Length of values (2) does not match length of index (10)


 96%|█████████▌| 86/90 [07:30<00:01,  2.91it/s]

Problem--> Length of values (2) does not match length of index (10)


 97%|█████████▋| 87/90 [07:30<00:01,  2.94it/s]

Problem--> Length of values (2) does not match length of index (10)


 98%|█████████▊| 88/90 [07:31<00:00,  2.97it/s]

Problem--> Length of values (2) does not match length of index (10)


 99%|█████████▉| 89/90 [07:31<00:00,  2.88it/s]

Problem--> Length of values (2) does not match length of index (9)


100%|██████████| 90/90 [07:31<00:00,  5.02s/it]

Problem--> Length of values (2) does not match length of index (9)
90





In [313]:
df = pd.concat(rs_df, axis=0)
df

Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len,keywords
0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"{'generated_text': 'MaximumPC, computer perfom..."
1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"{'generated_text': 'MaximumPC, computer perfom..."
2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"{'generated_text': 'AES, computer field, compu..."
3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"{'generated_text': 'AES, computer field, compu..."
4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...,17,"{'generated_text': 'BEST PC Magazine, hardware..."
...,...,...,...,...,...,...,...,...,...,...
893,B01CF3ECNK,3.0,"I hate to admit it, but...","Against my better judgment, I picked up an iss...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'negative', 'score': 0.576227664947...","Against my better judgment, I picked up an iss...",85,
894,B01CF3ECNK,3.0,Everyday.....,"A fun magazine to read through but for me, the...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'neutral', 'score': 0.4259364902973...","A fun magazine to read through but for me, the...",38,
895,B01CF3ECNK,5.0,Really well put together,I first saw this magazine at my dentist's offi...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.698858737945...",I first saw this magazine at my dentist's offi...,90,
896,B01CF3ECNK,5.0,Love this magazine,This magazine is fun and upbeat. The pictures ...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.736666321754...",This magazine is fun and upbeat. The pictures ...,74,


In [122]:
df.to_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions_sentiment-keywords.tsv", sep="\t")
df = pd.read_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions_sentiment-keywords.tsv", sep="\t")
print(df.shape)
df.head()

(898, 14)


Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len,keywords
0,0,0,0,0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"MaximumPC, computer perfomance, computer perfo..."
1,1,1,1,1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"MaximumPC, computer perfomance, computer perfo..."
2,2,2,2,2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"AES, computer field, computer field, related t..."
3,3,3,3,3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"AES, computer field, computer field, related t..."
4,4,4,4,4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...,17,"BEST PC Magazine, hardware, software"


In [81]:
from ast import literal_eval

In [115]:
# df.keywords = df.keywords.map(literal_eval)
for index, row in df.iterrows():
    if not type(row["keywords"]) in [list, dict] and pd.isna(row["keywords"]):
        print('****AHHHHHH')    
        print(f'{row["keywords"]} -> {type(row["keywords"])}')
    else:
        df.at[index, "keywords"]  = literal_eval(row["keywords"]) 
        # print("x")


In [116]:
df[df.keywords.isna()]

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len,keywords


In [113]:
for index, row in df.iterrows():
    # print(f'{row["keywords"]} -> {type(row["keywords"])}')
    if isinstance(row["keywords"], dict) and 'error' in row["keywords"]:
        print('****')
        print(f'{row["keywords"]} -> {type(row["keywords"])}')
        # df.at[index, "keywords"] = None
    elif isinstance(row["keywords"], list):
        print('****XXX')
        print(f'{row["keywords"]} -> {type(row["keywords"])}')
        # df.at[index, "keywords"] = row["keywords"][0]
    elif not type(row["keywords"]) in [list, dict] and pd.isna(row["keywords"]):
        print('****AHHHHHH')    
        print(f'{row["keywords"]} -> {type(row["keywords"])}')
    
    # print(index)

In [104]:
# try the missing ones
i = 0
for index, row in df[[(not type(v) in [list, dict]) and pd.isna(v) for v in df.keywords]].iterrows():
    # df.at[index, "sentiment"] = row["sentiment"][0]
    i+=1
    # print(f"{index} -> {i} => \n\t{row['reviewText']}\n\n")
    print(f"{index} -> {i}")

    try:
        df.at[index, 'keywords'] = hf_infer(query, row["reviewText"])[0]
        # print(index)
    except Exception as err:
        print(f"Error --> {err}")
        pass
    # if i>2:
    #     break

540 -> 1
541 -> 2
542 -> 3


In [120]:
df["keywords"] = df["keywords"].map(lambda v: v.get('generated_text', ""))

In [121]:
# for index, row in df.iloc[0:30].iterrows():
for index, row in df.iloc[540:550].iterrows():
    display(f'{index} - {row["keywords"]}')
    print(type(row["keywords"]))

'540 - Winespectator, Winespectator the best, Winespecta'

<class 'str'>


'541 - digital, magazine, magazine, magazine, magazine, ma'

<class 'str'>


'542 - Wine Spectator, wine region, wine spectator'

<class 'str'>


'543 - Wine Spectator, Wine spectator, Wine spectator, W'

<class 'str'>


'544 - Wine coverage, Wine magazine, Wine coverage'

<class 'str'>


'545 - beaverage, beaverage, beaverage, beaverage'

<class 'str'>


'546 - Woman’s World, horoscopes, horoscopes, horo'

<class 'str'>


'547 - grocery store, information piece, information piece,'

<class 'str'>


'548 - magizine, magizine, magizine, magizine, magizine'

<class 'str'>


'549 - information, nutrition, nutrition information, young bri'

<class 'str'>


# Transform to prompts in the Dolly Format

In [125]:
# From Joseph generated Dataset (instruction, context, model, category, response )
# From HF dataset (instruction,  category, input, output)

In [9]:
df = pd.read_csv("../data/data/amazon_reviews_v2/processed_magazine-subscriptions_sentiment-keywords.tsv", sep="\t")
print(df.shape)
df

(898, 14)


Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,overall,summary,reviewText,category,description,sentiment,reviewText_tmp,reviewText_tmp_len,keywords
0,0,0,0,0,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"MaximumPC, computer perfomance, computer perfo..."
1,1,1,1,1,B00005N7P0,4.0,Cheapskates guide,"I'm old, and so is my computer. Any advice th...","['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.922463059425...","I'm old, and so is my computer. Any advice tha...",32,"MaximumPC, computer perfomance, computer perfo..."
2,2,2,2,2,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"AES, computer field, computer field, related t..."
3,3,3,3,3,B00005N7P0,5.0,Excellent Computer Magazine,When PC Magazine ceased publication of their p...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.596512436866...",When PC Magazine ceased publication of their p...,136,"AES, computer field, computer field, related t..."
4,4,4,4,4,B00005N7P0,5.0,Best PC Magazine,In my mind BEST PC Magazine for upgrades / har...,"['Magazine Subscriptions', 'Technology', 'Comp...",MAXIMUM PC is the ultimate upgrade for the sav...,"[{'label': 'positive', 'score': 0.792497694492...",In my mind BEST PC Magazine for upgrades / har...,17,"BEST PC Magazine, hardware, software"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,893,893,893,893,B01CF3ECNK,3.0,"I hate to admit it, but...","Against my better judgment, I picked up an iss...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'negative', 'score': 0.576227664947...","Against my better judgment, I picked up an iss...",85,"Rachael Ray, Rachael Ray, food, food, f"
894,894,894,894,894,B01CF3ECNK,3.0,Everyday.....,"A fun magazine to read through but for me, the...","['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'neutral', 'score': 0.4259364902973...","A fun magazine to read through but for me, the...",38,"fun magazine, fun magazine, fun magazine,"
895,895,895,895,895,B01CF3ECNK,5.0,Really well put together,I first saw this magazine at my dentist's offi...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.698858737945...",I first saw this magazine at my dentist's offi...,90,"Martha Stuart style, cooking, time better, tim..."
896,896,896,896,896,B01CF3ECNK,5.0,Love this magazine,This magazine is fun and upbeat. The pictures ...,"['Magazine Subscriptions', 'Cooking, Food & Wi...",Rachael Ray Every Day is a food and cooking ma...,"[{'label': 'positive', 'score': 0.736666321754...",This magazine is fun and upbeat. The pictures ...,74,"magazine, pictures, pictures of the magazine"


In [18]:
from ast import literal_eval
def get_max_val(d):    
    # return max(d, key=d.get)
    return ( max(d, key=lambda x:x['score']) )

df["label"] =  df.sentiment.map(lambda v: get_max_val(literal_eval(v)).get('label'))

In [19]:
df.label.value_counts()

positive    666
negative    187
neutral      45
Name: label, dtype: int64

In [22]:
review_score_mapping = {1: "very unsatisfied", 2: "unsatisfied", 3: "moderately satisfied ", 4: "satisfied", 5: "very satisfied" }
# df.overall.unique()

In [23]:
def instructify(e:dict ):
    rs = {
        # "instruction": f"""Based in the provided product description, generate a {get_max_val(literal_eval(e['sentiment'])).get('label')} product review from a client who is {review_score_mapping.get( e['overall'])}. Consider the following keywords: {e['keywords']} """,
        "instruction": f"""Based in the provided product description, generate a {e['label']} product review from a client who is {review_score_mapping.get( e['overall'])}. Consider the following keywords: {e['keywords']} """,
        "context": f"Product description: {e['description']}",
        "category": 'review_generation',
        "response": e["reviewText"],
        "model": 'human',
    }
    
    return rs

In [24]:
for index, e in df.sample(3).iterrows():
    displayAll(instructify(e))

{'instruction': 'Based in the provided product description, generate a positive product review from a client who is very satisfied. Consider the following keywords: advancements, science, science, science, science, science, science ',
 'context': 'Product description: The \'What\'s New\'" magazine of science and technology. Covering the latest developments in cars, electronics, communications, tools, energy, aviation, science, space exploration and much more.',
 'category': 'review_generation',
 'response': 'Love this magazine.  It always has such interesting articles & short notes about new advancements in science and technology.\nThanks.',
 'model': 'human'}

{'instruction': 'Based in the provided product description, generate a positive product review from a client who is very satisfied. Consider the following keywords: Good Housekeeping, house-wifery, house-wifery ',
 'context': "Product description: Good Housekeeping magazine, together with the Good Housekeeping Institute and the Good Housekeeping Seal, is an American icon of consumer protection and quality assurance. Every issue delivers a unique mix of independent investigation and trusted reporting, along with inspirational and personal stories. The magazine's rich tradition embodies a commitment to the modern home and to a woman's quality of life.",
 'category': 'review_generation',
 'response': 'Good Housekeeping is a publication that truly gives subscribers their money\'s worth with all of the wonderful features offered. You can pick up tons of tips on beauty, nutrition, technology, and what folks in the 18th century called "the mysteries of house-wifery." There really is so much h

{'instruction': 'Based in the provided product description, generate a positive product review from a client who is very satisfied. Consider the following keywords: Rachael, recipe, recipe, recipe, recipe ',
 'context': 'Product description: Rachael Ray Every Day is a food and cooking magazine for women that features the television personality herself, Rachael Ray. This foodie magazine includes fun articles and stories on travel, parties and entertaining and thousands of recipes for you to try out at home.',
 'category': 'review_generation',
 'response': "this magazine has alot of great recipe's,great pictures of the food in the recipe's. Rachael makes very easy to do recipe's.",
 'model': 'human'}

In [25]:
rs = [instructify(e) for index, e in df.query("label =='positive'").iterrows()] + \
[instructify(e) for index, e in df.query("label =='negative'").iterrows()] + \
[instructify(e) for index, e in df.query("label =='neutral'").iterrows()]

rs


[{'instruction': 'Based in the provided product description, generate a positive product review from a client who is satisfied. Consider the following keywords: MaximumPC, computer perfomance, computer perfomance ',
  'context': 'Product description: MAXIMUM PC is the ultimate upgrade for the savvy PC owners. Every month, the magazine is packed with breaking news, tons of tips &amp; techniques, and the most in-depth reviews anywhere.',
  'category': 'review_generation',
  'response': "I'm old, and so is my computer.  Any advice that can help me maximize my computer perfomance is very welcome.  MaximumPC has some good tips on computer parts, vendors, and usefull tests",
  'model': 'human'},
 {'instruction': 'Based in the provided product description, generate a positive product review from a client who is satisfied. Consider the following keywords: MaximumPC, computer perfomance, computer perfomance ',
  'context': 'Product description: MAXIMUM PC is the ultimate upgrade for the savvy P

In [26]:
len(rs)

898

In [27]:
rs_ds = pd.DataFrame.from_dict(rs)
print(rs_ds.shape)
rs_ds

(898, 5)


Unnamed: 0,instruction,context,category,response,model
0,"Based in the provided product description, gen...",Product description: MAXIMUM PC is the ultimat...,review_generation,"I'm old, and so is my computer. Any advice th...",human
1,"Based in the provided product description, gen...",Product description: MAXIMUM PC is the ultimat...,review_generation,"I'm old, and so is my computer. Any advice th...",human
2,"Based in the provided product description, gen...",Product description: MAXIMUM PC is the ultimat...,review_generation,When PC Magazine ceased publication of their p...,human
3,"Based in the provided product description, gen...",Product description: MAXIMUM PC is the ultimat...,review_generation,When PC Magazine ceased publication of their p...,human
4,"Based in the provided product description, gen...",Product description: MAXIMUM PC is the ultimat...,review_generation,In my mind BEST PC Magazine for upgrades / har...,human
...,...,...,...,...,...
893,"Based in the provided product description, gen...",Product description: Introducing brand-new All...,review_generation,Not what I expected. Found it boring and lacki...,human
894,"Based in the provided product description, gen...",Product description: Rachael Ray Every Day is ...,review_generation,Just got my first issue. Lots of ads very lit...,human
895,"Based in the provided product description, gen...",Product description: Rachael Ray Every Day is ...,review_generation,Even I (husband with very basic cooking skills...,human
896,"Based in the provided product description, gen...",Product description: Rachael Ray Every Day is ...,review_generation,I've subscribed to this magazine for several y...,human


In [28]:
displayAll(rs_ds.sample(5))

Unnamed: 0,instruction,context,category,response,model
694,"Based in the provided product description, generate a negative product review from a client who is moderately satisfied . Consider the following keywords: leadership, leadership role, workplace","Product description: Marie Claire offers solutions for the woman whose time constraints demand one resource to respond to diverse aspects of her life. From global and cultural issues to fashion and beauty coverage, Marie Claire is for the woman of substance with an eye for style.",review_generation,"This magazine has its good and bad issues which happen every month so it's hard to grade. I do like the new ""work"" feature they added though. Especially November 2012 article regarding taking a leadership role in the workplace.",human
217,"Based in the provided product description, generate a positive product review from a client who is unsatisfied. Consider the following keywords: ads, ads, ads, ads, ads, ads, a","Product description: Redbook is the must-read magazine for today's young, married woman: an individual as passionate about her own needs as she is about those of her family. Each issue offers exciting, provocative features that address the all aspects of her lifeeverything from stylish fashion and beauty portfolios to scintillating stories on keeping her marriage fresh, to ideas on balancing home and career demands.",review_generation,Just not into magazines too many ads. But great price.,human
359,"Based in the provided product description, generate a positive product review from a client who is very satisfied. Consider the following keywords: children, children, parenting, parenting, par","Product description: The magazine that celebrates smart, sophisticated 40+ women?s interests in fashion, health, beauty, travel, and self-reinvention.",review_generation,"One of my favorite magazines -- focuses on women who are past the parenting of children and provides a variety of information -- not just style or makeup, but good, solid, articles on all aspects of life.",human
519,"Based in the provided product description, generate a positive product review from a client who is very satisfied. Consider the following keywords: Rachael Ray, Rachael Ray, Rachael Ray, Rachael Ray","Product description: Rachael Ray Every Day is a food and cooking magazine for women that features the television personality herself, Rachael Ray. This foodie magazine includes fun articles and stories on travel, parties and entertaining and thousands of recipes for you to try out at home.",review_generation,My sweetheart loves that magazine we both love Rachael Ray,human
670,"Based in the provided product description, generate a negative product review from a client who is very unsatisfied. Consider the following keywords: anesthesiology, analysis, analysisology, surge","Product description: Discover magazine provides a comprehensive look into the latest news in the world of science and the effect it has on our everyday lives. Sure to satisfy the curiosity of your most inquisitive customers, Discover is a must-have magazine for your waiting room.",review_generation,"It's just disappointing where this magazine has gone. Last month, I had the ""pleasure"" of reading a bombastic self-description of how a surgeon saved a patient's airway from his anesthesiologist. The article was so narcissistic and the story was so improbable for anybody who works in an operating room, that I'm 100% sure it was never fact-checked, not to speak about running it by an independent expert.\n\nSince then, I don't trust what I read there, and the magazine is too glossy for toilet paper, so I'll just cancel the subscription.",human


In [29]:
# Save dataset
rs_ds.to_csv(path.join("../data/data/detection_dataset/", "repurposeds_review_generation.tsv"), sep="\t")