In [8]:
import tensorflow_datasets as tfds
# Load the 'train' split of the Mobile_Electronics_v1_00 config of amazon_us_reviews.
# NOTE(review): shuffle_files=True presumably makes row order vary between runs — confirm against TFDS docs.
ds = tfds.load('amazon_us_reviews/Mobile_Electronics_v1_00', split='train', shuffle_files=True)

In [9]:
# Convert the TFDS dataset into a DataFrame for the column-wise processing below.
df = tfds.as_dataframe(ds)

2022-05-11 15:53:33.545116: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-05-11 15:53:33.559845: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2900000000 Hz


In [11]:
# Sanity check: (rows, columns) of the raw review table.
df.shape

(104975, 15)

In [12]:
# Binary sentiment label: ratings of 3 stars and above count as positive (1),
# everything else as negative (0). The comparison is vectorized, replacing the
# earlier row-wise apply() + string -> int map() round-trip with the same result.
df["Sentiment"] = (df["data/star_rating"] >= 3).astype("int64")
# Review bodies arrive as bytes; decode them to str.
df["short_review"] = df["data/review_body"].str.decode("utf-8")
# Keep only the two columns used downstream.
# NOTE: this rebinds `df`, so re-running the cell needs the load cell re-run first.
df = df[["short_review", "Sentiment"]]

In [13]:
# Preview the first rows of the reduced (short_review, Sentiment) frame.
df.head()

Unnamed: 0,short_review,Sentiment
0,This product was purchased to hold a monitor o...,1
1,This product works great so far. I have been u...,1
2,Does not work,0
3,This is a great wiring kit i used it to set up...,1
4,It works great so much faster than USB charger...,1


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# Fixed seeds make the 80/20 split reproducible across kernel restarts when the
# input frame is the same. (train_test_split already shuffles by default; the
# explicit shuffle is kept so the row order of `df` itself is randomized too.)
df = shuffle(df, random_state=42)
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

_ENGLISH_STOPWORDS = None  # lazily filled by _english_stopwords(); shared across calls


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, fetching the corpus on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        nltk.download("stopwords", quiet=True)
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(review):
    """Turn one raw review string into a list of stemmed, stopword-free tokens.

    Steps: strip HTML markup, lowercase, replace every character that is not
    a-z/A-Z/0-9 with a space, split on whitespace, drop English stopwords,
    then Porter-stem what remains.

    Args:
        review: Raw review text (may contain HTML).

    Returns:
        list[str]: The stemmed tokens, in their original order.
    """
    # The stopword list is loaded once and reused as a set: reading it for
    # every word, as before, dominated the cost of preprocessing.
    stop_set = _english_stopwords()
    stemmer = PorterStemmer()  # one stemmer per review rather than one per word
    # TODO: try out SnowballStemmer
    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Lowercase, blank out non-alphanumerics
    return [stemmer.stem(w) for w in text.split() if w not in stop_set]

In [21]:
# Pull the review texts and their labels out of each split as plain Python lists.
data_train, labels_train = (train[column].tolist() for column in ("short_review", "Sentiment"))
data_test, labels_test = (test[column].tolist() for column in ("short_review", "Sentiment"))

In [22]:
# Spot-check the first training example: (raw review text, label).
data_train[0], labels_train[0]

('I bought this for $250 and I really really overpaid. The face plate shorts out when I press the buttons and shuts off completely, the interface is very slow and laggy, and the display is literally not visible when there is sunlight, and the bluetoothe mic is garbage. I should not have paid more than $100 for this head unit. I only bought it for the good reviews, which was very deceiving. please do not buy this, for the love of god.',
 0)

In [23]:
# Spot-check the first test example: (raw review text, label).
data_test[0], labels_test[0]

('I ordered this for my daughter who got an IPod for Christmas this year and this case is perfect for taking the IPod out of the house and not having to worry about it getting broken.',
 1)

In [27]:
import pickle
import os

# Directory for cached preprocessing results; also the default `cache_dir`
# argument of preprocess_data() defined below.
cache_dir = os.path.join("./cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists (no error if already there)

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data_amazon.pkl"):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: Iterable of raw training reviews.
        data_test: Iterable of raw test reviews.
        labels_train: Labels for ``data_train``; returned unchanged unless a cache is used.
        labels_test: Labels for ``data_test``; returned unchanged unless a cache is used.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name, or ``None`` to disable caching entirely.

    Returns:
        tuple: ``(words_train, words_test, labels_train, labels_test)``.

    Note:
        When a usable cache exists, its contents are returned as-is and the
        arguments are ignored. Delete the cache file after changing the input
        data or the train/test split.
    """
    required_keys = ("words_train", "words_test", "labels_train", "labels_test")
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If caching is enabled, try to read from the cache first.
    cache_data = None
    if cache_path is not None:
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only point this
            # at cache files that this notebook wrote itself.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
        except FileNotFoundError:
            pass  # no cache yet: the normal first-run case
        except Exception as exc:
            # Still best-effort (fall through to recomputing), but no longer
            # silent, and KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Ignoring unreadable cache file:", cache_file, "-", repr(exc))
            cache_data = None
        else:
            if isinstance(cache_data, dict) and all(key in cache_data for key in required_keys):
                print("Read preprocessed data from cache file:", cache_file)
            else:
                # Wrong structure (e.g. written by another version): recompute.
                print("Ignoring cache file with unexpected contents:", cache_file)
                cache_data = None

    if cache_data is not None:
        # Unpack data loaded from the cache file.
        return tuple(cache_data[key] for key in required_keys)

    # Cache is missing or unusable, so do the heavy lifting.
    words_train = [review_to_words(review) for review in data_train]
    words_test = [review_to_words(review) for review in data_test]

    # Write to the cache file for future runs.
    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        parent = os.path.dirname(cache_path)
        if parent:
            # A caller-supplied cache_dir may not exist yet.
            os.makedirs(parent, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train, labels_test

In [28]:
# Tokenize and stem every review (slow on first run; later runs read the pickle cache).
# Note that train_y/test_y are the label lists passed in unless the cache was used.
train_X, test_X, train_y, test_y = preprocess_data(data_train, data_test, labels_train, labels_test)

Wrote preprocessed data to cache file: preprocessed_data_amazon.pkl


In [29]:
import numpy as np

def build_dict(data, vocab_size = 10000):
    """Map the most frequent words in ``data`` to integer ids.

    Ids 0 and 1 are reserved for the 'no word' and 'infrequent word' labels,
    so at most ``vocab_size - 2`` words are assigned ids, starting at 2 for the
    most frequent word.

    Args:
        data: Iterable of sentences, each an iterable of word tokens.
        vocab_size: Total vocabulary size, including the two reserved ids.
            Values below 2 leave no room for real words and give an empty dict.

    Returns:
        dict: word -> integer id (>= 2), ordered by descending frequency.
    """
    from collections import Counter  # local import keeps this cell self-contained

    word_count = Counter()  # how often each word appears across all sentences
    for sentence in data:
        word_count.update(sentence)

    # Clamp at zero: a negative slice bound would otherwise drop words from the
    # END of the ranking and keep nearly the whole vocabulary.
    n_words = max(vocab_size - 2, 0)

    # most_common() orders by descending count; ties keep first-seen order,
    # matching a stable sort over the insertion-ordered counts.
    ranked = word_count.most_common()[:n_words]

    # The +2 saves room for the 'no word' (0) and 'infrequent' (1) labels.
    return {word: idx + 2 for idx, (word, _) in enumerate(ranked)}

def convert_and_pad(word_dict, sentence, pad=500):
    """Encode a tokenized sentence as exactly ``pad`` integer ids.

    Words found in ``word_dict`` get their id, unknown words get 1, and the
    sequence is truncated or right-padded with 0 to length ``pad``.

    Returns:
        tuple: (list of ``pad`` ids, number of real tokens kept).
    """
    NOWORD = 0  # padding id: 'no word' at this position
    INFREQ = 1  # id for words that are absent from word_dict

    encoded = [word_dict.get(token, INFREQ) for token in sentence[:pad]]
    encoded.extend([NOWORD] * (pad - len(encoded)))

    return encoded, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=500):
    """Apply convert_and_pad to every sentence in ``data``.

    Returns:
        tuple: (np.array of the encoded sentences, np.array of their lengths
        as reported by convert_and_pad).
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]
    result = [converted for converted, _ in encoded_pairs]
    lengths = [leng for _, leng in encoded_pairs]

    return np.array(result), np.array(lengths)

In [31]:
# Merge the preprocessed train and test splits back into one pool.
# New lists are built rather than extended in place: when preprocess_data() did
# not read from the cache, train_y is the same list object as labels_train, so
# .extend() would silently grow labels_train as well.
# NOTE: re-running this cell appends the test split again; re-run the
# preprocess_data() cell first.
train_X = train_X + test_X
train_y = train_y + test_y
len(train_X), len(train_y)

(104975, 104975)

In [32]:
import random
# Shuffle reviews and labels together so every review stays paired with its label.
# A dedicated seeded RNG makes the shuffle (and therefore the split written to
# disk later) repeatable whenever the inputs arrive in the same order, e.g.
# when they are loaded from the preprocessing cache.
temp = list(zip(train_X, train_y))
random.Random(42).shuffle(temp)
train_X, train_y = zip(*temp)
# zip(*...) yields tuples, so convert both back to lists.
train_X, train_y = list(train_X), list(train_y)
len(train_X), len(train_y)

(104975, 104975)

In [33]:

# Positional split of the shuffled pool: the first `split_at` examples are
# used for training, the remainder for testing.
split_at = 83000
print(len(train_X))
train_list, test_list = train_X[:split_at], train_X[split_at:]
train_labels_list, test_labels_list = train_y[:split_at], train_y[split_at:]
print(len(train_list), len(train_labels_list), len(test_list), len(test_labels_list))
print(train_list[0], test_list[0], train_labels_list[0], test_labels_list[0])

104975
83000 83000 21975 21975
['bought', 'back', 'oct', 'final', 'took', 'author', 'technician', 'week', 'program', 'technician', 'unabl', 'program', 'like', 'pictur', 'number', 'list', 'back'] ['wast', 'money', 'case', '2', 'piec', 'silicon', 'piec', 'hard', 'plastic', 'piec', 'sort', 'lock', 'silicon', 'piec', 'fit', 'loos', 'ipod', 'slightli', 'snug', 'put', 'hard', 'plastic', 'piec', 'shake', 'ipod', 'case', 'actual', 'fall', 'apart', 'ipod', 'slip', 'complet', 'defeat', 'purpos', 'get', 'protect', 'case', 'first', 'place'] 0 0


In [34]:
# Build the vocabulary from the training portion only (default vocab_size).
word_dict = build_dict(train_list)

In [35]:
data_dir = './data/pytorch' # The folder we will use for storing data
# exist_ok=True replaces the separate exists() check: there is no window
# between checking and creating, and it matches how cache_dir is created.
os.makedirs(data_dir, exist_ok=True)

In [36]:
# Persist the word -> id mapping next to the encoded data so the same
# encoding can be re-applied later.
with open(os.path.join(data_dir, 'word_dict_amazon.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [37]:
# Encode both splits as fixed-length id sequences (default pad=500).
# NOTE: this rebinds train_X/test_X from lists of token lists to numpy arrays.
train_X, train_X_len = convert_and_pad_data(word_dict, train_list)
test_X, test_X_len = convert_and_pad_data(word_dict, test_list)


In [38]:
# Sanity check: encoded array shapes should line up with the length arrays.
print(train_X.shape, len(train_X_len))
print(test_X.shape, len(test_X_len))
# train_y is still the full shuffled label list; train_labels_list is its
# leading slice, so index 0 is the first training label either way.
print(train_y[0])

(83000, 500) 83000
(21975, 500) 21975
0


In [39]:
import pandas as pd
# Column layout of each row: label, sequence length, then the padded word ids.
train_frame = pd.concat(
    [
        pd.DataFrame(train_labels_list),
        pd.DataFrame(train_X_len),
        pd.DataFrame(train_X),
    ],
    axis=1,
)
train_csv_path = os.path.join(data_dir, 'train_amazon.csv')
train_frame.to_csv(train_csv_path, header=False, index=False)

In [40]:
# Column layout of each row: label, sequence length, then the padded word ids.
test_frame = pd.concat(
    [
        pd.DataFrame(test_labels_list),
        pd.DataFrame(test_X_len),
        pd.DataFrame(test_X),
    ],
    axis=1,
)
test_csv_path = os.path.join(data_dir, 'test_amazon.csv')
test_frame.to_csv(test_csv_path, header=False, index=False)