In [1]:
# Import libraries

In [2]:
import pandas as pd
import json

import re
from nltk.corpus import stopwords

In [3]:
# Set the path to the dataset

In [4]:
# Relative path to the reviews file; the load step opens it and parses one JSON record per line.
path_data = '../data/Musical_Instruments_5.json'

## Functions

In [5]:
def clean_text(text, remove_stopwords=False, contraction_map=None):
    '''Normalise a raw review into lower-case words separated by single spaces.

    Steps, in order: lower-casing, contraction expansion, removal of URLs and
    HTML remnants, replacement of punctuation with spaces, removal of digits,
    whitespace collapsing and, optionally, stop-word removal.

    Parameters
    ----------
    text : str
        Raw review text.
    remove_stopwords : bool, default False
        When True, drop the words returned by nltk's
        ``stopwords.words("english")``.
    contraction_map : dict, optional
        Mapping from a lower-case contraction to its expansion. When omitted,
        the notebook-level ``contractions`` table is used.

    Returns
    -------
    str
        The cleaned text.
    '''
    if contraction_map is None:
        # Default to the lookup table defined at notebook level.
        contraction_map = contractions

    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms. Matching is done on
    # whitespace-separated tokens, so a contraction glued to punctuation
    # (e.g. "it's,") is left as is.
    text = " ".join(contraction_map.get(word, word) for word in text.split())

    # Remove URLs. Only the URL token is dropped: at this point the text is a
    # single line, so a greedy ".*" would erase everything after the first URL.
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Line-break tags must go before the punctuation pass, which turns "/"
    # into a space and would otherwise leave "<br >" behind.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = text.replace("'", ' ')
    text = re.sub(r'[0-9]', '', text)  # take out digits
    text = re.sub(r'\s+', ' ', text)   # more than one blank space
    text = text.strip()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [6]:
# Lookup table used by clean_text: lower-case contraction -> expanded form.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}

## Load data

In [7]:
# Read the reviews file, parsing one JSON record per line.
data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(path_data, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of letting json.loads fail on an empty string.
            continue
        data.append(json.loads(line))

print(len(data))
print(data[0])

10261
{'reviewerID': 'A2IBPI20UZIR0U', 'asin': '1384719342', 'reviewerName': 'cassandra tu "Yeah, well, that\'s just like, u...', 'helpful': [0, 0], 'reviewText': "Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,", 'overall': 5.0, 'summary': 'good', 'unixReviewTime': 1393545600, 'reviewTime': '02 28, 2014'}


In [8]:
# Turn the list of parsed review records into a pandas DataFrame

raw_data = pd.DataFrame(data)

In [9]:
# Work on an independent copy holding only the 'reviewText' and 'overall' columns
data_reviews = raw_data.loc[:, ['reviewText', 'overall']].copy()

In [10]:
# Drop blank reviews: empty strings, whitespace-only text and missing values.
# A missing value would otherwise reach clean_text and fail on .lower().

filt = data_reviews['reviewText'].isna() | (data_reviews['reviewText'].str.strip() == '')
data_reviews = data_reviews[~filt]

In [11]:
# Display the type of data_reviews after the blank-review filter.
type(data_reviews)

pandas.core.frame.DataFrame

In [12]:
# Remove duplicated reviews. With keep=False every row whose
# (reviewText, overall) pair occurs more than once is discarded,
# including the first occurrence.
data_review_nodup = (
    data_reviews
    .drop_duplicates(subset=['reviewText', 'overall'], keep=False)
    .copy()
)

## Apply the cleaning functions

In [13]:
# Run every review text through clean_text (stop words are kept)

clean_texts = [clean_text(review) for review in data_review_nodup.reviewText]
print("Texts are complete.")

Texts are complete.


In [14]:
# Print the first five cleaned reviews to check the cleaning by eye

for review_no in range(1, 6):
    print("Clean Review #", review_no)
    print(clean_texts[review_no - 1])
    print()

Clean Review # 1
not much to write about here but it does exactly what it is supposed to filters out the pop sounds now my recordings are much more crisp it is one of the lowest prices pop filters on amazon so might as well buy it they honestly work the same despite their pricing

Clean Review # 2
the product does exactly as it should and is quite affordable i did not realized it was double screened until it arrived so it was even better than i had expected as an added bonus one of the screens carries a small hint of the smell of an old grape candy i used to buy so for reminiscent s sake i cannot stop putting the pop filter next to my nose and smelling it after recording dif you needed a pop filter this will work just as well as the expensive ones and it may even come with a pleasing aroma like mine did buy this product

Clean Review # 3
the primary job of this device is to block the breath that would otherwise produce a popping sound while allowing your voice to pass through with no n

In [15]:
# Attach the processed reviews to data_review_nodup as a new column

data_review_nodup['cleaned_reviews'] = clean_texts

# Keep only the cleaned text and the 'overall' column, with the text first

cleaned_reviews = data_review_nodup.loc[:, ['cleaned_reviews', 'overall']]

In [16]:
# Save the processed DataFrame as CSV, leaving out the index column

output_csv = 'cleaned_reviews.csv'
cleaned_reviews.to_csv(output_csv, index=False)