## 1: Load Data

In [32]:
# Import Analysis Libraries
import numpy as np
import pandas as pd
import random
import sys
import re

# Import Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Configure Matplotlib graph production
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Import Word Processing Libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer # Used over Porter Stemmer to allow scalability with additional languages in the future
# Fetch the stop-word corpus BEFORE reading it (the download is skipped when the
# package is already up to date), so a machine without the corpus does not fail
# on the lookup below.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the imported corpus reader to a plain list of
# English stop words; the later cells use the list.
stopwords = nltk.corpus.stopwords.words('english')

# Import Machine Learning Libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, KernelPCA
from sklearn.decomposition import LatentDirichletAllocation

# Configure Seed Generation
# Draw a fresh seed on every load; replace the right-hand side with a fixed integer
# to get a repeatable run. The draw is capped at 2**32 - 1 because that is the
# largest seed NumPy's legacy global generator accepts.
seed_value = random.randrange(2**32)
# Seed both the stdlib generator and NumPy's global generator with the same value
random.seed(seed_value)
np.random.seed(seed_value)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\russe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Read the review dataset: comma-separated, first row holds the column names
DF = pd.read_csv(
    'Review_data.csv',
    sep=',',
    header=0,
)
# Preview the first rows
DF.head()

Unnamed: 0,review_body,star_rating
0,Good luck finding a reasonably priced band rep...,1
1,No i get dark on the first week with me!! I wi...,1
2,I dont know if mine was a mistake but it clear...,1
3,The rod that holds the bracelet broke several ...,1
4,"I bought 2 watches , one watch doesnot work at...",1


In [19]:
# Report the number of missing values in each column
print('Empty Values:')
print(DF.isna().sum(), '\n')
# Report how many reviews fall under each star rating
print('Number of Entries per Review Rating')
print(DF['star_rating'].value_counts())

Empty Values:
review_body    0
star_rating    0
dtype: int64 

Number of Entries per Review Rating
1    3000
2    3000
3    3000
4    3000
5    3000
Name: star_rating, dtype: int64


In [26]:
# Keep only the review text column, as a plain Python list
body_data = DF['review_body'].tolist()

# Preview up to five reviews; slicing (rather than indexing 0..4) also works when
# the dataset holds fewer than five rows
for review in body_data[:5]:
    print(review, '\n')

Good luck finding a reasonably priced band replacement. I ordered the band from the dealer who sold it to me (no one else in town could get one) and Skagen sent the wrong one.  I guess I'll try again, but not allowing anyone else to make bands for your unique watch design seems stupid. I will certainly never buy one again. 

No i get dark on the first week with me!! I will never buy this item and i had buy 5 of them 

I dont know if mine was a mistake but it clearly states aqua so im confused why mine is lime green. I hate lime green and am very irritated. This is why people hate ordering on amazon. Ive spent 100s of dollars on here latey and this one will make me not want to order.  At least its not much money. Just annoying thinking u ordered something and get something else.  Well its going in the trash... 

The rod that holds the bracelet broke several times and the company do not fix it, it is sitting on the drawer so I can come to see the Jeweler to try to fix one more time. Don'

## 2: Word Processing

In [33]:
# Report how many entries the `stopwords` list holds
print(len(stopwords), 'Stopwords used from library')

179 Stopwords used from library


In [34]:
# Strip everything except ASCII letters, lowercase the text and split it into words
def remove_punct(entry):
    """Tokenize a piece of text into lowercase, letters-only words.

    Every character outside a-z / A-Z (punctuation, digits, symbols, accented
    letters, ...) is replaced by a space, so it also acts as a word separator.

    Parameters
    ----------
    entry : str
        Raw text.

    Returns
    -------
    list of str
        The lowercase words, in their original order.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', entry)
    lowered = letters_only.lower()
    return lowered.split()

In [40]:
# Stem tokenized data
def stem(data):
    """Drop stop words from a token list, stem the rest and re-join them.

    Parameters
    ----------
    data : iterable of str
        Tokens of one document (e.g. the output of ``remove_punct``).

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    snowball = SnowballStemmer('english')
    # Build a set once per call so each membership test is a hash lookup instead
    # of a linear scan through the whole stop-word list for every token.
    stop_set = set(stopwords)
    return ' '.join(snowball.stem(word) for word in data if word not in stop_set)

In [41]:
# Preprocess data sequentially
def pre_process(target_data):
    """Turn raw text entries into cleaned, stemmed strings.

    Each entry is tokenized with ``remove_punct`` and then passed through
    ``stem``. The input is only read, never modified, so no copy is taken, and
    it is iterated directly rather than indexed by position; any iterable of
    strings therefore works (list, tuple, generator, pandas Series whatever its
    index).

    Parameters
    ----------
    target_data : iterable of str
        Raw text entries.

    Returns
    -------
    list of str
        One processed string per input entry, in the same order.
    """
    return [stem(remove_punct(entry)) for entry in target_data]

In [42]:
# Run the preprocessing pipeline over every review body
word_corpus = pre_process(body_data)
# Preview the first five processed entries
word_corpus[:5]

['good luck find reason price band replac order band dealer sold one els town could get one skagen sent wrong one guess tri allow anyon els make band uniqu watch design seem stupid certain never buy one',
 'get dark first week never buy item buy',
 'dont know mine mistak clear state aqua im confus mine lime green hate lime green irrit peopl hate order amazon ive spent dollar latey one make want order least much money annoy think u order someth get someth els well go trash',
 'rod hold bracelet broke sever time compani fix sit drawer come see jewel tri fix one time buy realli buy headach',
 'bought watch one watch doesnot work watch run time slow minut backward outward watch look beauti doesnot show time know sell kind watch onlin wast money time bought watch']