In [None]:
import numpy as np
import pandas as pd

In [None]:
# Configure how pandas renders frames in this notebook: no cap on the number of
# columns or rows shown, and min_rows set to 0.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.min_rows', 0),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

## Read in DataFrames

In [None]:
# Load main_amazon_1page.csv into `main`.
# NOTE(review): this is an absolute, machine-specific path; it only resolves on a
# machine with the same directory layout.
main_csv_path = r'C:\Users\TRW\Documents\NYCDSA\Selenium\Web_Scraping_Project\main_amazon_1page.csv'
main = pd.read_csv(main_csv_path)
# Preview five random rows.
main.sample(n=5)

In [None]:
# Load sorted_amazon_1page.csv into `sortd`.
# NOTE(review): this is an absolute, machine-specific path; it only resolves on a
# machine with the same directory layout.
sorted_csv_path = r'C:\Users\TRW\Documents\NYCDSA\Selenium\Web_Scraping_Project\sorted_amazon_1page.csv'
sortd = pd.read_csv(sorted_csv_path)
# Preview five random rows.
sortd.sample(n=5)

## Remove Currency Symbols 

In [None]:
# Strip currency markers and thousands separators from the price strings so the
# column can later be cast to float.
# regex=False is passed explicitly: '$' is a regex metacharacter and the default
# value of `regex` differs between pandas versions, so literal matching should
# not depend on the installed release.
# Tokens are removed in the same order as before; 'S' is presumably the leftover
# of an 'S$' prefix once '$' is gone (note that it removes every capital 'S').
main_price = main['price']
for currency_token in ('$', 'CDN', '₹', 'S', 'AED', '£', ','):
    main_price = main_price.str.replace(currency_token, '', regex=False)
main['price'] = main_price
main.sample(5)

In [None]:
# Strip currency markers and thousands separators from the price strings so the
# column can later be cast to float.
# regex=False is passed explicitly: '$' is a regex metacharacter and the default
# value of `regex` differs between pandas versions, so literal matching should
# not depend on the installed release.
# Tokens are removed in the same order as before; 'S' is presumably the leftover
# of an 'S$' prefix once '$' is gone (note that it removes every capital 'S').
sortd_price = sortd['price']
for currency_token in ('$', 'CDN', '₹', 'S', 'AED', '£', ','):
    sortd_price = sortd_price.str.replace(currency_token, '', regex=False)
sortd['price'] = sortd_price
sortd.sample(5)

## Convert Price Column To Floats

In [None]:
# Cast the cleaned price strings to floats in both frames.
for frame in (main, sortd):
    frame['price'] = frame['price'].astype(float)

## Convert All Currency to US Dollar

In [None]:
# Lookup table of multipliers applied to each country's prices to express them
# in US dollars (United States has rate 1).
# NOTE(review): the rates are hard-coded; their source and date are not recorded
# anywhere in this notebook.
usd_rate_by_country = {
    'Australia': 0.70931,
    'Canada': 0.74525,
    'India': 0.01338,
    'Singapore': 0.72225,
    'United Arab Emirates': 0.27221,
    'United Kingdom': 1.27618,
    'United States': 1,
}
conv = pd.DataFrame({
    'country': list(usd_rate_by_country.keys()),
    'rate': list(usd_rate_by_country.values()),
})

In [None]:
# Show the conversion table.
conv

In [None]:
# Attach each row's conversion rate by left-joining the rate table on 'country';
# rows whose country is missing from `conv` keep a NaN rate.
main = main.merge(conv, on='country', how='left')
sortd = sortd.merge(conv, on='country', how='left')

In [None]:
# Scale every price by its row's conversion rate.
main['price'] = main['price'].mul(main['rate'])
sortd['price'] = sortd['price'].mul(sortd['rate'])

In [None]:
# The helper 'rate' column is no longer needed once prices have been scaled.
main = main.drop(columns=['rate'])
sortd = sortd.drop(columns=['rate'])
# Preview five random prices from `main`.
main['price'].sample(n=5)

In [None]:
# Preview five random prices from `sortd`.
sortd['price'].sample(5)

## Round currencies to 2nd decimal place

In [None]:
# Round prices to two decimals with the vectorised Series.round instead of
# calling a Python lambda once per element.
main['price'] = main['price'].round(2)
sortd['price'] = sortd['price'].round(2)

## Averages

In [None]:
# Per-country mean of the numeric columns, highest average rating first.
# numeric_only=True is needed on pandas >= 2.0, where averaging text columns
# (such as 'title') raises TypeError instead of silently dropping them.
main.groupby('country').mean(numeric_only=True).sort_values('rating', ascending=False)

In [None]:
# Per-country mean of the numeric columns, highest average rating first.
# numeric_only=True is needed on pandas >= 2.0, where averaging text columns
# (such as 'title') raises TypeError instead of silently dropping them.
sortd.groupby('country').mean(numeric_only=True).sort_values('rating', ascending=False)

## Counts of forms per country

In [None]:
# Non-null counts of every column per (country, form) pair, for both frames.
# A cell only renders its last bare expression, so the first table was silently
# discarded; display() shows both.
display(main.groupby(['country', 'form']).count())
display(sortd.groupby(['country', 'form']).count())

## Highest rated books/products from each country (Sort)

In [None]:
# Show `main` ordered from highest to lowest rating (returns a sorted copy;
# `main` itself is left untouched).
main.sort_values(by="rating", ascending=False)

In [None]:
# Show `sortd` ordered from highest to lowest rating (returns a sorted copy;
# `sortd` itself is left untouched).
sortd.sort_values(by="rating", ascending=False)

## Display Duplicate Books

In [None]:
# Every row of `main` whose title occurs more than once (keep=False marks all
# occurrences, not just the repeats).
main.loc[main['title'].duplicated(keep=False)]

In [None]:
# Every row of `sortd` whose title occurs more than once (keep=False marks all
# occurrences, not just the repeats).
sortd.loc[sortd['title'].duplicated(keep=False)]

## Count Occurrences of Each Duplicated Title

In [None]:
# For titles that occur more than once in `main`, count the non-null entries of
# every column per title.
main_repeated_rows = main.loc[main['title'].duplicated(keep=False)]
main_repeated_rows.groupby('title').agg(['count'])

In [None]:
# Number of non-null 'country' entries per repeated title in `main`, most
# frequent titles first.
main_is_repeated = main['title'].duplicated(keep=False)
(
    main[main_is_repeated]
    .groupby('title')['country']
    .agg(['count'])
    .sort_values('count', ascending=False)
)

In [None]:
# Number of non-null 'country' entries per repeated title in `sortd`, most
# frequent titles first.
sortd_is_repeated = sortd['title'].duplicated(keep=False)
(
    sortd[sortd_is_repeated]
    .groupby('title')['country']
    .agg(['count'])
    .sort_values('count', ascending=False)
)

# NLP

## Preprocessing

In [None]:
# Show the raw titles that feed the NLP steps.
# NOTE(review): this cell referenced `default`, which no cell in the notebook
# defines (NameError on a fresh kernel). `main` is used here on the assumption
# that it is the frame `default` referred to; confirm.
main['title']

In [None]:
# Convert all the titles to lower case, working on a copy so the source frame
# keeps its original text.
# NOTE(review): the copy was taken from `default`, which no cell in the notebook
# defines (NameError on a fresh kernel). `main` is used here on the assumption
# that it is the frame `default` referred to; confirm.
titles = main.copy()
titles['title'] = titles['title'].str.lower()
titles['title']

In [None]:
import re
# \S+ means anything that is not an empty space
#titles['title'] = titles['title'].apply(lambda x: re.sub('http\S*', '', x))
#titles['title']

In [None]:
# remove numbers: every run of digits is replaced by a single space
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence (a SyntaxWarning on Python 3.12+); the regex is unchanged.
titles['title'] = titles['title'].apply(lambda x: re.sub(r'\d+', ' ', x))
titles['title']

In [None]:
# \s+ matches any run of whitespace (spaces, \n, \r, \t); collapse each run
# into one space.
# The pattern is a raw string because '\s' in a plain literal is an invalid
# escape sequence (a SyntaxWarning on Python 3.12+); the regex is unchanged.
titles['title'] = titles['title'].apply(lambda x: re.sub(r'\s+', ' ', x))
titles['title']

## Filtering

In [None]:
# remove all punctuation: drop every character that is neither a word
# character nor whitespace
# The pattern is a raw string because '\w' and '\s' in a plain literal are
# invalid escape sequences (a SyntaxWarning on Python 3.12+); the regex is unchanged.
titles['title'] = titles['title'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
titles['title']

In [None]:
# remove unnecessary words (NLTK's English stop-word list)
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list, so `word not in stop` scans it for every word; a set gives
# the same answers with constant-time lookups.
stop_lookup = set(stop)
titles['title'] = titles['title'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
titles['title']

## Tokenization

In [None]:
from textblob import TextBlob

# Replace each title string with the word list TextBlob extracts from it.
titles['title'] = titles['title'].map(lambda text: TextBlob(text).words)
titles['title']

In [None]:
#titles

## Stemming

In [None]:
# PorterStemmer was imported twice in this cell; one import is enough.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stem every token of each title and join the stems back into a single
# space-separated string, so the column holds strings again after this cell.
titles['title'] = titles['title'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x))
titles['title']




In [None]:
# Show the whole `titles` frame after the preprocessing steps above.
titles

## N-Grams

In [None]:
# TextBlob(titles['title']).ngrams(2)

## Word Cloud

In [None]:
# titles[titles.country == "Australia"].title
# titles[titles.country == "Canada"].title
# titles[titles.country == "India"].title
# titles[titles.country == "Singapore"].title
# titles[titles.country == "United Arab Emirates"].title
# titles[titles.country == "United Kingdom"].title
# titles[titles.country == "United States"].title

In [None]:
# from wordcloud import WordCloud
# wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
# # generate word cloud
# wc.generate(' '.join(titles[titles.country == "Australia"].title))

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

In [None]:
# Render the current word cloud as an image without axes.
# NOTE(review): `plt` and `wc` are only set up in cells that are commented out
# above, so this cell raises NameError on a fresh kernel until they are restored.
cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wc, interpolation='bilinear')
cloud_ax.axis("off")
plt.show()

In [None]:
# Regenerate the word cloud from the `title` column of `default`, joined with spaces.
# NOTE(review): no active cell in this notebook defines `default` or `wc` (the
# WordCloud setup above is commented out), so this raises NameError on a fresh
# kernel. Confirm which frame was intended and restore the setup.
wc.generate(' '.join(default.title))

In [None]:
# Render the regenerated word cloud as an image without axes.
# NOTE(review): `plt` and `wc` are only set up in cells that are commented out
# above, so this cell raises NameError on a fresh kernel until they are restored.
cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wc, interpolation='bilinear')
cloud_ax.axis("off")
plt.show()

## Sentiment Analysis

In [None]:
# `titles.sample` without parentheses only echoes the bound method; call it to
# preview five random rows, as the other preview cells in this notebook do.
titles.sample(5)

In [None]:
sample_size = 100

def sentiment_func(x):
    """Return a copy of row ``x`` extended with TextBlob sentiment scores.

    Parameters
    ----------
    x : pandas.Series
        One row of the frame (the function is applied with ``axis=1``); its
        'title' entry is passed to TextBlob.

    Returns
    -------
    pandas.Series
        A copy of the row with 'polarity' and 'subjectivity' entries added.
    """
    sentiment = TextBlob(x['title'])
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    scored_row = x.copy()
    scored_row['polarity'] = sentiment.polarity
    scored_row['subjectivity'] = sentiment.subjectivity
    return scored_row

# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap the request at the frame length.
sample = titles.sample(min(sample_size, len(titles))).apply(sentiment_func, axis=1)

In [None]:
# Scatter plot of title polarity for each country in the sampled rows.
sample.plot.scatter(x='country', y='polarity')