# Setup

In [1]:
# Move the working directory one level up; presumably so that the project
# packages imported in the next cells can be resolved -- TODO confirm.
# NOTE(review): not idempotent -- re-running this cell moves up one more level.
%cd ..

/home/pdona/VariousExercises/SentimentAnalysisAmazonMerchant


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ProductReviewScraper.amazon_review_database as db


In [None]:
from SentimentAnalysis.preprocess import load_dataframe_asin, make_wordcloud, make_productlist, make_datasets


In [None]:
# NOTE(review): helper imported from SentimentAnalysis.preprocess; presumably
# (re)builds the list of products to analyse -- confirm in that module.
make_productlist()

In [None]:
# NOTE(review): helper imported from SentimentAnalysis.preprocess; presumably
# (re)builds the review datasets -- confirm in that module.
make_datasets()

### Load data from database

# Basic EDA

### First inspection of the data

We look at the data types and the number of entries.

In [None]:
# Column dtypes and non-null counts of the review table.
# NOTE(review): `df` is not assigned in any cell above -- the cell that loads
# it (see the unused `load_dataframe_asin` import) appears to be missing.
df.info()

### Ratings distribution

In [None]:
# Number of reviews per star rating, ordered by rating value
review_rating = (
    df['review_rating']
    .value_counts()
    .sort_index()
)


In [None]:
# Bar chart of the number of reviews per star rating.
ax = sns.barplot(x=review_rating.index, y=review_rating)
# Title typo fixed ("ditribution" -> "distribution"); trailing ';' hides the repr.
ax.set(title="Ratings distribution", xlabel="Rating", ylabel="Count");


In [None]:
# Word cloud built from the text of the reviews rated below 3 stars
low_rated_text = df.query("review_rating < 3")["review_content"]
wc = make_wordcloud(low_rated_text)
# Display the generated image:
ax = plt.figure(figsize=(15, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")


In [None]:
# Weekly aggregates of the reviews, bucketed on the parsed review date.
tmp_df = df.copy()
tmp_df['review_date'] = pd.to_datetime(tmp_df['review_date'])
# Build the weekly grouping once and reuse it for both aggregates.
weekly_groups = tmp_df.groupby(pd.Grouper(key='review_date', freq='W'))
# C_df: number of non-null values per column and week.
C_df = weekly_groups.count()
# R_df: weekly mean of the numeric columns. Text columns cannot be averaged;
# without numeric_only=True pandas >= 2.0 raises a TypeError here.
R_df = weekly_groups.mean(numeric_only=True)


In [None]:
# Split the weekly means into one frame per calendar year (2021 down to 2018)
weekly_years = R_df.index.year
test21, test20, test19, test18 = (
    R_df[weekly_years == year] for year in (2021, 2020, 2019, 2018)
)


In [None]:
# Overlay the weekly mean rating of every year on the same axes
for yearly_means in (test21, test20, test19, test18):
    sns.lineplot(x=yearly_means.index, y=yearly_means['review_rating'])


There is a huge imbalance between the high ratings and the rest.

### Helpful vote

We already know that there are 90K-ish entries without a helpful vote. We make a categorical feature that tells us if there is a vote or not.

In [None]:
# Binary flag: 1 when the review received at least one helpful vote, else 0
has_helpful_vote = df['review_helpful_vote'] > 0
df['review_helpful'] = has_helpful_vote.astype(int)

In [None]:
# How many reviews fall in each class of the helpful flag (0 = no vote, 1 = at least one vote)
df['review_helpful'].value_counts()


In [None]:
# Left panel: reviews with / without a helpful vote.
# Right panel: the same split, broken down by star rating.
fig, ax = plt.subplots(nrows=1,
                       ncols=2, figsize=(20, 8))
sns.countplot(data=df, x="review_helpful", ax=ax[0])
sns.countplot(data=df, x='review_helpful',
              hue='review_rating', ax=ax[1])
# Title typos fixed ("ditribution" -> "distribution").
ax[0].set(title="Helpful review distribution", xlabel="Helpful",
          ylabel="Count", xticklabels=("No", "Yes"))
# Trailing ';' keeps the list returned by set() out of the cell output.
ax[1].set(title="Helpful review distribution (divided by rating)", xlabel="Helpful",
          ylabel="Count", xticklabels=("No", "Yes"));


Among the helpful reviews, the difference between ratings is much smaller than among the non-helpful reviews.

### Review content

We perform the text analysis later. For now, we supplement the content with the length of the review.

In [None]:
# Character length of each review. The vectorised string accessor is used
# instead of .apply(len): it yields NaN for a missing review rather than
# raising "object of type 'float' has no len()".
df['review_length'] = df['review_content'].str.len()


In [None]:
fig, ax = plt.subplots(nrows=6, ncols=1, figsize=(20, 20))
# One review-length histogram per star rating, from 5 stars down to 1 ...
for panel, stars in zip(ax, (5, 4, 3, 2, 1)):
    sns.histplot(
        data=df.query(f"review_rating == {stars}"),
        x="review_length",
        ax=panel,
    )
# ... and, on the last panel, all ratings together, coloured by rating
sns.histplot(data=df, x="review_length", hue="review_rating", ax=ax[5], bins=15)


In [None]:
# Summary statistics of the review length for every star rating
length_statistics = ['mean', 'median', 'min', 'max']
df.groupby("review_rating").agg({"review_length": length_statistics})

In [None]:
# Number of reviews whose content has length zero
len(df.query("review_length == 0"))

It looks like mid-range ratings tend to come with longer, better-argued reviews, while 1- and 5-star ratings are the go-to choice for short reviews with little argumentation. The zero-length reviews are a very small percentage. The longest review is rated 5.

In [None]:
# Review length against star rating, one point per review
sns.scatterplot(data=df, x="review_rating",y="review_length")


## Polarity

Polarity is a metric that "measures" the positivity of the statement. Float between -1 and 1.

In [None]:
from textblob import TextBlob


def pol(x):
    """Return the TextBlob polarity score of the text ``x``."""
    sentiment = TextBlob(x).sentiment
    return sentiment.polarity


# depending on the size of your data, this step may take some time.
df['review_polarity'] = df['review_content'].apply(pol)


In [None]:
# Distribution of the polarity score over all reviews (30 bins)
sns.histplot(data=df, x="review_polarity",bins=30)


There is an excess of 0 polarity reviews, the average is positive.

In [None]:
# Polarity score distribution for each star rating
sns.boxenplot(data=df, x="review_rating", y="review_polarity")


It looks like there is a correlation between polarity of the review and the rating. However, there are many outliers. Some 5 rated reviews have very negative polarity and some 1 rated reviews have very positive polarity. 

Let's examine these outliers

In [None]:
# One-star reviews that were scored with the maximum polarity (+1).
one_star_positive = df.query("review_rating == 1 and review_polarity == 1")
# Show at most 10 of them: DataFrame.sample raises ValueError when n exceeds
# the number of rows, and a fixed random_state keeps the displayed examples
# (and the commentary written about them) stable across re-runs.
one_star_positive.sample(
    n=min(10, len(one_star_positive)), random_state=0)['review_content'].tolist()

The majority of the positivity here is because of a comparison to another product (or the same product but `before`) that was positively rated.

In [None]:
# First five 5-star reviews that were scored with the minimum polarity (-1)
five_star_negative = df.query("review_rating == 5 and review_polarity == -1")
five_star_negative['review_content'].head().tolist()

These are very few. There is one case of a possible mislabel, one positive review written in negative jargon, and 3 comparisons.

## Subjectivity

Subjectivity is used for individual sentences to determine whether a sentence expresses an opinion or not. It is a float between 0 and 1. When it is close to 0, the sentence is more about facts. As subjectivity increases, the sentence comes closer to being an opinion.

In [None]:
def sub(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    sentiment = TextBlob(x).sentiment
    return sentiment.subjectivity


# depending on the size of your data, this step may take some time.
df['review_subjectivity'] = df['review_content'].apply(sub)


In [None]:
# Distribution of the subjectivity score over all reviews (30 bins)
sns.histplot(data=df, x="review_subjectivity", bins=30)


In [None]:
# Subjectivity score distribution for each star rating
sns.boxenplot(data=df, x="review_rating", y="review_subjectivity")

It seems that subjectivity is not that informative since the variance looks very large.

In [None]:
# Monthly number of non-null values per column, bucketed on the parsed review date
tmp_df = df.assign(review_date=pd.to_datetime(df['review_date']))
monthly_buckets = pd.Grouper(key='review_date', freq='M')
test_df = tmp_df.groupby(monthly_buckets).count()

In [None]:
# Running total of the monthly review counts (non-null review_content entries)
test_df.review_content.cumsum()


In [None]:
# test_df holds monthly counts, so the y values here are the number of
# ratings recorded per month, not the rating itself.
sns.lineplot(data=test_df,x="review_date",y="review_rating")

In [None]:
# Number of reviews per product on a log scale, most-reviewed products first
plt.figure(figsize=(15, 8))
ax = sns.countplot(
    data=df,
    x="product_asin",
    order=df["product_asin"].value_counts().index,
    color=sns.color_palette()[0],
)
ax.set_yscale("log")
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);


In [None]:
# Rating spread per product, products ordered by their number of reviews
plt.figure(figsize=(15, 8))
ax = sns.boxplot(
    data=df,
    x="product_asin",
    y="review_rating",
    order=df["product_asin"].value_counts().index,
    color=sns.color_palette()[0],
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);


In [None]:
from nltk.corpus import stopwords
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
# Fetch the NLTK resources this notebook relies on: besides the VADER lexicon
# that was already requested, `word_tokenize` needs the 'punkt' tokenizer
# models and `stopwords.words('english')` needs the 'stopwords' corpus.
# Without them a fresh environment fails with a LookupError further down.
# NOTE(review): recent NLTK releases ship the tokenizer data as 'punkt_tab';
# adjust the name if the installed version asks for it.
for nltk_resource in ('vader_lexicon', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Total number of tokens over all raw (uncleaned) reviews
tokenized_reviews = df["review_content"].apply(word_tokenize)
raw_tokens = sum(len(tokens) for tokens in tokenized_reviews)
print('Number of raw tokens: {}'.format(raw_tokens))


In [None]:
# Lookup table from an English contraction to its expanded form, used by
# expand_contractions(). Keys are lowercase, except for the "I..." forms that
# are listed in both capitalisations.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    # Fixed: the expansion used to read "he he will have" (duplicated word).
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
# Define function to expand contractions
import re
def expand_contractions(text, contraction_map=None):
    """Replace English contractions in ``text`` with their expanded form.

    Parameters
    ----------
    text : str
        The text to process.
    contraction_map : dict, optional
        Mapping ``contraction -> expansion``. When omitted, the notebook-level
        ``CONTRACTION_MAP`` is used.

    Returns
    -------
    str
        ``text`` with every known contraction expanded (matched
        case-insensitively, as whole words only) and every remaining
        apostrophe removed.
    """
    if contraction_map is None:
        contraction_map = CONTRACTION_MAP

    # Lowercased view of the map for the case-insensitive fallback lookup.
    lowered_map = {key.lower(): value for key, value in contraction_map.items()}
    # Longest alternatives first: otherwise "can't" wins over "can't've" and
    # the trailing "'ve" is left behind unexpanded.
    alternatives = sorted(lowered_map, key=len, reverse=True)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_map.get(match) \
            or lowered_map.get(match.lower(), match)
        # Only carry over the capitalisation of the match. Splicing the first
        # matched character onto the expansion (the previous behaviour) turned
        # "ain't" into "as not" and "'cause" into "'ecause".
        if match[0].isupper():
            expanded_contraction = (expanded_contraction[:1].upper()
                                    + expanded_contraction[1:])
        return expanded_contraction

    expanded_text = text
    if alternatives:  # an empty alternation would match the empty string everywhere
        # The look-arounds restrict matches to whole words, so that e.g. the
        # possessive "kit's" is not rewritten to "kit is" through "it's".
        contractions_pattern = re.compile(
            r"(?<!\w)(" + "|".join(map(re.escape, alternatives)) + r")(?!\w)",
            flags=re.IGNORECASE | re.DOTALL)
        expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop whatever apostrophes are left (possessives, quotes, ...).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


In [None]:
def _normalization_stopwords():
    """Return the stop words removed by ``normalize_text`` (built once, then cached).

    The set is NLTK's English stop-word list plus a few extra function words,
    minus the negations 'no' and 'not', which are kept in the text.
    """
    cached = getattr(_normalization_stopwords, "_cache", None)
    if cached is None:
        extra_words = ['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from']
        cached = frozenset(stopwords.words('english') + extra_words) - {'no', 'not'}
        _normalization_stopwords._cache = cached
    return cached


def normalize_text(text: str) -> str:
    """Lowercase and clean a piece of review text.

    Steps: lowercase, expand contractions, strip non-ASCII characters,
    replace everything that is not a letter or whitespace with a space,
    tokenize, and drop stop words (keeping 'no' and 'not').

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tmp_text = text.lower()
    # Map the typographic apostrophe to the ASCII one first; otherwise
    # "don’t" skips the contraction expansion and collapses to "dont" below.
    tmp_text = tmp_text.replace('\u2019', "'")
    tmp_text = expand_contractions(tmp_text)
    tmp_text = unicodedata.normalize('NFKD', tmp_text).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')
    # [A-Z], not [A-z]: the latter range also covers [ \ ] ^ _ and the
    # backtick, which therefore survived the clean-up.
    tmp_text = re.sub(r'[^a-zA-Z\s]', ' ', tmp_text)

    words = nltk.word_tokenize(tmp_text)

    # Set membership is O(1); the list used before was rebuilt and scanned
    # linearly for every token of every review.
    stopword_set = _normalization_stopwords()
    # NOTE: lemmatisation (WordNetLemmatizer) was disabled in the original
    # code and is still not applied.
    return ' '.join(w for w in words if w not in stopword_set)


In [None]:
# Normalized version of every review body
df['review_content_clean'] = df['review_content'].map(normalize_text)


In [None]:
# Normalized version of every review title
df['review_title_clean'] = df['review_title'].map(normalize_text)

In [None]:
from wordcloud import WordCloud

# Cleaned text of all reviews with polarity above 0.05, merged into one string
positivewords = df.query('review_polarity > 0.05')['review_content_clean'].tolist()
positivetext = " ".join(positivewords)
# Create the wordcloud object
cloud_builder = WordCloud(width=800, height=600)
wordcloud = cloud_builder.generate(positivetext)

# Display the generated image:
ax = plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")


In [None]:
# Look at one randomly drawn product identifier (unseeded: differs on every run)
df["product_asin"].sample(1)

The majority of the words refer to what the product is. Maybe we can find an improvement if we subtract the product description and brand?

In [None]:
def subtract_product_name(content: str, name: str) -> str:
    """Remove from ``content`` every token that also occurs in the product name.

    The product name is passed through ``normalize_text`` (which lowercases
    it) before tokenizing, while ``content`` is tokenized as given and the
    comparison is exact. Pass already-normalized content, otherwise
    capitalised occurrences of the name are not removed.

    Parameters
    ----------
    content : str
        Text to filter.
    name : str
        Raw product name.

    Returns
    -------
    str
        The surviving tokens of ``content`` joined by single spaces.
    """
    # A set gives O(1) membership tests; the list used before was scanned
    # once per content token.
    name_tokens = set(nltk.word_tokenize(normalize_text(name)))
    content_tokens = nltk.word_tokenize(content)
    return " ".join(word for word in content_tokens if word not in name_tokens)

In [None]:
# Start from the cleaned review text: subtract_product_name compares against
# the normalized (lowercased) product-name tokens, so feeding it the raw text
# left capitalised brand/product words in place and stored un-normalized text
# in a column named "..._clean_...".
df["review_content_clean_wotitle"] = df.apply(lambda d: subtract_product_name(
    d["review_content_clean"], d["product_name"]), axis="columns")


In [None]:
# Start from the cleaned review title: subtract_product_name compares against
# the normalized (lowercased) product-name tokens, so feeding it the raw title
# left capitalised brand/product words in place and stored un-normalized text
# in a column named "..._clean_...".
df["review_title_clean_wotitle"] = df.apply(lambda d: subtract_product_name(
    d["review_title_clean"], d["product_name"]), axis="columns")


In [None]:
# Quick look at the first rows of the review text with the product-name words removed
df["review_content_clean_wotitle"].head()

In [None]:
# Same positive-polarity word cloud as before, now without the product-name words
positive_reviews = df.query('review_polarity > 0.05')
positivewords = list(positive_reviews['review_content_clean_wotitle'])
positivetext = " ".join(positivewords)
# Create the wordcloud object
cloud_builder = WordCloud(width=800, height=600)
wordcloud = cloud_builder.generate(positivetext)

# Display the generated image:
ax = plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")


In [None]:
# Word cloud of the reviews whose polarity is below 0.05, product-name words removed.
# NOTE(review): the variables are still called `positive...` (copy-paste from the
# previous cell) although they now hold the non-positive reviews.
# NOTE(review): `< 0.05` also keeps neutral reviews; if only negative reviews
# are wanted the threshold is presumably meant to be `< -0.05` -- confirm.
positivewords = list(df.query('review_polarity < 0.05')
                     ['review_content_clean_wotitle'])
positivetext = " ".join(positivewords)
# Create the wordcloud object
wordcloud = WordCloud(width=800, height=600).generate(positivetext)

# Display the generated image:
ax = plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")


In [None]:
# Persist the enriched review table (all columns added in this notebook).
# The path is relative to the current working directory; the DataFrame index
# is written as the first, unnamed CSV column (to_csv default).
df.to_csv('SentimentAnalysis/data/AnkerAmazonReviews.csv')