# 🛍️ e-Commerce: NLP and Sentiment Analysis Part 1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the reviews CSV, then keep every column except the first one
# (presumably a saved row index -- confirm against the raw file).
REVIEWS_CSV = '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
raw_reviews = pd.read_csv(REVIEWS_CSV)
dataset = raw_reviews.iloc[:, 1:]
dataset.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to spot missing values.
dataset.info()

In [None]:
def plot_value_counts(column, title, figsize=None, rotate_ticks=False, index_as_int=False):
    """Draw a bar chart of ``dataset[column].value_counts()`` with a count label on each bar.

    Parameters
    ----------
    column : str
        Name of the column in ``dataset`` whose values are counted.
    title : str
        Title shown above the chart.
    figsize : tuple, optional
        When given, a new figure of this size is created before plotting.
    rotate_ticks : bool, default False
        Rotate the x tick labels by 90 degrees (useful for many categories).
    index_as_int : bool, default False
        Cast the category values to int before using them as x positions.
    """
    # Count once and reuse the result for both the bars and their labels.
    counts = dataset[column].value_counts()
    positions = counts.index.astype(int) if index_as_int else counts.index

    if figsize is not None:
        plt.figure(figsize=figsize)

    bars = plt.bar(positions, counts.values)

    # Label every bar from its own rectangle so the number always sits on the
    # bar it belongs to, whatever order value_counts() returned the categories in.
    for bar, count in zip(bars, counts.values):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
                 ha='center', va='bottom')

    plt.title(title)
    if rotate_ticks:
        plt.xticks(rotation=90)
    plt.show()


plot_value_counts('Rating', 'rating')
plot_value_counts('Recommended IND', 'Recommended IND', index_as_int=True)
plot_value_counts('Division Name', 'Division Name')
plot_value_counts('Department Name', 'Department Name')
plot_value_counts('Class Name', 'Class Name', figsize=(16, 4), rotate_ticks=True)

# 1. Regular Expression

In [None]:
import re

In [None]:
# re.match only succeeds when the pattern matches at the very start of the string,
# so any pattern that cannot match the leading 'T' prints None.
# Patterns are raw strings so that backslashes reach the regex engine unchanged.
text = 'The monkeys are eating 7 bananas on the tree!'
print(re.match(r'\w+', text))  # run of word characters
print(re.match(r'\d+', text))  # run of digits (None: the text does not start with a digit)
print(re.match(r'\s', text))  # one whitespace character (None: the text does not start with a space)
print(re.match(r'[a-z]+', text))  # run of lowercase letters (None: the text starts with an uppercase 'T')
print(re.match(r'[A-Z]+', text))  # run of uppercase letters
print(re.match(r'(\w+|\d+)', text))  # word or digit run

In [None]:
# re.split cuts the string at every match of the pattern and returns the pieces in between.
# Patterns are raw strings so that backslashes reach the regex engine unchanged.
print(re.split(r'\w+', text))  # split on runs of word characters
print(re.split(r'\d+', text))  # split on runs of digits
print(re.split(r'\s', text))  # split on each whitespace character
print(re.split(r'[a-z]+', text))  # split on runs of lowercase letters
print(re.split(r'[A-Z]+', text))  # split on runs of uppercase letters
print(re.split(r'(\w+|\d+)', text))  # word or digit; the capturing group keeps the separators in the result

In [None]:
# re.findall returns every non-overlapping match of the pattern as a list of strings.
# Patterns are raw strings so that backslashes reach the regex engine unchanged.
print(re.findall(r'\w+', text))  # runs of word characters
print(re.findall(r'\d+', text))  # runs of digits
print(re.findall(r'\s', text))  # single whitespace characters
print(re.findall(r'[a-z]+', text))  # runs of lowercase letters
print(re.findall(r'[a-z]', text))  # single lowercase letters
print(re.findall(r'[A-Z]+', text))  # runs of uppercase letters
print(re.findall(r'(\w+|\d+)', text))  # word or digit runs

In [None]:
# re.search scans the whole string and returns the first match found anywhere in it.
# Patterns are raw strings so that backslashes reach the regex engine unchanged.
print(re.search(r'\w+', text))  # first run of word characters
print(re.search(r'\d+', text))  # first run of digits
print(re.search(r'\s', text))  # first whitespace character
print(re.search(r'[a-z]+', text))  # first run of lowercase letters
print(re.search(r'[a-z]', text))  # first single lowercase letter
print(re.search(r'[A-Z]+', text))  # first run of uppercase letters
print(re.search(r'(\w+|\d+)', text))  # first word or digit run

# 2. Word Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Split the sample sentence into word-level tokens with NLTK.
word_tokenize(text)

In [None]:
# A three-sentence sample, split into individual sentences with NLTK.
text2 = 'The monkeys are eating 7 bananas on the tree! The tree will only have 5 bananas left later. One monkey is jumping to another tree.'
sent_tokenize(text2)

# 3. Part-of-Speech (PoS) Tagging and Named Entity Recognition (NER)

In [None]:
# Tokenize text2 into words, then tag each token with its part of speech.
text_tag = word_tokenize(text2)
nltk.pos_tag(text_tag)

CC coordinating conjunction; CD cardinal number; DT determiner; EX existential e.g. "there is"; FW foreign word; IN preposition/subordinating conjunction; JJ adjective; JJR comparative adjective; JJS superlative adjective; LS list marker; MD modal; NN singular noun; NNS plural noun; NNP singular proper noun; NNPS plural proper noun; PDT predeterminer; POS possessive ending; PRP personal pronoun; PRP\$ possessive pronoun; RB adverb; RBR comparative adverb; RBS superlative adverb; RP particle; TO to; UH interjection; VB base form verb; VBD past tense verb; VBG gerund/present participle verb; VBN past participle verb; VBP verb for non-3rd person singular present; VBZ verb for 3rd person singular present; WDT wh-determiner; WP wh-pronoun; WP\$ possessive wh-pronoun; WRB wh-adverb

In [None]:
import spacy

In [None]:
# Run the small English spaCy pipeline over the sample sentence.
spa = spacy.load("en_core_web_sm")
spa_text = spa(text)

# Header row: one tab between every column name.
column_names = ['text', 'lemmatized', 'PoS', 'tag', 'dep', 'shape', 'is_alphabet', 'is_stop_words']
print('\t'.join(column_names))

# One row per token; the doubled tabs after the lemma and after is_alpha are kept as in the header layout.
for token in spa_text:
    print(f'{token.text}\t{token.lemma_}\t\t{token.pos_}\t{token.tag_}\t{token.dep_}'
          f'\t{token.shape_}\t{token.is_alpha}\t\t{token.is_stop}')

# 4. Word Cloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Build a word cloud from text2, excluding wordcloud's built-in stop words, and display it.
cloud_stopwords = set(STOPWORDS)
word_clo = WordCloud(stopwords=cloud_stopwords).generate(text2)
plt.imshow(word_clo, interpolation='bilinear')
plt.show()

# 5. Stemming and Lemmatization

In [None]:
# Stemming: reduce the word with NLTK's Porter stemmer.
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('studies'))

# Lemmatization: look the same word up with the WordNet lemmatizer for comparison.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('studies'))

In [None]:
# Lemmatize a sentence
# Build the lemmatizer once and reuse it for every token instead of
# constructing a new WordNetLemmatizer per word.
sentence_lemmatizer = WordNetLemmatizer()
lemm_text = [sentence_lemmatizer.lemmatize(token) for token in word_tokenize(text)]
lemm_text

# 6. Bag-of-words (BoW)

In [None]:
from collections import Counter

In [None]:
# Bag of words for text2: how many times each token occurs.
Counter(word_tokenize(text2))

In [None]:
# The three most frequent tokens in text2.
Counter(word_tokenize(text2)).most_common(3)

In [None]:
from nltk.corpus import stopwords

In [None]:
# Load the English stop-word list once, as a set, so each membership test is a
# constant-time lookup instead of re-reading and scanning the list for every token.
english_stopwords = set(stopwords.words('english'))

# Keep lowercase alphabetic tokens that are not stop words, then count them.
text2_stop = [word for word in word_tokenize(text2.lower())
              if word.isalpha() and word not in english_stopwords]
Counter(text2_stop)

Analyze the dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

In [None]:
# Keep only the rows that have review text.
# .copy() makes `data` an independent frame rather than a slice of `dataset`,
# so columns can be added to it later without a SettingWithCopyWarning.
data = dataset.loc[dataset['Review Text'].notnull(), :].copy()

# Apply uni- and bigram vectorizer
class lemmatizer(object):
    """Callable that word-tokenizes its input and returns the WordNet lemma of each token."""

    def __init__(self):
        # A single WordNetLemmatizer is created here and reused by every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, df):
        """Return the list of lemmas for the tokens of ``df``.

        ``df`` is passed straight to ``word_tokenize``; despite the name it is
        presumably a single document string, not a DataFrame -- confirm against the caller.
        """
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(df)))

# Count unigrams and bigrams of the lemmatized review text.
# token_pattern is not used when a custom tokenizer is supplied; passing None
# states that explicitly and avoids scikit-learn's unused-parameter warning.
vectorizer = CountVectorizer(max_features=100, max_df=0.95, min_df=0.1, ngram_range=(1,2),
                             tokenizer=lemmatizer(), lowercase=True, stop_words='english',
                             token_pattern=None)

# fit_transform learns the vocabulary and builds the count matrix in a single call.
count_vector = vectorizer.fit_transform(data['Review Text'])
count_vector

In [None]:
# Transform into data frame
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
bow_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
bow = pd.DataFrame(count_vector.toarray(), columns=bow_feature_names())
bow.head()

In [None]:
# Join every review into one string and render a word cloud of it.
all_reviews = ' '.join(data['Review Text'])
visual_rev = WordCloud().generate(all_reviews)
plt.figure(figsize=(8,8))
plt.imshow(visual_rev, interpolation='bilinear')
plt.show()

# 7. Term Frequency — Inverse Document Frequency (Tf-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Tf-idf over the review text, limited to 100 features.
tfidf = TfidfVectorizer(max_features=100)

# fit_transform learns the vocabulary/idf weights and builds the matrix in a single call.
tfidf_data = tfidf.fit_transform(data['Review Text'])
tfidf_data

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
tfidf_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# Note: this rebinds tfidf_data from the sparse matrix to a dense DataFrame.
tfidf_data = pd.DataFrame(tfidf_data.toarray(), columns=tfidf_feature_names())
tfidf_data.head()

# 8. Sentiment Analysis

In [None]:
from textblob import TextBlob

In [None]:
# TextBlob sentiment of the sample text; the result has a polarity and a subjectivity component.
TextBlob(text2).sentiment

In [None]:
# Applying text blob sentiment
def polarity(t):
    """Return the first component (polarity) of TextBlob's sentiment for the text ``t``."""
    polarity_score, _ = TextBlob(t).sentiment
    return polarity_score

def subjectivity(t):
    """Return the second component (subjectivity) of TextBlob's sentiment for the text ``t``."""
    _, subjectivity_score = TextBlob(t).sentiment
    return subjectivity_score

# Score each review directly from the text column instead of applying a lambda
# row by row over the whole frame.
review_text = data['Review Text']

# assign() returns a new frame, so the columns are never written onto a slice of
# `dataset` (no SettingWithCopyWarning) and re-running the cell gives the same result.
data = data.assign(polarity=review_text.map(polarity),
                   subjectivity=review_text.map(subjectivity))
data.head()

In [None]:
# Distribution of review polarity for each star rating.
polarity_ax = sns.boxplot(x='Rating', y='polarity', data=data)
plt.show()