# Sentiment Analysis - CP322
## Riley Huston (190954880) | Samson Goodenough (190723380) | Shailendra Singh ()

In [None]:
# import libraries
import nltk
import pandas as pd
import sklearn
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# visualization
import matplotlib.pyplot as plt
from nltk.probability import FreqDist

# Download the NLTK resources used by this notebook.
# Each resource is requested exactly once (the original list asked for
# 'wordnet' twice).
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'treebank',
    'tagsets',
    'vader_lexicon',
):
    nltk.download(_resource)

print('Finished downloading')

In [None]:
# Load the raw review data set.
df = pd.read_csv('reviews.csv')

# Keep only reviews that received at least one positive-feedback vote; this is
# used as a cheap filter against spam and unhelpful reviews.
df = df.loc[df['Positive Feedback Count'] > 0]

# Review-text view of the data: drop the columns that are not needed here and
# then every row that still contains a null value.
reviews = df.drop(columns=['Clothing ID', 'Title']).dropna()
reviews.head()
reviews.shape

In [None]:
# Title view of the data: drop the columns that are not needed here and then
# every row that still contains a null value.
titles = df.drop(columns=['Clothing ID', 'Review Text']).dropna()
titles.head()

In [None]:
# Tokenize every review into a list of word tokens.
corpusReview = [word_tokenize(review) for review in reviews['Review Text']]

# English stop-word set and Porter stemmer (both are reused by later cells).
stop_words = set(stopwords.words("english"))
stem = PorterStemmer()

# Matches tokens that consist solely of punctuation, underscores and digits.
# Compiled once instead of being looked up for every token.
noise_token = re.compile(r'^[_\W0-9]+$')

# filteredCorpusReview:  one space-joined string of kept stems per review.
# flattenedCorpusReview: every kept stem across all reviews, in order.
filteredCorpusReview = []
flattenedCorpusReview = []
for tokens in corpusReview:
    stems = [
        stem.stem(token)  # stem each kept token once and reuse the result
        for token in tokens
        # The stop-word list is lowercase, so the membership test must be
        # case-insensitive; otherwise capitalised stop words ("The", "And")
        # slip through the filter.
        if token.lower() not in stop_words and not noise_token.match(token)
    ]
    filteredCorpusReview.append(' '.join(stems))
    flattenedCorpusReview.extend(stems)
    

In [None]:
# Tokenize every title into a list of word tokens.
corpusTitle = [word_tokenize(title) for title in titles['Title']]

# Matches tokens that consist solely of punctuation, underscores and digits.
# Compiled once instead of being looked up for every token.
title_noise_token = re.compile(r'^[_\W0-9]+$')

# filteredCorpusTitle:  one space-joined string of kept stems per title.
# flattenedCorpusTitle: every kept stem across all titles, in order.
filteredCorpusTitle = []
flattenedCorpusTitle = []
for tokens in corpusTitle:
    stems = [
        stem.stem(token)  # stem each kept token once and reuse the result
        for token in tokens
        # The stop-word list is lowercase, so the membership test must be
        # case-insensitive; otherwise capitalised stop words ("The", "And")
        # slip through the filter.
        if token.lower() not in stop_words and not title_noise_token.match(token)
    ]
    filteredCorpusTitle.append(' '.join(stems))
    flattenedCorpusTitle.extend(stems)
    

### Visualization

In [None]:
# Frequency distribution over every kept review stem.
fdistReview = FreqDist(flattenedCorpusReview)
print(fdistReview)

# Non-cumulative plot of the 30 most frequent review stems.
fdistReview.plot(30, cumulative=False)
plt.show()

In [None]:
# Frequency distribution over every kept title stem.
fdistTitle = FreqDist(flattenedCorpusTitle)
print(fdistTitle)

# Non-cumulative plot of the 30 most frequent title stems.
fdistTitle.plot(30, cumulative=False)
plt.show()

# Number of titles that went through preprocessing.
print(len(filteredCorpusTitle))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features for the reviews: fit a lowercasing, unigram-only
# CountVectorizer on the cleaned review strings and materialise the
# document-term counts as a dense array.
cvr = CountVectorizer(ngram_range=(1, 1), lowercase=True)
trainingReview = cvr.fit_transform(filteredCorpusReview).toarray()


In [None]:
# Bag-of-words features for the titles: fit a lowercasing, unigram-only
# CountVectorizer on the cleaned title strings and materialise the
# document-term counts as a dense array.
cvt = CountVectorizer(ngram_range=(1, 1), lowercase=True)
trainingTitle = cvt.fit_transform(filteredCorpusTitle).toarray()

In [None]:
# Show the review bag-of-words matrix as a table whose columns are labelled
# with the vocabulary terms learned by the review vectoriser.
dfr = pd.DataFrame(trainingReview, columns=cvr.get_feature_names_out())
dfr

In [None]:
# Show the title bag-of-words matrix as a table whose columns are labelled
# with the vocabulary terms learned by the title vectoriser.
dft = pd.DataFrame(trainingTitle, columns=cvt.get_feature_names_out())
dft

### Trial Run With User Generated Ratings

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the bag-of-words rows (with their ratings) for evaluation.
# The seed is fixed so that the split, and therefore the reported accuracy,
# is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    trainingReview, reviews['Rating'], test_size=0.3, random_state=42)

In [None]:
# Multinomial Naive Bayes classifier and the metrics module used to score it.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Fit on the training split, predict ratings for the held-out split and
# report the fraction predicted correctly.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned review strings, using the vectoriser's
# default settings, materialised as a dense array.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(filteredCorpusReview).toarray()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the TF-IDF rows (with their ratings) for evaluation.
# The seed is fixed so that the split, and therefore the reported accuracy,
# is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, reviews['Rating'], test_size=0.3, random_state=42)

In [None]:
# Train Multinomial Naive Bayes on the current training split, predict the
# held-out ratings and report the fraction predicted correctly.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

In [None]:
# Linear rescaling helper, intended for mapping sentiment values onto the
# rating scale.
# NOTE(review): this name shadows the builtin `map` for the rest of the file.
def map(value, minFrom, maxFrom, minTo, maxTo):
    """Linearly rescale *value* from [minFrom, maxFrom] to [minTo, maxTo].

    ``minFrom`` maps to ``minTo`` and ``maxFrom`` maps to ``maxTo``; values
    outside the source range are extrapolated rather than clamped. The
    division by ``maxFrom - minFrom`` is unguarded, so the source range must
    not be empty.
    """
    source_span = maxFrom - minFrom
    target_span = maxTo - minTo
    scaled_offset = (value - minFrom) * target_span
    return scaled_offset / source_span + minTo

In [None]:
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# VADER sentiment scorer (also used by the following cell).
sid = SentimentIntensityAnalyzer()

# Print each cleaned (stop-word-filtered, stemmed) review followed by one line
# holding its polarity scores as "key: value, " pairs in sorted key order.
for review in filteredCorpusReview:
    print(review)
    ss = sid.polarity_scores(review)
    print(''.join(f'{k}: {ss[k]}, ' for k in sorted(ss)))


In [None]:
# Print each original, unprocessed review text followed by one line holding
# its polarity scores as "key: value, " pairs in sorted key order.
for review in reviews["Review Text"]:
    print(review)
    ss = sid.polarity_scores(review)
    score_line = ''.join(f'{k}: {ss[k]}, ' for k in sorted(ss))
    print(score_line)

In [None]:
rpusus