1. Read and analyze the input text data and the corresponding response variables (ratings).

2. Perform basic pre-processing to prepare the data for modeling.

3. Learn and apply various ways of featurizing the reviews text.

4. Build machine learning models to classify text as either exhibiting positive or negative sentiment (1 or 0).

In [10]:
import numpy as np
import pandas as pd
import re

In [2]:
class Sentiment:
    Negative = "Negative"
    Positive = "Positive"
    Neutral = "Neutral"
    
    
class Review:
    
    def __init__(self, text, rating):
        self.text = text
        self.rating = rating
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.rating > 3:
            return Sentiment.Positive
        if self.rating <= 2:
            return Sentiment.Negative
        else:
            return Sentiment.Neutral

In [3]:
import json

file_name = 'D:/Git_Projects/SentimentAnalysis/Datasets/Books_small_10000.json'

reviews = []

with open(file_name) as f:
    for line in f: 
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

In [4]:
len(reviews)

10000

In [5]:
print(reviews[0].text.lower())
print(reviews[0].sentiment)


i bought both boxed sets, books 1-5.  really a great series!  start book 1 three weeks ago and just finished book 5.  sloane monroe is a great character and being able to follow her through both private life and her pi life gets a reader very involved!  although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  these are books you won't be disappointed with.
Positive


In [8]:
df_reviews = pd.DataFrame({'Review': [reviews[i].text for i in range(len(reviews))],
              'Rating': [reviews[i].rating for i in range(len(reviews))],
                          'Sentiment': [reviews[i].sentiment for i in range(len(reviews))]})

In [11]:
df_reviews['Review'] = df_reviews['Review'].apply(lambda x: x.lower())
non_alphanumeric = re.compile('[\W]+')
df_reviews['Review'] = df_reviews['Review'].apply(lambda x: non_alphanumeric.sub(' ', x))

In [12]:
df_reviews['Rating'].value_counts()

5.0    5930
4.0    2448
3.0     978
2.0     375
1.0     269
Name: Rating, dtype: int64

In [14]:
df_reviews['Sentiment'].value_counts()

Positive    8378
Neutral      978
Negative     644
Name: Sentiment, dtype: int64

# Pre-Processing

1. Converting words to lower case.
2. Removing special characters.
3. Removing stopwards and high/low-frequency words.
4. Stemming/lemmatization.

In [15]:
#import re
#reexp1 = re.compile('[\W]+')
#reexp1.sub(' ', reviews[0].text)

In [16]:
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.corpus import wordnet

def get_preprocessed_review(review):
    
    non_alphanumeric = re.compile('[\W]+')
    
    lower = review.lower()
    alphanumeric_review = non_alphanumeric.sub(' ', lower)

    words_without_stopwords = []
    words = word_tokenize(alphanumeric_review)
    for word in words:
        if word not in stopwords.words('english'):
            words_without_stopwords.append(word)
        else:
            pass

        
    return words_without_stopwords

In [17]:
df2_reviews = pd.DataFrame({'Review': [reviews[i].text for i in range(len(reviews))],
              'Rating': [reviews[i].rating for i in range(len(reviews))],
                          'Sentiment': [reviews[i].sentiment for i in range(len(reviews))]})

In [18]:
df2_reviews['Sentiment'].value_counts()

Positive    8378
Neutral      978
Negative     644
Name: Sentiment, dtype: int64

In [19]:
#pd.DataFrame([df2_reviews.loc[0, :], df2_reviews.loc[1, :]])

In [20]:
def get_balanced_reviews(reviews):
    
    positive_counter = 0
    negative_counter = 0
    neutral_counter = 0
    list_balanced_reviews = []

    for i in range(len(df2_reviews['Sentiment'])):
        if df2_reviews.loc[i, 'Sentiment'] == 'Positive' and positive_counter < 700:
            list_balanced_reviews.append(df2_reviews.loc[i, :])
            positive_counter += 1

        elif df2_reviews.loc[i, 'Sentiment'] == 'Negative' and negative_counter < 700:
            list_balanced_reviews.append(df2_reviews.loc[i, :])
            negative_counter += 1

        elif df2_reviews.loc[i, 'Sentiment'] == 'Neutral' and neutral_counter < 700:
            list_balanced_reviews.append(df2_reviews.loc[i, :])
            neutral_counter += 1

        else:
            pass

    return pd.DataFrame(list_balanced_reviews)

In [21]:
df_balanced_reviews = get_balanced_reviews(reviews)

In [22]:
df_balanced_reviews.head()

Unnamed: 0,Review,Rating,Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive
1,I enjoyed this short book. But it was way way ...,3.0,Neutral
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive
3,I really enjoyed this adventure and look forwa...,4.0,Positive
4,It was a decent read.. typical story line. Not...,3.0,Neutral


In [23]:
df_balanced_reviews['Review_words'] = df_balanced_reviews['Review'].apply(get_preprocessed_review)

In [25]:
df_balanced_reviews.head()

Unnamed: 0,Review,Rating,Sentiment,Review_words
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive,"[bought, boxed, sets, books, 1, 5, really, gre..."
1,I enjoyed this short book. But it was way way ...,3.0,Neutral,"[enjoyed, short, book, way, way, short, see, e..."
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive,"[love, nicholas, sparks, 8217, read, everythin..."
3,I really enjoyed this adventure and look forwa...,4.0,Positive,"[really, enjoyed, adventure, look, forward, re..."
4,It was a decent read.. typical story line. Not...,3.0,Neutral,"[decent, read, typical, story, line, nothing, ..."


In [143]:
wordlistt = []
for review_words in df_balanced_reviews['Review_words']:
    #wordlistt = wordlistt.append(for i in range)
    for wrd in review_words:
        wordlistt.append(wrd)
        
print('Total words without using Stemmatization:- ',len(set(wordlistt)))


stemmed_words = []

for wrd in wordlistt:
    stemmed_words.append(lancaster.stem(wrd))

print('Total words using Stemmetization:- ', len(set(stemmed_words)))

Total words without using Stemmatization:-  14516
Total words using Stemmetization:-  8684


NameError: name 'lancaster' is not defined

'bought'

In [29]:
lancaster = LancasterStemmer()

def do_stem(review):
    stemmed_words = []
    for word in review:
        stemmed_words.append(lancaster.stem(word))
    return stemmed_words

In [30]:
df_balanced_reviews['Stemmed_Words'] = df_balanced_reviews['Review_words'].apply(do_stem)

In [31]:
df_balanced_reviews.head()

Unnamed: 0,Review,Rating,Sentiment,Review_words,Stemmed_Words
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive,"[bought, boxed, sets, books, 1, 5, really, gre...","[bought, box, set, book, 1, 5, real, gre, sery..."
1,I enjoyed this short book. But it was way way ...,3.0,Neutral,"[enjoyed, short, book, way, way, short, see, e...","[enjoy, short, book, way, way, short, see, eas..."
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive,"[love, nicholas, sparks, 8217, read, everythin...","[lov, nichola, spark, 8217, read, everyth, 821..."
3,I really enjoyed this adventure and look forwa...,4.0,Positive,"[really, enjoyed, adventure, look, forward, re...","[real, enjoy, adv, look, forward, read, robert..."
4,It was a decent read.. typical story line. Not...,3.0,Neutral,"[decent, read, typical, story, line, nothing, ...","[dec, read, typ, story, lin, noth, unsav, many..."


In [39]:
df_balanced_reviews['Preprocessed_Review'] = df_balanced_reviews['Stemmed_Words'].apply(lambda x: ' '.join(x))

In [42]:
df_balanced_reviews.head()

Unnamed: 0,Review,Rating,Sentiment,Review_words,Stemmed_Words,Preprocessed_Review
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive,"[bought, boxed, sets, books, 1, 5, really, gre...","[bought, box, set, book, 1, 5, real, gre, sery...",bought box set book 1 5 real gre sery start bo...
1,I enjoyed this short book. But it was way way ...,3.0,Neutral,"[enjoyed, short, book, way, way, short, see, e...","[enjoy, short, book, way, way, short, see, eas...",enjoy short book way way short see easy would ...
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive,"[love, nicholas, sparks, 8217, read, everythin...","[lov, nichola, spark, 8217, read, everyth, 821...",lov nichola spark 8217 read everyth 8217 writ ...
3,I really enjoyed this adventure and look forwa...,4.0,Positive,"[really, enjoyed, adventure, look, forward, re...","[real, enjoy, adv, look, forward, read, robert...",real enjoy adv look forward read robert spir e...
4,It was a decent read.. typical story line. Not...,3.0,Neutral,"[decent, read, typical, story, line, nothing, ...","[dec, read, typ, story, lin, noth, unsav, many...",dec read typ story lin noth unsav many slic li...


## Bag-of-Words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

countvector = CountVectorizer()

X = countvector.fit_transform(df_balanced_reviews['Preprocessed_Review'])

In [45]:
sparse_df = pd.DataFrame(X.toarray(), columns=countvector.get_feature_names())

In [46]:
sparse_df

Unnamed: 0,00,000,02,03,04,07,10,100,1000,10084,...,zip,zippo,zoh,zolt,zomby,zon,zoo,zoom,zor,zzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
2040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(sparse_df, )