## Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import re
import json

Creating a `Review` class to organize the reviews.

In [2]:
class Review:
    """A single review: its text, its rating and the sentiment derived from the rating."""

    def __init__(self, text, rating):
        self.text = text
        self.rating = rating
        # The label is derived once, at construction time, from the rating.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return 'Negative' for ratings below 3, 'Positive' above 3, otherwise 'Neutral'."""
        stars = self.rating
        if stars < 3:
            return 'Negative'
        if stars > 3:
            return 'Positive'
        return 'Neutral'

In [3]:
# Dataset path
# NOTE(review): this is an absolute, machine-specific path (D: drive); the notebook
# only runs as-is where this file exists. Point it at your local copy of the dataset.
file_name = 'D:/Git_Projects/SentimentAnalysis/Datasets/Books_small_10000.json'

In [4]:
# Fetching and storing reviews.
# The file is read line by line; every non-blank line is parsed as one JSON object
# that provides the 'reviewText' and 'overall' fields used below.
reviews = []

# Pass the encoding explicitly so decoding does not depend on the platform's
# default codec.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Blank lines (for example an empty last line) are not valid JSON
        # documents, so skip them instead of letting json.loads raise.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [5]:
# Gather the attributes of every Review into a DataFrame so the following
# steps can work column-wise.
df_reviews = pd.DataFrame({
    'Review': [entry.text for entry in reviews],
    'Ratings': [entry.rating for entry in reviews],
    'Sentiment': [entry.sentiment for entry in reviews],
})

### Balancing Dataset

Balancing the number of reviews w.r.t sentiment.

In [6]:
import random
def balance_reviews(reviews):
    '''
    Down-sample the reviews so that 'Positive' and 'Negative' are equally represented.

    Input:
    reviews: DataFrame with a 'Sentiment' column holding the labels
             'Positive', 'Negative' or 'Neutral'.

    Output:
    A new DataFrame with the first n 'Positive' and the first n 'Negative'
    rows of `reviews`, where n is the size of the smaller of the two classes.
    Rows keep their original order and index labels; rows with any other
    sentiment (e.g. 'Neutral') are dropped.
    '''
    sentiment = reviews['Sentiment']
    counts = sentiment.value_counts()
    # Cap both classes at the size of the smaller one. A class that is missing
    # altogether counts as 0, which yields an empty result instead of a KeyError.
    balance_value = min(counts.get('Positive', 0), counts.get('Negative', 0))

    # Zero-based position of every row within its own sentiment class, in the
    # order the rows appear in `reviews`.
    rank_in_class = reviews.groupby('Sentiment').cumcount()
    keep = sentiment.isin(['Positive', 'Negative']) & (rank_in_class < balance_value)

    # Return an independent copy so callers can add columns to the result
    # without touching (or getting chained-assignment warnings about) the input.
    return reviews[keep].copy()

In [7]:
# Build the sentiment-balanced subset of df_reviews (see balance_reviews).
balanced_df = balance_reviews(df_reviews)

In [9]:
# Preview the first rows of the balanced frame.
balanced_df.head()

Unnamed: 0,Review,Ratings,Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive
3,I really enjoyed this adventure and look forwa...,4.0,Positive
5,I hoped for Mia to have some peace in this boo...,5.0,Positive
6,The book has the fevered intensity of Oliver S...,2.0,Negative


## Pre-Processing the Reviews
### Subtasks:
1) Converting all letters to lowercase.
2) Tokenizing the review text, i.e. splitting each review into a list of words.
3) Removing stopwords from the review. Stopwords are very common words, such as "I" and "me", that add little value when predicting the sentiment of a review.
4) Stemming: reducing the different inflected forms of a word (past, present, future, continuous, etc.) to one common stem.

Steps 3 and 4 reduce the size of our matrix (where each column corresponds to a single word, each row corresponds to a review, and each cell holds the number of times that word occurs in that review).

In [14]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
def preprocessed_reviews(reviews):
    '''
    Input:
    reviews: an iterable of reviews, where every review is in a textual form.

    Output:
    review_list: Reviews after going through preprocessing steps
                 (lowercasing, replacing runs of non-word characters with a
                 space, tokenizing, stopword removal and Lancaster stemming);
                 one space-joined string per input review, in input order.
    '''
    review_list = []
    # Raw string: '\W' is not a valid Python string escape, so the non-raw
    # literal relies on the invalid escape being passed through unchanged.
    non_alphanumeric = re.compile(r'[\W]+')
    lancaster = LancasterStemmer()
    # Fetch the stopword list once instead of once per token, and keep it in a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    for review in reviews:
        lower = review.lower()
        clean_review = non_alphanumeric.sub(' ', lower)
        words = word_tokenize(clean_review)
        # Drop stopwords, stem everything else.
        stemmed_words = [lancaster.stem(word) for word in words
                         if word not in english_stopwords]
        review_list.append(' '.join(stemmed_words))

    return review_list
        
        

In [13]:
# Creating a column named Preprocessed_Reviews which contains the reviews preprocessed using the preprocessed_reviews function.
# NOTE: this adds the column to balanced_df in place.
balanced_df['Preprocessed_Reviews'] = preprocessed_reviews(balanced_df['Review'])

In [19]:
# One-hot encode the sentiment label. With drop_first=True the first category is
# dropped, leaving the 'Sentiment_Positive' indicator column used as the target.
# dtype=int keeps the indicator as 0/1 regardless of the pandas version
# (pandas 2.0+ would otherwise produce booleans).
final_df = pd.get_dummies(balanced_df, columns=['Sentiment'], drop_first=True, dtype=int)
final_df.head()

Unnamed: 0,Review,Ratings,Preprocessed_Reviews,Sentiment_Positive
0,"I bought both boxed sets, books 1-5. Really a...",5.0,bought box set book 1 5 real gre sery start bo...,1
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,lov nichola spark 8217 read everyth 8217 writ ...,1
3,I really enjoyed this adventure and look forwa...,4.0,real enjoy adv look forward read robert spir e...,1
5,I hoped for Mia to have some peace in this boo...,5.0,hop mia peac book story real raw brok world to...,1
6,The book has the fevered intensity of Oliver S...,2.0,book fev intens ol ston movy jfk auth track de...,0


### CountVectorize

CountVectorizer counts the words present in each text, i.e. which words occur in the text and how many times each one occurs.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the preprocessed reviews and transform them into the
# count matrix X in a single step.
countvector = CountVectorizer()
X = countvector.fit_transform(final_df['Preprocessed_Reviews'])

In [22]:
# Dense DataFrame view of the count matrix, one column per vocabulary term.
# get_feature_names() is no longer available in recent scikit-learn releases
# (replaced by get_feature_names_out()); use whichever this installation provides.
if hasattr(countvector, 'get_feature_names_out'):
    vocabulary_terms = countvector.get_feature_names_out()
else:
    vocabulary_terms = countvector.get_feature_names()
sparse_df = pd.DataFrame(X.toarray(), columns=vocabulary_terms)

## Machine Learning

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    sparse_df,
    final_df['Sentiment_Positive'],
    test_size=0.1,
    random_state=0,
)
# Convert everything to plain NumPy arrays; the labels become column vectors
# of shape (n, 1).
X_train, X_test = np.array(X_train), np.array(X_test)
Y_train = np.reshape(np.array(Y_train), (-1, 1))
Y_test = np.reshape(np.array(Y_test), (-1, 1))

In [24]:
# Check the shape of the training labels.
Y_train.shape

(1159, 1)

## LogisticRegression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [26]:
# Logistic-regression classifier with C=0.1; every other hyper-parameter keeps its default.
logreg = LogisticRegression(C=0.1)

In [27]:
# scikit-learn expects a 1-D label vector. Y_train is a column vector of shape
# (n, 1), which makes fit() warn while converting it, so flatten it here.
logreg.fit(X_train, Y_train.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
# f1_score takes the ground truth first and the predictions second: (y_true, y_pred).
f1_score(Y_test, logreg.predict(X_test))

0.8474576271186439

## Training a Deep Neural Network on the data.

In [29]:
from tensorflow import keras

In [30]:
# Feed-forward classifier: one tanh hidden layer followed by a 2-unit output layer.
# The output layer uses softmax so the two class scores form a probability
# distribution, which is what the sparse_categorical_crossentropy loss (set when
# the model is compiled) expects by default. Two independent sigmoids do not
# sum to 1.
model = keras.Sequential([
    keras.layers.Dense(1024, activation='tanh'),
    keras.layers.Dense(2, activation='softmax')
])

In [31]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [32]:
# Train for 5 epochs with mini-batches of 16 samples.
# NOTE(review): the held-out test split is passed as validation data, so the
# validation metrics are not an independent estimate if they are used to tune the model.
model.fit(x=X_train, y=Y_train, batch_size=16, epochs=5, validation_data=(X_test, Y_test))

Train on 1159 samples, validate on 129 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13911be9788>