In [1]:
import numpy as np
import pandas as pd
import re
import json

Create a `Review` class to hold each review's text, rating, and derived sentiment.

In [2]:
class Review:
    """A single review: its text, its numeric rating and the sentiment label derived from the rating."""

    def __init__(self, text, rating):
        self.text = text
        self.rating = rating
        # The label is computed once, at construction time, from the rating.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return 'Positive' for ratings above 3, 'Negative' for ratings below 3, otherwise 'Neutral'."""
        stars = self.rating
        if stars > 3:
            return 'Positive'
        if stars < 3:
            return 'Negative'
        return 'Neutral'

In [3]:
# Dataset path. NOTE: this is an absolute path on a local drive; change it to
# wherever your copy of the dataset lives before running the notebook.
file_name = 'D:/Git_Projects/SentimentAnalysis/Datasets/Books_small_10000.json'

In [4]:
# Fetching and storing reviews: every line of the file is parsed as one JSON object.
reviews = []

# JSON text is UTF-8; pass the encoding explicitly so the platform default
# (e.g. cp1252 on Windows) is never used to decode the file.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of crashing inside json.loads.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [5]:
# Collect the reviews into a DataFrame (one row per review) so they can be preprocessed with pandas.
df_reviews = pd.DataFrame({
    'Review': [item.text for item in reviews],
    'Ratings': [item.rating for item in reviews],
    'Sentiment': [item.sentiment for item in reviews],
})

### Balancing Dataset

Balancing the number of reviews w.r.t sentiment.

In [6]:
import random
def balance_reviews(reviews):
    """Return a copy of ``reviews`` with equally many Positive and Negative rows.

    The first ``k`` Positive and the first ``k`` Negative rows are kept, where
    ``k`` is the size of the smaller of the two classes. Rows with any other
    sentiment (e.g. 'Neutral') are dropped. The original row order and index
    labels are preserved.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame with a 'Sentiment' column holding 'Positive' / 'Negative' labels.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the selected rows.
    """
    sentiment = reviews['Sentiment']
    counts = sentiment.value_counts()
    # Cap both classes at the size of the smaller one; .get avoids a KeyError
    # when one of the classes is absent altogether.
    balance_value = min(counts.get('Positive', 0), counts.get('Negative', 0))

    is_positive = sentiment == 'Positive'
    is_negative = sentiment == 'Negative'
    # The running count of each class tells us whether a row is still within the cap.
    keep = (
        (is_positive & (is_positive.cumsum() <= balance_value))
        | (is_negative & (is_negative.cumsum() <= balance_value))
    )

    # Index with a plain boolean array (positional, independent of the index
    # labels) and copy so callers can add columns without touching the input.
    return reviews[keep.values].copy()

In [7]:
# Keep a capped number of Positive and Negative reviews; Neutral reviews are dropped.
balanced_df = balance_reviews(df_reviews)

In [8]:
# Display the balanced frame to inspect the result.
balanced_df

Unnamed: 0,Review,Ratings,Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",5.0,Positive
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0,Positive
3,I really enjoyed this adventure and look forwa...,4.0,Positive
5,I hoped for Mia to have some peace in this boo...,5.0,Positive
6,The book has the fevered intensity of Oliver S...,2.0,Negative
...,...,...,...
9981,I am very picky when it comes to what I like t...,1.0,Negative
9982,*I received a free copy of this book to read a...,2.0,Negative
9984,Having this book toted as a YA series on Amazo...,1.0,Negative
9993,I've tried to start this graphic novel a coupl...,2.0,Negative


In [12]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
def preprocessed_reviews(reviews):
    """Normalize raw review texts into space-joined stemmed tokens.

    Each review is lower-cased, every run of non-word characters is replaced
    by a single space, the text is tokenized, English stop words are dropped
    and the remaining tokens are stemmed with the Lancaster stemmer.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input review, in input order.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    non_alphanumeric = re.compile(r'[\W]+')
    lancaster = LancasterStemmer()
    # Loop-invariant: fetch the stop-word list once and keep it in a set for
    # constant-time lookups instead of re-fetching and scanning it per token.
    english_stopwords = set(stopwords.words('english'))

    review_list = []
    for review in reviews:
        clean_review = non_alphanumeric.sub(' ', review.lower())
        stemmed_words = [
            lancaster.stem(word)
            for word in word_tokenize(clean_review)
            if word not in english_stopwords
        ]
        review_list.append(' '.join(stemmed_words))

    return review_list
        
        

In [13]:
# Add a column holding the cleaned, stemmed version of each review text.
balanced_df['Preprocessed_Reviews'] = preprocessed_reviews(balanced_df['Review'])

In [15]:
# Take an independent copy so that changes to final_df do not modify balanced_df.
final_df = balanced_df.copy()

In [32]:
# One-hot encode the sentiment label; drop_first=True drops the first category
# level and keeps indicator columns for the rest.
# Derived from balanced_df (not from final_df itself) so this cell can be
# re-run: encoding final_df in place removes its 'Sentiment' column, and a
# second run would then fail with a KeyError.
final_df = pd.get_dummies(balanced_df, columns=['Sentiment'], drop_first=True)

### CountVectorize

`CountVectorizer` counts how many times each word occurs in each text (i.e., which words are present and how many times).

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the preprocessed reviews and build the
# document-term count matrix in one step.
countvector = CountVectorizer()
X = countvector.fit_transform(final_df['Preprocessed_Reviews'])

In [23]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new name when it exists and fall
# back to the old one on older installations.
feature_names = (
    countvector.get_feature_names_out()
    if hasattr(countvector, 'get_feature_names_out')
    else countvector.get_feature_names()
)
# Densify the count matrix into a DataFrame with one column per vocabulary term.
sparse_df = pd.DataFrame(X.toarray(), columns=feature_names)

## Machine Learning

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(sparse_df, final_df['Sentiment_Positive'], test_size = 0.2, random_state=0)
# Convert the feature frames to plain NumPy arrays.
X_train = np.array(X_train)
X_test = np.array(X_test)
# Reshape the labels into column vectors of shape (n_samples, 1).
Y_train = np.array(Y_train).reshape(-1, 1)
Y_test = np.array(Y_test).reshape(-1, 1)

In [72]:
# Sanity check: shape of the training labels.
Y_train.shape

(1030, 1)

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [74]:
# C is the inverse of the regularization strength, so C=0.1 regularizes more
# strongly than scikit-learn's default of 1.0.
logreg = LogisticRegression(C=0.1)

In [75]:
# scikit-learn expects 1-D labels. Y_train is a column vector, so flatten it to
# avoid the DataConversionWarning emitted by column_or_1d.
logreg.fit(X_train, Y_train.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [76]:
# f1_score's signature is (y_true, y_pred): ground truth first, predictions second.
f1_score(Y_test.ravel(), logreg.predict(X_test))

0.8333333333333334

In [62]:
# Scratch check: view the training labels as a column vector (the result is only displayed, not stored).
np.array(Y_train).reshape(-1, 1)

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]], dtype=uint8)

In [54]:
from tensorflow import keras

In [80]:
# Feed-forward classifier: one hidden layer followed by a 2-unit output layer.
# The output uses softmax (not sigmoid) because the model is trained with
# sparse categorical cross-entropy, which expects the two outputs to form a
# probability distribution over mutually exclusive classes.
model = keras.Sequential([
    keras.layers.Dense(1024, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

In [81]:
# Adam optimizer; sparse categorical cross-entropy takes integer class labels
# directly, so the targets do not need to be one-hot encoded.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [82]:
# Train for 5 epochs in mini-batches of 16. NOTE: the held-out test split is
# passed as validation data here, so the validation metrics are test-set metrics.
model.fit(x=X_train, y=Y_train, batch_size=16, epochs=5, validation_data=(X_test, Y_test))

Train on 1030 samples, validate on 258 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1499f260e08>