In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
plt.style.use('seaborn')
import plotly.express as px
import random
import sklearn




## Data Class

In [2]:


class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score ==3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
        
    def get_sentiment(self):
        return[x.sentiment for x in self.reviews]

    
    def evenly_distribute(self):
        negative =list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive =list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
    

## Load Data

In [3]:
import json

file_name = 'Books_small_10000.json'

reviews = []

with open (file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[5].text

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

## Prep Data

In [4]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


## Bag Of Word Vecotorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# This book is amazing!
# This book was awful
vectorizer = TfidfVectorizer()
train_x_vectors =vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)



print(train_x[0])
print(train_x_vectors[0].toarray())

I could barely put this down, actually woke up in the middle of the night and read for an hour till I could no longer keep my eyes open.  Thank you Anne for another gift of your talent!
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [8]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [9]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [11]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [13]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6538461538461539
0.6298076923076923
0.8052884615384616


#### F1 Scores

In [14]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
#f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


array([0.80582524, 0.        , 0.80952381])

### Qualative Testing

In [15]:
test_set = ["I thoroughly enjoyed this, 5 stars", "bad book do not buy", "horrible waste of time"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our Model (Using Grid Search)

In [16]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C':(1,4,8,16,32)}
svc= svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [17]:
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


### Saving Model Using Pickle

In [18]:
import pickle

with open ('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf,f)

### Loading Model Using Pickle

In [19]:
with open('sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [20]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

For my (barely) 9-year-old grandson, who is a &#34;numbers lover.&#34;  He jumped right in and began doing the puzzles--and continued to work on them.  Nice to see paper and pencil getting a workout!


array(['POSITIVE'], dtype='<U8')

### Matplotlib for Visual DataFrames

#### Importing my .csv files

In [21]:
#Using a different data frame for this section
df_books = pd.read_csv('books.csv')
df_books.head()

FileNotFoundError: [Errno 2] No such file or directory: 'books.csv'

In [None]:
df_ratings = pd.read_csv('ratings.csv')
df_ratings.head()

#### Merging my DataFrames

In [None]:
df = pd.merge(df_books,df_ratings, on= 'book_id')

### Bar Graph Using matplotlib

#### Pivot table

In [None]:
table = pd.pivot_table(data = df, index = 'authors', values= 'books_count',aggfunc=np.sum)
table

#### BarPlot (Not at all optimized for continuous data, just an example)

#### This barplot is massive, you can un-hashtag this cell, to run the barplot but it is a massive chunk of data.

In [None]:
#plt.bar(table.index,table['books_count'])
#plt.xticks(rotation=70)
#plt.xlabel('Authors')
#plt.ylabel('Books Count')
#plt.title('Books to Authors')
#plt.show()

In [None]:
###This is a massive dataframe.
### Bar Graphs are not at all optimized for continuos values
###This is only to show that it can be done. 

### New Pivot Table

In [None]:
table = pd.pivot_table(data = df, index = 'original_publication_year', values= 'books_count',aggfunc=np.sum)
table

### Defining Plot

In [None]:
t = df['original_publication_year']
v = df['books_count']


In [None]:
from matplotlib import pyplot as plt

fig,ax = plt.subplots(figsize=(10,20))
ax.plot(t,v)
ax.set_xlabel('Original Publication Year')
ax.set_ylabel('Book Count')
ax.set_title('The relation of Original Publication Year to Book Count.')

### Scatter Plot 

In [None]:
df=pd.read_csv('books.csv')
x = df.original_publication_year
y = df.books_count
plt.scatter(x, y)
plt.show()  

In [None]:
df_books = pd.read_csv('books.csv')
df_books.head(10)

In [None]:
bins = [0,100,200,300,400,500,600,700,800,900,1000]
plt.hist(df_books.books_count, bins = bins)
plt.show()