# Text Classifier on Good Read Reviews

In [47]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
 good_reads = pd.read_csv('goodreads.tsv' , sep='\t') #names=['user_id','song_id','play_count'])

In [3]:
good_reads.head()

Unnamed: 0,title,date,rating,body
0,The Body: A Guide for Occupants,"Oct 11, 2019",5.0,Lovingly presented with humour and kindness an...
1,Resistance Reborn,"Feb 28, 2020",4.0,I read through this book quicker than anticip...
2,The Book of Two Ways,"Jul 13, 2020",4.0,Jodi Picoult does it again in this new novel. ...
3,How to Write One Song: Loving the Things We Cr...,"Feb 21, 2021",4.0,I’m a music freak without a shred of musical a...
4,The Giver of Stars,"Sep 08, 2020",4.0,The Giver of Stars by Jojo Moyes won the Book ...


In [4]:
good_reads.shape

(8433, 4)

In [5]:
#Checking For Null Values
good_reads.isnull().sum()

title       0
date        4
rating    290
body        4
dtype: int64

In [6]:
#Dropping Null Values
good_reads=good_reads.dropna()

In [7]:
good_reads.shape

(8139, 4)

In [8]:
#Chaning review data of every row into string type
for i in range(0,len(good_reads)-1):
    if type(good_reads.iloc[i]['body']) != str:
        good_reads.iloc[i]['body'] = str(good_reads.iloc[i]['body'])

In [9]:
#3 is in the middle. It’s neither good nor bad. Just average. 
#But we want to classify good or bad reviews. So, I decided to get rid of all the 3’s

good_reads = good_reads[good_reads['rating'] != 3]

In [10]:
good_reads.shape

(6709, 4)

In [12]:
#function to represent sentiment 0(negeative);1(positive)
def sentiment(n):
    return 1 if n>=4 else 0

In [13]:
#Applying Sentiment Function to our Data Frame
good_reads['sentiment'] = good_reads['rating'].apply(sentiment)

In [14]:
good_reads.tail()

Unnamed: 0,title,date,rating,body,sentiment
8428,The Giver of Stars,"Feb 12, 2020",5.0,I loved this book! Let me start by saying that...,1
8429,Minor Feelings: An Asian American Reckoning,"May 21, 2020",4.0,What did i just read?I chose this title becaus...,1
8430,Trixie and Katya's Guide to Modern Womanhood,"Oct 29, 2020",4.0,**3.45 stars ( if we were using a 10/10 scale ...,1
8431,To Wake the Giant: A Novel of Pearl Harbor,"May 22, 2020",4.0,"Thanks to Netgalley, Random House and Ballenti...",1
8432,Finding Ashley,"Apr 14, 2021",4.0,Finding Ashley starts with Melissa working har...,1


In [16]:
#Spiliting Target and Features
X = good_reads['body']
y = good_reads['sentiment']

In [17]:
#Spilting Test and Train data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0)

## Data Cleaning

In [18]:
# Applyling some pre-processing on the given text.
import re


def clean_text(text):
    """
    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

## Count Vectorizer

In [19]:
#Using CountVectorizer to create bag of words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english",preprocessor=clean_text)

In [20]:
#Fitting Data into Count Vectorizer
X_train_CV = vectorizer.fit_transform(X_train)
X_test_CV = vectorizer.transform(X_test)

In [21]:
X_train_CV

<5367x25831 sparse matrix of type '<class 'numpy.int64'>'
	with 323886 stored elements in Compressed Sparse Row format>

## Implementing SGD Classifier - Logistic

In [23]:
#Fitting Classifier Model
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="log", penalty="l2") #logistic Regression
clf.fit(X_train_CV, y_train)

SGDClassifier(loss='log')

In [24]:
#Predicting on X_test
y_pred  =  clf.predict(X_test_CV)

In [25]:
y_pred 

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [26]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [32]:
print('Accuracy of our Model is : ',ac*100)

Accuracy of our Model is :  89.64232488822653


## TF-IDF

In [54]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [34]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_CV)
X_test_tfidf = tfidf_transformer.transform(X_test_CV)

## Implementing SGD Classifier -SVM

In [35]:
clf = SGDClassifier(loss="hinge", penalty="l2") #svm
clf.fit(X_train_tfidf, y_train)

SGDClassifier()

In [36]:
#Predicting on X_test
y_pred  =  clf.predict(X_test_tfidf)

In [37]:
y_pred 

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [38]:
ac = accuracy_score(y_test,y_pred)

In [39]:
print('Accuracy of our Model is : ',ac*100)

Accuracy of our Model is :  91.72876304023845


In [None]:
#We can Observe increase in 2 % of accuracy

In [40]:
y_train_pred  =  clf.predict(X_train_tfidf)

In [41]:
ac = accuracy_score(y_train,y_train_pred)

In [42]:
print('Accuracy of our Model is : ',ac*100)

Accuracy of our Model is :  99.29196944289174


We can observe that our model is a overfitting lets try to resolve it by implementing **GRID SEARCH** Cross Validation

## Grid Search CV

In [48]:
from sklearn.model_selection import GridSearchCV
params = {
    "loss" : ["hinge", "log", "squared_hinge", "modified_huber", "perceptron"],
    "alpha" : [0.0001, 0.001, 0.01, 0.1],
    "penalty" : ["l2", "l1", "elasticnet", "none"],
}

clf = SGDClassifier(max_iter=1000)
grid = GridSearchCV(clf, param_grid=params, cv=10)


grid.fit(X_train_tfidf, y_train)

print(grid.best_params_) 

{'alpha': 0.001, 'loss': 'hinge', 'penalty': 'none'}


In [50]:
#Testing Data

grid_predictions = grid.predict(X_test_tfidf) 

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, grid_predictions)*100))

Accuracy: 90.54


In [51]:
#Training Data 

grid_train_predictions = grid.predict(X_train_tfidf) 

print('Accuracy: {:.2f}'.format(accuracy_score(y_train, grid_train_predictions)*100))

Accuracy: 97.63


In [None]:
# Even after impleting cv we are getting over fit model lets try implementing RNN