<a href="https://colab.research.google.com/github/TM-M150/Book-Genre-Classification/blob/master/Best_TFIDF-Vectorizer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Import necessary dependancies
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
#Acces personal Google Drive
from google.colab import drive
drive.mount('/content/drive')

#Reads the data
data = pd.read_csv('/content/drive/MyDrive/Data/book32-listing.csv',encoding = "ISO-8859-1")

Mounted at /content/drive


In [4]:
#Assign a list of column names to the columns of the Pandas DataFrame
columns = ['Id', 'Image', 'Image_link', 'Title', 'Author', 'Class', 'Genre']
data.columns = columns

In [5]:
#Create three new DataFrames
books = pd.DataFrame(data['Title'])
author = pd.DataFrame(data['Author'])
genre = pd.DataFrame(data['Genre'])

In [6]:
#Fills the missing values (NaNs)
data['Author'] = data['Author'].fillna('No Book')
data['Title'] = data['Title'].fillna('No Book')

In [7]:
#Output the length
print (len(books))
print (len(genre))

#Output the first two rows
genre.head(2)

#Output the rows with index from 50000 to 50110 
books[50000:50110]

207571
207571


Unnamed: 0,Title
50000,"The Ultimate Sock Puppet Book: Clever Tips, Tr..."
50001,"Puppets, Masks, and Performing Objects"
50002,Felt Board Fingerplays with Patterns & Activit...
50003,Jim Henson and Philosophy: Imagination and the...
50004,Wael Shawky: Cabaret Crusades
...,...
50105,"4G: LTE/LTE-Advanced for Mobile Broadband, Sec..."
50106,Spotlight Synthetic Aperture Radar: Signal Pro...
50107,"Heterogeneous Cellular Networks: Theory, Simul..."
50108,Plain-English Study Guide for the General Radi...


In [8]:
genre['Genre'].unique()

array(['Calendars', 'Comics & Graphic Novels', 'Test Preparation',
       'Mystery, Thriller & Suspense', 'Science Fiction & Fantasy',
       'Romance', 'Humor & Entertainment', 'Literature & Fiction',
       'Gay & Lesbian', 'Engineering & Transportation',
       'Cookbooks, Food & Wine', 'Crafts, Hobbies & Home',
       'Arts & Photography', 'Education & Teaching',
       'Parenting & Relationships', 'Self-Help', 'Computers & Technology',
       'Medical Books', 'Science & Math', 'Health, Fitness & Dieting',
       'Business & Money', 'Law', 'Biographies & Memoirs', 'History',
       'Politics & Social Sciences', 'Reference',
       'Christian Books & Bibles', 'Religion & Spirituality',
       'Sports & Outdoors', 'Teen & Young Adult', "Children's Books",
       'Travel'], dtype=object)

In [9]:
from sklearn.preprocessing import LabelEncoder

feat = ['Genre']
for x in feat:
    le = LabelEncoder()
    le.fit(list(genre[x].values))
    genre[x] = le.transform(list(genre[x]))
    

In [10]:
genre['Genre'].unique()

array([ 3,  6, 30, 19, 26, 24, 15, 17, 12, 11,  8,  9,  0, 10, 20, 27,  7,
       18, 25, 13,  2, 16,  1, 14, 21, 22,  5, 23, 28, 29,  4, 31])

In [11]:
le.inverse_transform([0])[0]

'Arts & Photography'

In [12]:
data['everything'] = pd.DataFrame(data['Title'] + ' ' + data['Author'])
print (data['everything'].head(5))

0         Doug the Pug 2016 Wall Calendar Doug the Pug
1    Moleskine 2016 Weekly Notebook, 12M, Large, Bl...
2    365 Cats Color Page-A-Day Calendar 2016 Workma...
3     Sierra Club Engagement Calendar 2016 Sierra Club
4     Sierra Club Wilderness Calendar 2016 Sierra Club
Name: everything, dtype: object


In [13]:
def change(t):
    t = t.split()
    return ' '.join([(i) for (i) in t if i not in stop])

In [14]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = list(stopwords.words('english'))
stop[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [15]:
data['everything'].apply(change)

0                      Doug Pug 2016 Wall Calendar Doug Pug
1         Moleskine 2016 Weekly Notebook, 12M, Large, Bl...
2         365 Cats Color Page-A-Day Calendar 2016 Workma...
3          Sierra Club Engagement Calendar 2016 Sierra Club
4          Sierra Club Wilderness Calendar 2016 Sierra Club
                                ...                        
207566    ADC Map People Washington D.C.: Street Map Boo...
207567    Washington, D.C., Then Now: 69 Sites Photograp...
207568    The Unofficial Guide Washington, D.C. (Unoffic...
207569    Washington, D.C. For Dummies (Dummies Travel) ...
207570    Fodor's Where Weekend Around Boston, 1st Editi...
Name: everything, Length: 207571, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, max_features=70000, strip_accents='unicode',lowercase =True,
                            analyzer='word', token_pattern=r'\w+', use_idf=True, 
                            smooth_idf=True, sublinear_tf=True, stop_words = 'english')
vectors = vectorizer.fit_transform(data['everything'])
vectors.shape

(207571, 58432)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(vectors, genre['Genre'], test_size=0.02)

In [19]:
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(203419, 58432)
(203419,)
(4152, 58432)
(4152,)


In [20]:
type(books)

pandas.core.frame.DataFrame

## Gaussian NB

In [21]:
clf = MultinomialNB(alpha=.45)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print (metrics.f1_score(y_test, pred, average='macro'))
print (metrics.accuracy_score(y_test, pred))

0.5063239471348133
0.6059730250481695


## Logistic Regression

In [22]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(solver= 'sag',max_iter=200,random_state=450)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print (metrics.f1_score(y_test, pred, average='macro'))
print (metrics.accuracy_score(y_test, pred))

0.6006619462884044
0.6416184971098265


In [23]:
text = ['I too had a Love Story']
text[0] = text[0].lower()
#text = list(text)
s = (vectorizer.transform(text))
#s = vectorizer.fit_transform(df)
print (s.shape)
d = (clf.predict(s))

(1, 58432)


In [24]:
le.inverse_transform(d)[0]

'Biographies & Memoirs'

## Saving Model

In [25]:
import joblib
joblib.dump(clf, '/content/best.pkl')
print("Model saved")

Model saved


In [26]:
clf = joblib.load('/content/best.pkl')

In [27]:
clf

## Neural Network

In [28]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='logistic', alpha=0.00003, batch_size='auto',
                   beta_1=0.9, beta_2=0.999, early_stopping=False,
                   epsilon=1e-08, hidden_layer_sizes=(20,), learning_rate='constant',
                   learning_rate_init=0.003, max_iter=200, momentum=0.9,
                   nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
                   solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
                   warm_start=False)
clf.fit(X_train, y_train) 
pred = clf.predict(X_test)
print (metrics.f1_score(y_test, pred, average='macro'))
print (metrics.accuracy_score(y_test, pred))

0.529182384320618
0.5773121387283237




In [29]:
text = ['Until It Fades: A Novel']
#text = list(text)
s = (vectorizer.transform(text))
#s = vectorizer.fit_transform(df)
print (s.shape)
d = (clf.predict(s))

(1, 58432)


In [30]:
le.inverse_transform(d)[0]

'Science Fiction & Fantasy'

## SVM

In [31]:
import xgboost as xgb
dtrain = xgb.DMatrix(data=X_train, label = y_train)
dtest = xgb.DMatrix(data=X_test)

In [32]:
params = {
    'objective':'multi:softmax',
    'eval_metric':'mlogloss',
    'eta':0.025,
    'max_depth':10,
    'subsample':0.7,
    'colsample_bytree':0.7,
    'min_child_weight':5,
    'num_class': 32
    
}

In [33]:
bst = xgb.cv(params, dtrain, num_boost_round=100,
early_stopping_rounds=40, nfold=5, verbose_eval=10)

[0]	train-mlogloss:3.40296+0.00048	test-mlogloss:3.40409+0.00085
[10]	train-mlogloss:3.05146+0.00142	test-mlogloss:3.06105+0.00284
[20]	train-mlogloss:2.86207+0.00117	test-mlogloss:2.87883+0.00424
[30]	train-mlogloss:2.72828+0.00086	test-mlogloss:2.75091+0.00514
[40]	train-mlogloss:2.62452+0.00063	test-mlogloss:2.65271+0.00522


KeyboardInterrupt: ignored

In [36]:
bst_train = xgb.train(params, dtrain, num_boost_round=10) 

In [37]:
p_test = bst_train.predict(dtest)

In [38]:
print (metrics.f1_score(y_test, p_test, average='macro'))
print (metrics.accuracy_score(y_test, p_test)) 

0.40259500975920753
0.4123314065510597


In [41]:
text = ['Fifty Shades of Grey']
#text = list(text)
s = (vectorizer.transform(text))
#s = vectorizer.fit_transform(df)
print (s.shape)

(1, 58432)


In [40]:
le.inverse_transform(d)[0]

'Science Fiction & Fantasy'