In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from tensorflow.keras.utils import to_categorical

In [2]:
%matplotlib inline
from IPython.display import Image, SVG
import matplotlib.pyplot as plt

In [3]:
# Read both source tables up front.
results_df = pd.read_csv('results.csv')
books_df = pd.read_csv('books.csv', on_bad_lines='skip')

# One row per distinct (list, book) pairing found in the results table.
books_lists = results_df.loc[:, ['list_id', 'book_id']].drop_duplicates()

# Attach the book columns to every (list, book) pairing via book_id.
bl_full = pd.merge(books_lists, books_df, on='book_id')

In [4]:
# Lookup table of list_id -> list_name; shown as this cell's output.
lists_df = pd.read_csv(filepath_or_buffer='lists.csv')
lists_df

Unnamed: 0,list_id,list_name
0,1,Combined Print and E-Book Fiction
1,2,Combined Print and E-Book Nonfiction
2,3,Hardcover Fiction
3,4,Hardcover Nonfiction
4,5,Trade Fiction Paperback
5,6,Paperback Nonfiction
6,7,Advice How-To and Miscellaneous
7,8,Childrens Middle Grade Hardcover
8,9,Picture Books
9,10,Series Books


In [5]:
# list_id values (from lists_df) whose list is treated as fiction.
# Kept as a plain list so existing `in fiction_ids` membership checks keep working.
fiction_ids = [
    1, 3, 5, 8, 9, 10, 11, 12, 17, 18,
    19, 20, 21, 22, 24, 25, 26, 27, 28, 29,
    30, 42, 47, 50, 51, 56, 58, 59,
]

# Every other list_id present in lists_df is treated as nonfiction.
# The loop variable is named list_id rather than `id` to avoid reusing the
# name of the builtin.
nonfiction_ids = [
    list_id for list_id in lists_df['list_id'] if list_id not in fiction_ids
]

# End on the bare name so the cell actually displays the derived list;
# an assignment on the last line produces no output.
nonfiction_ids

[2,
 4,
 6,
 7,
 13,
 14,
 15,
 16,
 23,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 43,
 44,
 45,
 46,
 48,
 49,
 52,
 53,
 54,
 55,
 57]

In [6]:
def is_fiction(value):
  """Return 1 when `value` appears in `fiction_ids`, otherwise 0."""
  return 1 if value in fiction_ids else 0

# Label each row 1/0 by running its list_id through is_fiction.
bl_full['is_fiction'] = bl_full['list_id'].apply(is_fiction)

# Preview the first rows with the new label column.
bl_full.head()

Unnamed: 0,list_id,book_id,book_title,book_image,book_description,author_id,imprint_id,is_fiction
0,1,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1
1,3,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1
2,12,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1
3,5,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1
4,18,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1
...,...,...,...,...,...,...,...,...
22641,59,15666,KYLIE THE CARNIVAL FAIRY,https://storage.googleapis.com/du-prd/books/im...,A fairy tries to find stolen magic hats.,6656,50,1
22642,59,15667,THE CHRONICLES OF NARNIA: PRINCE CASPIAN,https://storage.googleapis.com/du-prd/books/im...,The Pevensies return to Narnia; a movie tie-in.,7236,56,1
22643,9,15668,RETURN TO FAIRYOPOLIS,https://storage.googleapis.com/du-prd/books/im...,A teenage girl regains her belief in a land of...,7237,1577,1
22644,10,15669,WARRIORS: POWER OF THREE,https://storage.googleapis.com/du-prd/books/im...,Cat warriors fight for survival in a mythical ...,473,1582,1


In [7]:
# Single text field: the title, one space, then the description.
bl_full['title_desc'] = bl_full['book_title'].add(" ").add(bl_full['book_description'])
bl_full.head()

Unnamed: 0,list_id,book_id,book_title,book_image,book_description,author_id,imprint_id,is_fiction,title_desc
0,1,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1,WHERE THE CRAWDADS SING In a quiet town on the...
1,3,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1,WHERE THE CRAWDADS SING In a quiet town on the...
2,12,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1,WHERE THE CRAWDADS SING In a quiet town on the...
3,5,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1,WHERE THE CRAWDADS SING In a quiet town on the...
4,18,1,WHERE THE CRAWDADS SING,https://storage.googleapis.com/du-prd/books/im...,In a quiet town on the North Carolina coast in...,1,1,1,WHERE THE CRAWDADS SING In a quiet town on the...
...,...,...,...,...,...,...,...,...,...
22641,59,15666,KYLIE THE CARNIVAL FAIRY,https://storage.googleapis.com/du-prd/books/im...,A fairy tries to find stolen magic hats.,6656,50,1,KYLIE THE CARNIVAL FAIRY A fairy tries to find...
22642,59,15667,THE CHRONICLES OF NARNIA: PRINCE CASPIAN,https://storage.googleapis.com/du-prd/books/im...,The Pevensies return to Narnia; a movie tie-in.,7236,56,1,THE CHRONICLES OF NARNIA: PRINCE CASPIAN The P...
22643,9,15668,RETURN TO FAIRYOPOLIS,https://storage.googleapis.com/du-prd/books/im...,A teenage girl regains her belief in a land of...,7237,1577,1,RETURN TO FAIRYOPOLIS A teenage girl regains h...
22644,10,15669,WARRIORS: POWER OF THREE,https://storage.googleapis.com/du-prd/books/im...,Cat warriors fight for survival in a mythical ...,473,1582,1,WARRIORS: POWER OF THREE Cat warriors fight fo...


In [8]:
# Keep only the text and the label, then drop repeated rows and rows with
# a missing value, in that order.
df = (
    bl_full[['title_desc', 'is_fiction']]
    .drop_duplicates()
    .dropna()
)
df.head()

Unnamed: 0,title_desc,is_fiction
0,WHERE THE CRAWDADS SING In a quiet town on the...,1
5,THE SAVIOR The 17th book in the Black Dagger B...,1
7,A GENTLEMAN IN MOSCOW A Russian count undergoe...,1
10,SUPERMARKET A depressed young man’s secrets ar...,1
12,THE CORNWALLS ARE GONE An Army intelligence of...,1


In [9]:
# Strip punctuation: remove every character that is neither a word character
# nor whitespace. regex=True is passed explicitly because the default for
# Series.str.replace became a literal (non-regex) match in pandas 2.0, which
# would silently leave the text unchanged. The raw string keeps \w and \s from
# being parsed as (invalid) Python string escapes.
desc_text = df['title_desc'].astype(str).str.replace(r'[^\w\s]', '', regex=True)

  desc_text = df['title_desc'].astype(str).str.replace('[^\w\s]','')


In [10]:
# Length every encoded sequence is brought to
max_sequence_length = 350

# Fit the tokenizer's vocabulary on the cleaned text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(desc_text)

# Encode each text as a list of integer word indices
X_text = tokenizer.texts_to_sequences(desc_text)

# Pad to the fixed length, adding the padding after the tokens ('post');
# the padded result is available under both names
X_text_padded = desc_text_padded = pad_sequences(
    X_text,
    maxlen=max_sequence_length,
    padding='post',
)

In [11]:
# Labels to predict: the is_fiction column
y = df.loc[:, 'is_fiction']

In [12]:
# Split features and labels into train/test sets; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X_text_padded,
    y,
    random_state=42,
)

In [13]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling start from the same state
# on every fresh run. (Some GPU ops may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Define the model: embedding -> dropout -> average pooling -> dense -> sigmoid
nn = tf.keras.models.Sequential()

# Input layer: one 50-dimensional vector per token index.
# input_dim is the tokenizer's vocabulary size plus one extra index.
nn.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_sequence_length))

# Regularise the embedded sequence by randomly zeroing 20% of its values during training
nn.add(tf.keras.layers.Dropout(0.2))

# Average over the sequence dimension to get one fixed-length vector per example
nn.add(tf.keras.layers.GlobalAveragePooling1D())

# Hidden layer
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Output layer: a single sigmoid unit for the binary is_fiction label
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 350, 50)           1051850   
                                                                 
 dropout (Dropout)           (None, 350, 50)           0         
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                1632      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,053,515
Trainable params: 1,053,515
Non-trainable params: 0
______________________________________________

In [14]:
# Configure the loss, optimizer and reported metric, then fit on the training split
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
nn.fit(x=X_train, y=y_train, epochs=15)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x794634cb98a0>

In [15]:
# Score the trained model on the held-out split and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(
    x=X_test,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

85/85 - 1s - loss: 0.4006 - accuracy: 0.8576 - 520ms/epoch - 6ms/step
Loss: 0.4005584418773651, Accuracy: 0.8575678467750549
