# Import Library & Get Data

In [24]:
import re
import os
import csv
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

print(tf.__version__)

2.5.0


In [4]:
# Upload your kaggle.json to colab main directory
! chmod 600 kaggle.json && (ls ~/.kaggle 2>/dev/null || mkdir ~/.kaggle) && mv kaggle.json ~/.kaggle/ && echo 'Done'

Done


In [5]:
!kaggle datasets download arashnic/book-recommendation-dataset
!unzip -o book-recommendation-dataset.zip
!ls

Downloading book-recommendation-dataset.zip to /content
 71% 17.0M/23.8M [00:00<00:00, 20.5MB/s]
100% 23.8M/23.8M [00:01<00:00, 24.6MB/s]
Archive:  book-recommendation-dataset.zip
  inflating: Books.csv               
  inflating: Ratings.csv             
  inflating: Users.csv               
book-recommendation-dataset.zip  Ratings.csv	      sample_data
Books.csv			 regression_model.h5  Users.csv


# Preprocessing

In [6]:
# fix column name
def columnfix(columnlist):
  columnlist = list(columnlist)
  for i in range(len(columnlist)):
    columnlist[i] = columnlist[i].lower()
    columnlist[i] = columnlist[i].translate(str.maketrans("","",string.punctuation))

  return columnlist

### Rating Section

In [7]:
df_rate = pd.read_csv('Ratings.csv')
df_rate.columns = columnfix(df_rate.columns)   # fix column name
df_rate

Unnamed: 0,userid,isbn,bookrating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [8]:
newrate = df_rate.groupby('isbn').agg(         # Create new dataframe group by isbn and have column from mean rating
  rating=("bookrating", "mean")
)

# newrate.reset_index(inplace=True)
newrate

Unnamed: 0_level_0,rating
isbn,Unnamed: 1_level_1
0330299891,3.0
0375404120,1.5
0586045007,0.0
9022906116,3.5
9032803328,0.0
...,...
cn113107,0.0
ooo7156103,7.0
§423350229,0.0
´3499128624,8.0


### Book Section

In [9]:
df_book = pd.read_csv('Books.csv')
df_book.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [10]:
df_book.columns = columnfix(df_book.columns)                  # fix column name

df_book = df_book.join(newrate, on="isbn")                    # join df_book with newrate

df_book['rating'] = df_book['rating'].fillna(0)               # fill NaN rating with 0

df_book = df_book[df_book['booktitle'].notna()]               # remove data with NaN book title

df_book.insert(loc=0, column='bookid', value=df_book.index)   # add bookid in first column

df_book = df_book.replace(",", "", regex=True)                # remove comma

df_book

Unnamed: 0,bookid,isbn,booktitle,bookauthor,yearofpublication,publisher,imageurls,imageurlm,imageurll,rating
0,0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,0.000000
1,1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,4.928571
2,2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,5.000000
3,3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,4.272727
4,4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,0.000000
...,...,...,...,...,...,...,...,...,...,...
271355,271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,7.000000
271356,271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,4.000000
271357,271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,0.000000
271358,271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,0.000000


In [11]:
df_book.to_csv('newbook.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

### User Section

In [12]:
df_user = pd.read_csv('Users.csv')
df_user.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [13]:
df_user = df_user.fillna(0)                   # fill NaN age with zero

df_user.columns = columnfix(df_user.columns)  # fix column name

df_user

Unnamed: 0,userid,location,age
0,1,"nyc, new york, usa",0.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",0.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",0.0
...,...,...,...
278853,278854,"portland, oregon, usa",0.0
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",0.0
278856,278857,"knoxville, tennessee, usa",0.0


In [14]:
df_user.to_csv('newuser.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

### Rating Section (2)

In [15]:
df_isbn_id = df_book[["isbn", "bookid"]].copy()
df_isbn_id = df_isbn_id.set_index('isbn')
df_isbn_id

Unnamed: 0_level_0,bookid
isbn,Unnamed: 1_level_1
0195153448,0
0002005018,1
0060973129,2
0374157065,3
0393045218,4
...,...
0440400988,271355
0525447644,271356
006008667X,271357
0192126040,271358


In [16]:
df_rate = df_rate.join(df_isbn_id, on="isbn")
df_rate = df_rate[df_rate['bookid'].notna()]
df_rate['bookid'] = df_rate['bookid'].astype(int)
df_rate

Unnamed: 0,userid,isbn,bookrating,bookid
0,276725,034545104X,0,2966
1,276726,0155061224,5,225816
2,276727,0446520802,0,11053
3,276729,052165615X,3,246838
4,276729,0521795028,6,246839
...,...,...,...,...
1149774,276704,0876044011,0,69543
1149775,276704,1563526298,9,69544
1149776,276706,0679447156,0,52540
1149777,276709,0515107662,10,15978


In [17]:
df_rate.to_csv('newrating.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

# Recommendation Model

### Create Model

In [18]:
print('books length : {}'.format(len(df_rate.bookid)))
print('users length : {}'.format(len(df_rate.userid)))
print('unique books length : {}'.format(len(df_rate.bookid.unique())))
print('unique users length : {}'.format(len(df_rate.userid.unique())))

books length : 1031136
users length : 1031136
unique books length : 270151
unique users length : 92106


In [42]:
book_data = np.array(list(set(df_book.bookid)))
user_data = np.array([85 for i in range(len(book_data))])
length = len(book_data)

print('book data :')
print(book_data[:15])

print('user data :')
print(user_data[:15])

print('length : {}'.format(length))

book data :
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
user data :
[85 85 85 85 85 85 85 85 85 85 85 85 85 85 85]
length : 271360


In [20]:
book_embedding_size = 8
user_embedding_size = 8

user_input = tf.keras.Input(shape=(1,), name='User-Input')
book_input = tf.keras.Input(shape=(1,), name='Book-Input')

user_embedded = tf.keras.layers.Embedding(df_rate.userid.max()+1, user_embedding_size, input_length=1, name='User-Embedding')(user_input)
book_embedded = tf.keras.layers.Embedding(df_rate.bookid.max()+1, book_embedding_size, input_length=1, name='Book-Embedding')(book_input)

concatenated = tf.keras.layers.Concatenate()([user_embedded, book_embedded])

out = tf.keras.layers.Flatten()(concatenated)
out = tf.keras.layers.Dense(128, activation='relu')(out)
out = tf.keras.layers.Dense(32, activation='relu')(out)
out = tf.keras.layers.Dense(1, activation='linear', name='prediction')(out)

model = tf.keras.Model(
    inputs = [user_input, book_input],
    outputs = out,
)
model.summary(line_length=88)

Model: "model"
________________________________________________________________________________________
Layer (type)                 Output Shape       Param #   Connected to                  
User-Input (InputLayer)      [(None, 1)]        0                                       
________________________________________________________________________________________
Book-Input (InputLayer)      [(None, 1)]        0                                       
________________________________________________________________________________________
User-Embedding (Embedding)   (None, 1, 8)       2230840   User-Input[0][0]              
________________________________________________________________________________________
Book-Embedding (Embedding)   (None, 1, 8)       2170880   Book-Input[0][0]              
________________________________________________________________________________________
concatenate (Concatenate)    (None, 1, 16)      0         User-Embedding[0][0]          
      

In [21]:
model.compile(
    tf.optimizers.Adam(0.005),
    loss='MSE',
    metrics=['MAE']
)

### Train Data

In [22]:
train, test = train_test_split(df_rate, test_size=0.2, random_state=42)

In [25]:
if os.path.exists('regression_model.h5'):
  model2 = tf.keras.models.load_model('regression_model.h5')
else:
  # history = model.fit([train.userid, train.bookid], train.bookrating, epochs=10, verbose=1)
  history = model.fit([df_rate.userid, df_rate.bookid], df_rate.bookrating, epochs=5, verbose=1)
  model.save('regression_model.h5')
  plt.plot(history.history['loss'])
  plt.xlabel("Epochs")
  plt.ylabel("Training Error")

In [26]:
  model.evaluate([test.userid, test.bookid], test.bookrating)



[22.84319305419922, 2.837311029434204]

### Predict Data

In [43]:
predictions = model.predict([user_data, book_data])
predictions.shape

(271360, 1)

In [44]:
predictions[:10]

array([[0.00692733],
       [0.01071682],
       [0.01997386],
       [0.01559203],
       [0.00232491],
       [0.00821089],
       [0.01594045],
       [0.01135223],
       [0.00214908],
       [0.00052661]], dtype=float32)

In [45]:
predictions = np.array([a[0] for a in predictions])
predictions[:25]

array([ 0.00692733,  0.01071682,  0.01997386,  0.01559203,  0.00232491,
        0.00821089,  0.01594045,  0.01135223,  0.00214908,  0.00052661,
        0.00973466,  0.00762339,  0.00092916,  0.02655368,  0.00962452,
        0.00436051,  0.00890927, -0.00081995,  0.00627706,  0.00862658,
        0.00820902,  0.00685372,  0.02446755,  0.0182433 ,  0.00568684],
      dtype=float32)

In [46]:
recommended_book_id = (-predictions).argsort()    # Sorting + ambil 5 data terbesar
recommended_book_id[:25]

array([229734,  75757, 263973, 247159, 146699,  20125, 201424, 214050,
         8512,  94369, 243887, 216309,  36692, 237210,  15416,  29221,
        32220,  22419,  90076, 141051,  56141,  84626, 253031, 226254,
        62919])

In [47]:
predictions[recommended_book_id][:25]

array([0.04502257, 0.04280353, 0.04253614, 0.04171232, 0.0410908 ,
       0.04068025, 0.04055754, 0.04031461, 0.0401352 , 0.04012797,
       0.04007761, 0.03999387, 0.03985026, 0.03979322, 0.03963859,
       0.03954826, 0.03946888, 0.03927473, 0.0386938 , 0.03863627,
       0.03856922, 0.03845753, 0.03827792, 0.03825669, 0.03823726],
      dtype=float32)

In [48]:
df_book[df_book['bookid'].isin(recommended_book_id)]

Unnamed: 0,bookid,isbn,booktitle,bookauthor,yearofpublication,publisher,imageurls,imageurlm,imageurll,rating
0,0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,0.000000
1,1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,4.928571
2,2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,5.000000
3,3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,4.272727
4,4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,0.000000
...,...,...,...,...,...,...,...,...,...,...
271355,271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,7.000000
271356,271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,4.000000
271357,271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,0.000000
271358,271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,0.000000


### Prediction Table

In [49]:
new_df = pd.DataFrame()
new_df['user_data'] = user_data
new_df['book_data'] = book_data
new_df['predictions'] = predictions
new_df['predictsort'] = list(predictions[recommended_book_id])
new_df['recom_book'] = list(recommended_book_id)
new_df[:10]

Unnamed: 0,user_data,book_data,predictions,predictsort,recom_book
0,85,0,0.006927,0.045023,229734
1,85,1,0.010717,0.042804,75757
2,85,2,0.019974,0.042536,263973
3,85,3,0.015592,0.041712,247159
4,85,4,0.002325,0.041091,146699
5,85,5,0.008211,0.04068,20125
6,85,6,0.01594,0.040558,201424
7,85,7,0.011352,0.040315,214050
8,85,8,0.002149,0.040135,8512
9,85,9,0.000527,0.040128,94369


### Deploy Model

In [51]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)

In [52]:
tfmodel = converter.convert()



INFO:tensorflow:Assets written to: /tmp/tmpnjuo0xbc/assets


INFO:tensorflow:Assets written to: /tmp/tmpnjuo0xbc/assets


In [53]:
open('model.tflite', 'wb').write(tfmodel)

17634272

# Photo Detection

Unused Photo Detection Code using PyTesseract

### Install & Import

In [54]:
!apt install subversion
!apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1
Suggested packages:
  db5.3-util libapache2-mod-svn subversion-tools
The following NEW packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1 subversion
0 upgraded, 5 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,237 kB of archives.
After this operation, 9,910 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libapr1 amd64 1.6.3-2 [90.9 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libaprutil1 amd64 1.6.1-2 [84.4 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libserf-1-1 amd64 1.3.9-6 [44.4 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libsvn1 amd64 1.9.7-4ubuntu1 [1,183 kB]
Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 subversion amd64 1.9.7-4ubuntu1 

In [55]:
import pytesseract
try:
  from PIL import Image
except ImportError:
  import Image

In [56]:
!svn checkout https://github.com/Turing-Team-B21-CAP0257/ML/trunk/ImageTest

A    ImageTest/CM-1.jpg
A    ImageTest/CM-2.jpg
A    ImageTest/CM-3.jpg
A    ImageTest/CM-4.jpg
A    ImageTest/DN-1.jpg
A    ImageTest/DN-2.jpg
A    ImageTest/DN-3.jpg
A    ImageTest/GM-1.jpg
A    ImageTest/GM-2.jpg
A    ImageTest/H-1.jpeg
A    ImageTest/H-2.jpg
A    ImageTest/LV-1.jpg
A    ImageTest/LV-2.jpg
A    ImageTest/TDO.jpg
A    ImageTest/WTD-1.jpg
A    ImageTest/WTD-2.jpg
A    ImageTest/WTD-3.jpg
Checked out revision 6.


### Image to Text

In [57]:
image_path = 'ImageTest/CM-1.jpg'
text = pytesseract.image_to_string(Image.open(image_path))
print(text)

classical

Mythology

Seventh Edition

MARK P.0. MORFORD @ ROBERT J. LENARDON

 



In [58]:
text = re.sub(r'\n+', " ", text)
text = text.lower()
text = text.translate(str.maketrans("","",string.punctuation))

text = text.lstrip()
text = text.rstrip()
print(text)

classical mythology seventh edition mark p0 morford  robert j lenardon


In [59]:
text = text.split(" ")
print(text)

['classical', 'mythology', 'seventh', 'edition', 'mark', 'p0', 'morford', '', 'robert', 'j', 'lenardon']


### Search Data from Text

In [60]:
bookData = pd.DataFrame(df_book, columns= ['booktitle', 'bookauthor']).values.tolist()
bookData[:5]

[['Classical Mythology', 'Mark P. O. Morford'],
 ['Clara Callan', 'Richard Bruce Wright'],
 ['Decision in Normandy', "Carlo D'Este"],
 ['Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
  'Gina Bari Kolata'],
 ['The Mummies of Urumchi', 'E. J. W. Barber']]

In [61]:
for i in range(len(bookData)):
  bookData[i] = str(bookData[i][0]) + " " + str(bookData[i][1])
  bookData[i] = bookData[i].lower()
  bookData[i] = bookData[i].translate(str.maketrans("","",string.punctuation))

bookData[:10]

['classical mythology mark p o morford',
 'clara callan richard bruce wright',
 'decision in normandy carlo deste',
 'flu the story of the great influenza pandemic of 1918 and the search for the virus that caused it gina bari kolata',
 'the mummies of urumchi e j w barber',
 'the kitchen gods wife amy tan',
 'what if the worlds foremost military historians imagine what might have been robert cowley',
 'pleading guilty scott turow',
 'under the black flag the romance and the reality of life among the pirates david cordingly',
 'where youll find me and other stories ann beattie']

In [62]:
for i in range(len(bookData)):
  bookData[i] = bookData[i].split()

bookData[:3]

[['classical', 'mythology', 'mark', 'p', 'o', 'morford'],
 ['clara', 'callan', 'richard', 'bruce', 'wright'],
 ['decision', 'in', 'normandy', 'carlo', 'deste']]

In [63]:
similarityList = []

index = 0

for book in bookData:
  similarity = 0
  for word in text:
    if word in book:
      similarity += 1
  similarityList.append([index, similarity])
  index += 1

similarityList[:10]

[[0, 4],
 [1, 0],
 [2, 0],
 [3, 0],
 [4, 1],
 [5, 0],
 [6, 1],
 [7, 0],
 [8, 0],
 [9, 0]]

In [64]:
similarityList.sort(key=lambda x: x[1], reverse=True)
similarityList[:20]

[[0, 4],
 [95231, 4],
 [111977, 4],
 [193923, 4],
 [42118, 3],
 [124762, 3],
 [127454, 3],
 [158903, 3],
 [196622, 3],
 [241341, 3],
 [264304, 3],
 [1087, 2],
 [1763, 2],
 [3179, 2],
 [3472, 2],
 [5008, 2],
 [7116, 2],
 [10299, 2],
 [10317, 2],
 [10663, 2]]

In [65]:
output = [i[0] for i in similarityList[:4]]
output

[0, 95231, 111977, 193923]