# Content Based Recommender System

## Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

from mysql.connector import MySQLConnection

## Connecting to MySQL Database

In [2]:
db = MySQLConnection(username = 'root',
                     host = 'localhost',
                     password = 'mysqlNik631197',
                     database = 'books_database')

In [3]:
cursor = db.cursor(buffered = True)

## Importing the table to pandas dataframe

In [4]:
cursor.execute("SELECT * FROM books;")
books = cursor.fetchall()

In [5]:
books

[('002542730x',
  'Politically Correct Bedtime Stories: Modern Tales for Our Life and Times',
  'James Finn Garner',
  1994,
  'John Wiley &amp; Sons Inc',
  'http://images.amazon.com/images/P/002542730X.01.THUMBZZZ.jpg',
  'http://images.amazon.com/images/P/002542730X.01.MZZZZZZZ.jpg',
  'http://images.amazon.com/images/P/002542730X.01.LZZZZZZZ.jpg\r'),
 ('60542128',
  'When the Storm Breaks',
  'Heather Lowell',
  2003,
  'HarperTorch',
  'http://images.amazon.com/images/P/0060542128.01.THUMBZZZ.jpg',
  'http://images.amazon.com/images/P/0060542128.01.MZZZZZZZ.jpg',
  'http://images.amazon.com/images/P/0060542128.01.LZZZZZZZ.jpg\r'),
 ('006092988x',
  'A Tree Grows in Brooklyn',
  'Betty Smith',
  1998,
  'Perennial',
  'http://images.amazon.com/images/P/006092988X.01.THUMBZZZ.jpg',
  'http://images.amazon.com/images/P/006092988X.01.MZZZZZZZ.jpg',
  'http://images.amazon.com/images/P/006092988X.01.LZZZZZZZ.jpg\r'),
 ('60930535',
  'The Poisonwood Bible: A Novel',
  'Barbara Kingsolve

In [6]:
cols = ['ISBN',
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L']

In [7]:
df = pd.DataFrame(books, columns = cols)

In [8]:
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,002542730x,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1,60542128,When the Storm Breaks,Heather Lowell,2003,HarperTorch,http://images.amazon.com/images/P/0060542128.0...,http://images.amazon.com/images/P/0060542128.0...,http://images.amazon.com/images/P/0060542128.0...
2,006092988x,A Tree Grows in Brooklyn,Betty Smith,1998,Perennial,http://images.amazon.com/images/P/006092988X.0...,http://images.amazon.com/images/P/006092988X.0...,http://images.amazon.com/images/P/006092988X.0...
3,60930535,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...
4,60934417,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...


In [9]:
df.shape

(1581, 8)

## Data Preprocessing

In [10]:
book = df[['ISBN',
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher']].copy()

In [11]:
book['Book-Author'] = book['Book-Author'].str.replace(" ", "")

In [12]:
book['Publisher'] = book['Publisher'].str.replace(" ", "")

In [13]:
book.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,002542730x,Politically Correct Bedtime Stories: Modern Ta...,JamesFinnGarner,1994,JohnWiley&amp;SonsInc
1,60542128,When the Storm Breaks,HeatherLowell,2003,HarperTorch
2,006092988x,A Tree Grows in Brooklyn,BettySmith,1998,Perennial
3,60930535,The Poisonwood Bible: A Novel,BarbaraKingsolver,1999,Perennial
4,60934417,Bel Canto: A Novel,AnnPatchett,2002,Perennial


In [14]:
book['Tags'] = book['Book-Author'] + ' ' + book['Year-Of-Publication'].astype(str) + ' ' + book['Publisher']

In [15]:
books = book[['ISBN',
               'Book-Title',
               'Tags']].copy()

In [16]:
books.head()

Unnamed: 0,ISBN,Book-Title,Tags
0,002542730x,Politically Correct Bedtime Stories: Modern Ta...,JamesFinnGarner 1994 JohnWiley&amp;SonsInc
1,60542128,When the Storm Breaks,HeatherLowell 2003 HarperTorch
2,006092988x,A Tree Grows in Brooklyn,BettySmith 1998 Perennial
3,60930535,The Poisonwood Bible: A Novel,BarbaraKingsolver 1999 Perennial
4,60934417,Bel Canto: A Novel,AnnPatchett 2002 Perennial


In [17]:
books['Tags'] = books['Tags'].str.lower()

In [18]:
books.head()

Unnamed: 0,ISBN,Book-Title,Tags
0,002542730x,Politically Correct Bedtime Stories: Modern Ta...,jamesfinngarner 1994 johnwiley&amp;sonsinc
1,60542128,When the Storm Breaks,heatherlowell 2003 harpertorch
2,006092988x,A Tree Grows in Brooklyn,bettysmith 1998 perennial
3,60930535,The Poisonwood Bible: A Novel,barbarakingsolver 1999 perennial
4,60934417,Bel Canto: A Novel,annpatchett 2002 perennial


In [19]:
ps = PorterStemmer()

In [20]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [21]:
books['Tags'] = books['Tags'].apply(stem)

In [22]:
books.head()

Unnamed: 0,ISBN,Book-Title,Tags
0,002542730x,Politically Correct Bedtime Stories: Modern Ta...,jamesfinngarn 1994 johnwiley&amp;sonsinc
1,60542128,When the Storm Breaks,heatherlowel 2003 harpertorch
2,006092988x,A Tree Grows in Brooklyn,bettysmith 1998 perenni
3,60930535,The Poisonwood Bible: A Novel,barbarakingsolv 1999 perenni
4,60934417,Bel Canto: A Novel,annpatchett 2002 perenni


In [23]:
len(books)

1581

## Data Vectorization

In [25]:
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [26]:
vectors = cv.fit_transform(books['Tags']).toarray()

In [27]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
vectors.shape

(1581, 835)

In [29]:
cursor.execute('''
DELETE S1 FROM books AS S1  
INNER JOIN books AS S2   
WHERE S1.title = S2.title;  
''')

## Finding Similarity between each Vectors

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
sim = cosine_similarity(vectors)

In [32]:
sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.2       ,
        0.25819889],
       [0.        , 1.        , 0.        , ..., 0.        , 0.25819889,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.33333333, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.33333333, ..., 1.        , 0.        ,
        0.        ],
       [0.2       , 0.25819889, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.25819889, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [33]:
sim.shape

(1581, 1581)

In [34]:
print(list(sim[0]))

[0.9999999999999999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25819888974716115, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25819888974716115, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25819888974716115, 0.0, 0.0, 0.0, 0.0, 0.25819888974716115, 0.0, 0.0, 0.0, 0.0, 0.0, 0.22360679774997896, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25819888974716115, 0.0, 0.25819888974716115, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25819

In [42]:
def content_recommend(book):
    idx = books[books['Book-Title'] == book].index[0]
    dist = sim[idx]
    top = sorted(list(enumerate(dist)), reverse = True, key = lambda x:x[1])[1:6]
    top = list(map(lambda x : list(df.loc[x[0]][['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication']]), top))
    return top

## Recommending Books

In [44]:
content_recommend("When the Storm Breaks")

[['61031410', 'Courting Trouble', 'Lisa Scottoline', 2003],
 ['61009059',
  'One for the Money (Stephanie Plum Novels (Paperback))',
  'Janet Evanovich',
  1995],
 ['142001740', 'The Secret Life of Bees', 'Sue Monk Kidd', 2003],
 ['345413903', 'The Murder Book', 'Jonathan Kellerman', 2003],
 ['375727132',
  "The Dive From Clausen's Pier : A Novel (Vintage Contemporaries (Paperback))",
  'ANN PACKER',
  2003]]

In [45]:
import pickle

pickle.dump(sim, open("Content_Based_Similarities.pkl", "wb"))