In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 


In [2]:
books = pd.read_csv('books_new.csv')


In [3]:
books.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


In [4]:
books.shape

(211, 6)

In [5]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     187 non-null    object
 2   Genre      211 non-null    object
 3   SubGenre   211 non-null    object
 4   Height     211 non-null    int64 
 5   Publisher  115 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.0+ KB


In [6]:
books['Author'].fillna('Unknown', inplace=True)
books['Publisher'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['Author'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['Publisher'].fillna('Unknown', inplace=True)


In [28]:
books[books['Author'] == 'Unknown']

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher,Description
80,Beyond Degrees,Unknown,philosophy,education,222,HarperCollins,philosophy education Unknown 222
83,"World's Greatest Trials, The",Unknown,nonfiction,history,210,Unknown,nonfiction history Unknown 210
103,"World's Greatest Short Stories, The",Unknown,fiction,classic,217,Jaico,fiction classic Unknown 217
113,Selected Short Stories,Unknown,fiction,classic,215,Jaico,fiction classic Unknown 215
118,Karl Marx Biography,Unknown,nonfiction,autobiography,162,Unknown,nonfiction autobiography Unknown 162
129,Political Philosophers,Unknown,philosophy,politics,162,Unknown,philosophy politics Unknown 162
138,Final Crisis,Unknown,fiction,comic,257,Unknown,fiction comic Unknown 257
139,"Killing Joke, The",Unknown,fiction,comic,283,Unknown,fiction comic Unknown 283
140,Flashpoint,Unknown,fiction,comic,265,Unknown,fiction comic Unknown 265
141,Batman Earth One,Unknown,fiction,comic,265,Unknown,fiction comic Unknown 265


In [8]:
books[books['Author'].isnull()]

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher


In [9]:
books = books.dropna(subset=['Author'])


In [10]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     211 non-null    object
 2   Genre      211 non-null    object
 3   SubGenre   211 non-null    object
 4   Height     211 non-null    int64 
 5   Publisher  211 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.0+ KB


In [11]:
books[books['Title'].duplicated()]

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
195,Angels & Demons,"Brown, Dan",fiction,novel,170,Unknown


In [12]:
books = books[~books.duplicated(subset=['Title'],keep=False)]

In [13]:
books.shape

(209, 6)

In [14]:
books.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


In [15]:
books['Author']= books['Author'].to_list()

In [16]:
books['Genre']= books['Genre'].to_list()

In [17]:
books['SubGenre']= books['SubGenre'].to_list()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [19]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['Genre'] + ' ' + books['SubGenre'] + ' ' + books['Author'])

In [20]:
books.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

def create_cosine_similarity_matrix(df):
    # Combine the features into a single description
    df['Description'] = df['Genre'] + ' ' + df['SubGenre'] + ' ' + df['Author'] + ' ' + df['Height'].astype(str)
    
    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the description
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])
    
    # Compute the cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return cosine_sim


In [42]:
def recommend_books(df, cosine_sim, genre, subgenre, author, height):
    # Get the indices of the books matching the specified criteria
    matching_indices = df[(df['Genre'] == genre) & (df['SubGenre'] == subgenre) & (df['Author'] == author) & (df['Height'] == height)].index
    
    if len(matching_indices) == 0:
        print("No books found matching the specified criteria.")
        return None
    
    # Get the index of the first matching book
    book_index = matching_indices[0]
    
    # Get the similarity scores with other books
    sim_scores = list(enumerate(cosine_sim[book_index]))
    
    # Sort the books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top recommendations (excluding the book itself)
    top_recommendations = sim_scores[1:6]
    
    # Get the indices of the recommended books
    recommended_indices = [i[0] for i in top_recommendations]
    
    # Return the recommended books
    return df.iloc[recommended_indices][['Title', 'Author']]


In [43]:
# Assuming 'books' is your DataFrame with the provided columns
# Create the cosine similarity matrix
cosine_sim_matrix = create_cosine_similarity_matrix(books)

# Call the function to get recommendations



In [44]:
cosine_sim_matrix

array([[1.        , 0.07510286, 0.0764371 , ..., 0.        , 0.        ,
        0.        ],
       [0.07510286, 1.        , 0.08759945, ..., 0.        , 0.        ,
        0.        ],
       [0.0764371 , 0.08759945, 1.        , ..., 0.        , 0.14121914,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16372165,
        0.15252844],
       [0.        , 0.        , 0.14121914, ..., 0.16372165, 1.        ,
        0.1556766 ],
       [0.        , 0.        , 0.        , ..., 0.15252844, 0.1556766 ,
        1.        ]])

In [47]:
recommendations = recommend_books(books, cosine_sim_matrix, 'philosophy', 'education', 'Unknown', 222)
print(recommendations)


                              Title          Author
76   Radiowaril Bhashane & Shrutika  Deshpande, P L
77                  Gun Gayin Awadi  Deshpande, P L
78                     Aghal Paghal  Deshpande, P L
122                          Apulki  Deshpande, P L
159                Vyakti ani Valli  Deshpande, P L


In [None]:
# Save the recommendations DataFrame into a pickle file
recommendations.to_pickle('recommendations.pkl','wb')

# Save the cosine similarity matrix into a pickle file
with open('cosine_similarity_matrix.pkl', 'wb') as f:
    pickle.dump(cosine_sim_matrix, f)


In [31]:
# Example usage
recommended_books = recommend_books(books, '', '', '', )
print(recommended_books)

                              Title          Author       Genre SubGenre  \
76   Radiowaril Bhashane & Shrutika  Deshpande, P L  nonfiction     misc   
77                  Gun Gayin Awadi  Deshpande, P L  nonfiction     misc   
78                     Aghal Paghal  Deshpande, P L  nonfiction     misc   
122                          Apulki  Deshpande, P L  nonfiction     misc   
159                Vyakti ani Valli  Deshpande, P L  nonfiction     misc   

     Height Publisher                         Description  
76      213      Mauj  nonfiction misc Deshpande, P L 213  
77      212      Mauj  nonfiction misc Deshpande, P L 212  
78      212      Mauj  nonfiction misc Deshpande, P L 212  
122     211   Unknown  nonfiction misc Deshpande, P L 211  
159     211   Unknown  nonfiction misc Deshpande, P L 211  


In [33]:
import pickle 

In [None]:
pickle.dump(books,open('books.pkl','wb'))