In [2]:
#importing all the necessary libraries required to create the model

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#Load the dataset
books = pd.read_csv('./Dataset/books.csv')
books_new = pd.read_csv('./Dataset/books_new.csv')

#checking the loaded dataset1
print(books)

#checking the loaded dataset2
print(books_new)

# Merging both the datasets 
df = pd.concat([books,books_new])

df

# Checking For null Values
df.isna().sum()

# Checking For data distribution and some statistics
df.describe()

#dropping duplicate rows if any
df.drop_duplicates(inplace=True)

# Checking whether there were any duplicates or not
df.shape

# Filling Null Values with Unknown Value
df['SubGenre'] = df['SubGenre'].fillna('Unknown')

df

# Filling Null Values with Unknown Value
df['Author'] = df['Author'].fillna('Unknown Author')

# Filling Null Values with Unknown Value
df['Publisher'] = df['Publisher'].fillna('Unknown Publisher')

df

# Checking if all the null values are filled
df.info()

#copying data into another variable to try some ideas
df1 = df.copy()

# Trying to fill SubGenre Null values to a predicted subgenre value


#label endoding the categorical fields
labelencoders = {}
categorical_columns = ['Title','Author','Genre','Publisher','SubGenre']

for columns in categorical_columns:
    labelencoders[columns] = LabelEncoder()
    df1[columns] = labelencoders[columns].fit_transform(df1[columns])
    
# creating different variables to distinguish null valued subgenre rows and non-null value rows    
books_unknown = df1[df1.SubGenre ==0]
books_known = df1[df1['SubGenre']!=0]

#label encoding subgenre column in known subgenre variable
subgenre_encoder = LabelEncoder()
subgenre_endocded = subgenre_encoder.fit_transform(books_known['SubGenre'])

# preparing training and testing data for training and testing
X = books_known.drop(columns = ['SubGenre'])
y = books_known['SubGenre']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=5)

# training the model
clf = RandomForestClassifier()
clf.fit(X_train,y_train)

#testing the model
X_unknown = books_unknown.drop(columns=['SubGenre'])

books_unknown['SubGenre'] = subgenre_encoder.inverse_transform(clf.predict(X_unknown))

# Combined the datasets back
df2 = pd.concat([books_known, books_unknown])

#converted back the categorical features to original labels
for column in categorical_columns:
    df2[column] = labelencoders[column].inverse_transform(df2[column])

# Output of cleaned dataset
print(df2.head())

# printing the new dataframe
df2

df2['GenreCombined'] = df2['Genre'] + ',' + df2['SubGenre']

df2

# Again Copying the dataset

df3 = df2.copy()
df3

# combining Genre and SubGenre Columns
df3['GenreCombined'].replace(r',Unknown',r'',regex=True,inplace=True)

df3

# Dropping Genre and SubGenre columns because they are not needed anymore
df3.drop(columns=['Genre','SubGenre'],inplace=True)

df3.to_csv('./Dataset/Books_merged.csv',)

df3

# # Used TF-IDF Vectorizer to convert genre_combined into a matrix of token counts
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df3['GenreCombined'])

# finding similarity between all books based on Genre
similarity= cosine_similarity(tfidf_matrix,tfidf_matrix)

print(list(enumerate(similarity)))

# Function to get book recommendations based on genre and sub-genre
def get_recommendations(title, cosine_sim=similarity):
    # Get the index of the book that matches the title
    index = df3[df3['Title'] == title].index[0]

    # Get the pairwise similarity scores of all books with that book
    similarity_scores = list(enumerate(cosine_sim[index]))

    # Sort the books based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar books
    similarity_indices = [i[0] for i in similarity_scores[1:8]]

    # Return the top 10 most similar books
    return df3.iloc[similarity_indices]


# Testing the model

recommendation = get_recommendations('Data Smart')

recommendation

recommendation2 = get_recommendations('Orientalism')
recommendation2

recommendation3 = get_recommendations('Fundamentals of Wavelets')
recommendation3

genre_books = df3[df3['GenreCombined'].str.contains('data_science', case=False)]


def get_genre_recommendations(input_genre):
    # Filter books based on the input genre
    genre_books = df3[df3['GenreCombined'].str.contains(input_genre, case=False)]
    
    # Check if the genre exists in the dataset
    if genre_books.empty:
        print(f"No books found for the genre '{input_genre}'.")
        return pd.DataFrame()
    
    # Get indices of books that match the input genre
    genre_indices = genre_books.index
    
    index_list = genre_books.index

    Title = df3.iloc[index_list[0]]['Title']
    
    recommendations = get_recommendations(Title)
    
    return recommendations

# Example: Get recommendations for a specific genre
input_genre = 'data_science'
recommendations = get_genre_recommendations(input_genre)
recommendations
# Display the recommended books if found
# if not recommendations.empty:
#     for i in recommendations:
#         print(i)

# Encode the Author's name
label_encoder = LabelEncoder()
df3['Author_encoded'] = label_encoder.fit_transform(df3['Author'].astype(str))

# Use TF-Idf3 Vectorizer to convert GenreCombined into a matrix of token counts
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df3['GenreCombined'])

# Reshape Author_encoded to be a 2D array (required for cosine_similarity)
Author_encoded_reshaped = df3['Author_encoded'].values.reshape(-1, 1)

# Compute cosine similarity between all books based on GenreCombined and Authors
cosine_sim_genre = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_Author = cosine_similarity(Author_encoded_reshaped, Author_encoded_reshaped)

# Function to get book recommendations based on genre, title, and Author
def get_recommendations(title, input_genre=None, input_Author=None, df3=df3, cosine_sim_genre=cosine_sim_genre, cosine_sim_Author=cosine_sim_Author):
    # Check if the title exists in the dataset
    if title not in df3['Title'].values:
        print(f"The book titled '{title}' was not found in the dataset.")
        return pd.DataFrame()

    # Get the index of the book that matches the title
    idx = df3[df3['Title'] == title].index[0]

    # Initialize a list to store the final similarity scores
    final_sim_scores = []

    # Calculate similarity scores based on genre
    if input_genre:
        genre_books = df3[df3['GenreCombined'].str.contains(input_genre, case=False)]
        genre_indices = genre_books.index
        genre_sim_scores = list(enumerate(cosine_sim_genre[idx]))
        genre_sim_scores = [score for score in genre_sim_scores if score[0] in genre_indices]
        final_sim_scores.extend(genre_sim_scores)

    # Calculate similarity scores based on Author
    if input_Author:
        Author_books = df3[df3['Author'] == input_Author]
        Author_indices = Author_books.index
        Author_sim_scores = enumerate(cosine_sim_Author[idx])
        Author_sim_scores = listt()
        Author_sim_scores = [score for score in Author_sim_scores if score[0] in Author_indices]
        final_sim_scores.extend(Author_sim_scores)

    # Calculate similarity scores based on title (same genre and Author)
    title_sim_scores = list(enumerate((cosine_sim_genre[idx] + cosine_sim_Author[idx]) / 2))
    final_sim_scores.extend(title_sim_scores)

    # Remove duplicates and sort the books based on the combined similarity scores
    final_sim_scores = list(set(final_sim_scores))
    final_sim_scores = sorted(final_sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar books
    sim_indices = [i[0] for i in final_sim_scores[1:7]]

    # Return the top 10 most similar books
    return df3.iloc[sim_indices]

# Example: Get recommendations for a specific book title, genre, and Author
book_title = 'Data Smart'
input_genre = 'data_science'
input_Author = 'John'
recommendations = get_recommendations(book_title, input_genre, input_Author)

# Display the recommended books if found
if not recommendations.empty:
    print(recommendations[['Title', 'Author', 'GenreCombined', 'Publisher', 'Height']])



In [69]:
#importing all the necessary libraries required to create the model

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#Load the dataset
books = pd.read_csv('./Dataset/books.csv')
books_new = pd.read_csv('./Dataset/books_new.csv')


# Merging both the datasets 
df = pd.concat([books,books_new])

#dropping duplicate rows if any
df.drop_duplicates(inplace=True)

# Filling Null Values with Unknown Value
df['SubGenre'] = df['SubGenre'].fillna('Unknown')


# Filling Null Values with Unknown Value
df['Author'] = df['Author'].fillna('Unknown Author')

# Filling Null Values with Unknown Value
df['Publisher'] = df['Publisher'].fillna('Unknown Publisher')


#copying data into another variable to try some ideas
df1 = df.copy()

# Trying to fill SubGenre Null values to a predicted subgenre value


#label endoding the categorical fields
labelencoders = {}
categorical_columns = ['Title','Author','Genre','Publisher','SubGenre']

for columns in categorical_columns:
    labelencoders[columns] = LabelEncoder()
    df1[columns] = labelencoders[columns].fit_transform(df1[columns])
    
# creating different variables to distinguish null valued subgenre rows and non-null value rows    
books_unknown = df1[df1.SubGenre ==0]
books_known = df1[df1['SubGenre']!=0]

#label encoding subgenre column in known subgenre variable
subgenre_encoder = LabelEncoder()
subgenre_endocded = subgenre_encoder.fit_transform(books_known['SubGenre'])

# preparing training and testing data for training and testing
X = books_known.drop(columns = ['SubGenre'])
y = books_known['SubGenre']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=5)

# training the model
clf = RandomForestClassifier()
clf.fit(X_train,y_train)

#testing the model
X_unknown = books_unknown.drop(columns=['SubGenre'])

books_unknown['SubGenre'] = subgenre_encoder.inverse_transform(clf.predict(X_unknown))

# Combined the datasets back
df2 = pd.concat([books_known, books_unknown])

#converted back the categorical features to original labels
for column in categorical_columns:
    df2[column] = labelencoders[column].inverse_transform(df2[column])


df2['GenreCombined'] = df2['Genre'] + ',' + df2['SubGenre']


# Again Copying the dataset

df3 = df2.copy()


# combining Genre and SubGenre Columns
df3['GenreCombined'].replace(r',Unknown',r'',regex=True,inplace=True)



# Dropping Genre and SubGenre columns because they are not needed anymore
df3.drop(columns=['Genre','SubGenre'],inplace=True)

# # Used TF-IDF Vectorizer to convert genre_combined into a matrix of token counts
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df3['GenreCombined'])

# finding similarity between all books based on Genre
similarity= cosine_similarity(tfidf_matrix,tfidf_matrix)


genre_books = df3[df3['GenreCombined'].str.contains('data_science', case=False)]


# Encode the Author's name
label_encoder = LabelEncoder()
df3['Author_encoded'] = label_encoder.fit_transform(df3['Author'].astype(str))

# Use TF-Idf3 Vectorizer to convert GenreCombined into a matrix of token counts
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df3['GenreCombined'])

# Reshape Author_encoded to be a 2D array (required for cosine_similarity)
Author_encoded_reshaped = df3['Author_encoded'].values.reshape(-1, 1)

# Compute cosine similarity between all books based on GenreCombined and Authors
cosine_sim_genre = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_Author = cosine_similarity(Author_encoded_reshaped, Author_encoded_reshaped)

# Function to get book recommendations based on genre, title, and Author
def get_recommendations(title, input_genre=None, input_Author=None, df3=df3, cosine_sim_genre=cosine_sim_genre, cosine_sim_Author=cosine_sim_Author):
    # Check if the title exists in the dataset
    if title not in df3['Title'].values:
        print(f"The book titled '{title}' was not found in the dataset.")
        return pd.DataFrame()

    # Get the index of the book that matches the title
    idx = df3[df3['Title'] == title].index[0]

    # Initialize a list to store the final similarity scores
    final_sim_scores = []

    # Calculate similarity scores based on genre
    if input_genre:
        genre_books = df3[df3['GenreCombined'].str.contains(input_genre, case=False)]
        genre_indices = genre_books.index
        genre_sim_scores = list(enumerate(cosine_sim_genre[idx]))
        genre_sim_scores = [score for score in genre_sim_scores if score[0] in genre_indices]
        final_sim_scores.extend(genre_sim_scores)

    # Calculate similarity scores based on Author
    if input_Author:
        Author_books = df3[df3['Author'] == input_Author]
        Author_indices = Author_books.index
        Author_sim_scores = enumerate(cosine_sim_Author[idx])
        Author_sim_scores = list(Author_sim_scores)
        Author_sim_scores = [score for score in Author_sim_scores if score[0] in Author_indices]
        final_sim_scores.extend(Author_sim_scores)

    # Calculate similarity scores based on title (same genre and Author)
    title_sim_scores = list(enumerate((cosine_sim_genre[idx] + cosine_sim_Author[idx]) / 2))
    final_sim_scores.extend(title_sim_scores)

    # Remove duplicates and sort the books based on the combined similarity scores
    final_sim_scores = list(set(final_sim_scores))
    final_sim_scores = sorted(final_sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar books
    sim_indices = [i[0] for i in final_sim_scores[1:7]]

    # Return the top 10 most similar books
    return df3.iloc[sim_indices]


# Testing the model

# Example: Get recommendations for a specific book title, genre, and Author
book_title = 'Data Smart'
input_genre = 'data_science'
input_Author = 'John'
recommendations = get_recommendations(book_title, input_genre, input_Author)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_unknown['SubGenre'] = subgenre_encoder.inverse_transform(clf.predict(X_unknown))


In [70]:
recommendations

Unnamed: 0,Title,Author,Height,Publisher,GenreCombined,Author_encoded
23,"Signal and the Noise, The","Silver, Nate",233,Penguin,"tech,data_science",106
157,Neural Networks,"Haykin, Simon",240,Unknown Publisher,"tech,data_science",57
160,Statistical Learning Theory,"Vapnik, Vladimir",228,Unknown Publisher,"tech,data_science",120
10,Data Scientists at Work,Sebastian Gutierrez,230,Apress,"tech,data_science",103
22,Machine Learning for Hackers,"Conway, Drew",233,O'Reilly,"tech,data_science",13
5,"Nature of Statistical Learning Theory, The","Vapnik, Vladimir",230,Springer,"tech,data_science",120
