<a href="https://colab.research.google.com/github/TamannaAhmad/library-recommendations/blob/main/hybrid_recommendations_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('wordnet')         #WordNet synsets and lemmas
nltk.download('stopwords')       #stopwords
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
import pandas as pd

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gryff\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gryff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gryff\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#load data
books = pd.read_csv('data/books.csv')  #read the books file
courses = pd.read_csv('data/courses.csv')  #read the courses file

In [4]:
try:
  books = books.drop(['Unnamed: 5'], axis = 1)
  books = books.drop(['Unnamed: 6'], axis = 1)
except:
  pass

In [5]:
books.head()

Unnamed: 0,book_id,title,author,edition,pub_year
0,1,Software Engineering - A Practitioners Approach,Roger S. Pressman,7.0,
1,2,Software Project Management,"Bob Hughes, Mike Cotterell, Rajib Mall",6.0,2018.0
2,3,An Integrated Approach to Software Engineering,Pankaj Jalote,,
3,4,Software Engineering: Principles and Practice,Hans van Vliet,3.0,2010.0
4,5,Data Communications and Networking,Behrouz A. Forouzan,5.0,2013.0


In [6]:
#preprocess and combine features
def preprocess_text(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])  #remove punctuation
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])  #stem and remove stopwords
    return text

#function for content-based recommending
def recommend_books_by_syllabus(course_keywords, books_df, tfidf_matrix):
    course_vec = tfidf.transform([course_keywords])
    similarities = cosine_similarity(course_vec, tfidf_matrix).flatten()
    indices = similarities.argsort()
    recommended_books = books_df.iloc[indices]
    return filter_recommendations(recommended_books, course_keywords)

#filter recommendations for accuracy
def filter_recommendations(recommended_books, course_keywords):
    return recommended_books[recommended_books['title'].str.contains(course_keywords, case=False)]

books['processed_title'] = books['title'].apply(preprocess_text)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['processed_title'])

#example
course_keywords = "java"
recommended_books = recommend_books_by_syllabus(course_keywords, books, tfidf_matrix)
recommended_books = recommended_books.drop(['processed_title'], axis = 1)
print(recommended_books)


     book_id                         title  \
125      126  Java: The Complete Reference   
126      127                     Java EE 5   

                                                author  edition  pub_year  
125                  Herbert Schildt, Dr. Danny Coward     13.0       NaN  
126  Ivan Bayross, Vaishali Shah, Sharanam Shah, Cy...      2.0       NaN  


# Sample Data for Collaborative Recommendation System

In [7]:
import pandas as pd

books_df = pd.read_csv('data/books.csv')
users_df = pd.read_csv('data/sample user data.csv')  #sample user data

In [8]:
merged_df = users_df.merge(books_df, on='book_id', how='left')
print(merged_df.head())

   user_id  semester branch  book_id  \
0        1         5    AD         2   
1        1         5    AD        43   
2        1         5    AD        39   
3        1         5    AD        28   
4        2         4    AD        64   

                                               title  \
0                        Software Project Management   
1                            Artificial Intelligence   
2  Introduction to the Design and Analysis of Alg...   
3  Python Data Science Handbook: Essential Tools ...   
4                   Mathematics for Machine learning   

                                              author  edition  pub_year  
0             Bob Hughes, Mike Cotterell, Rajib Mall      6.0    2018.0  
1                 Stuart J. Russell and Peter Norvig      3.0    2015.0  
2                                      Anany Levitin      3.0    2017.0  
3                                    Jake VanderPlas      2.0    2022.0  
4  Marc Peter Deisennroth, A. Aldo Faisal, Cheng ...

In [9]:
user_book_matrix = users_df.pivot_table(index='user_id', columns='book_id', aggfunc='size', fill_value=0) #1 signifies book borrowed by user; 0 book not borrowed
print(user_book_matrix.head())

book_id  1    2    9    10   15   16   20   28   39   42   ...  80   84   99   \
user_id                                                    ...                  
1          0    1    0    0    0    0    0    1    1    0  ...    0    0    0   
2          0    0    0    0    1    1    0    0    0    0  ...    0    0    0   
3          0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4          1    0    0    0    0    0    0    0    0    0  ...    1    1    0   
5          0    0    0    0    0    0    0    0    1    0  ...    0    0    0   

book_id  103  106  112  122  124  125  127  
user_id                                     
1          0    0    0    0    0    0    0  
2          0    0    0    0    0    0    0  
3          0    1    0    0    0    0    1  
4          0    0    0    0    0    0    0  
5          1    0    0    0    0    1    0  

[5 rows x 24 columns]


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

#cosine similarity for books
book_similarity = cosine_similarity(user_book_matrix.T)
book_similarity_df = pd.DataFrame(book_similarity, index=user_book_matrix.columns, columns=user_book_matrix.columns)

print(book_similarity_df.head())

book_id  1         2         9    10   15   16        20        28   39   42   \
book_id                                                                         
1        1.0  0.000000  0.000000  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0   
2        0.0  1.000000  0.707107  0.0  0.0  0.0  0.707107  1.000000  0.5  0.0   
9        0.0  0.707107  1.000000  0.0  0.0  0.0  1.000000  0.707107  0.0  0.0   
10       0.0  0.000000  0.000000  1.0  0.0  0.0  0.000000  0.000000  0.0  0.0   
15       0.0  0.000000  0.000000  0.0  1.0  1.0  0.000000  0.000000  0.0  0.0   

book_id  ...  80   84   99   103  106  112  122  124  125  127  
book_id  ...                                                    
1        ...  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2        ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
9        ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
10       ...  0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0  
15       ...  0.0  0.0  0.0  0.0  0.0  0.0

In [11]:
import numpy as np

def hybrid_recommend_books(book_title, books_df, tfidf_matrix, tfidf, book_similarity_df, top_n = 5, threshold = 0.2):
    books_copy = books_df.copy()
    #normalise input and dataset
    books_copy['title'] = books_copy['title'].str.strip().str.lower()
    book_title = book_title.strip().lower()
    #check is title is in books_copy
    if book_title not in books_copy['title'].values:
        print(f"Book '{book_title}' not found in dataset.")
        return pd.DataFrame()
    #get index of the book
    book_idx = books_copy[books_copy['title'] == book_title].index[0]

    #content-based similarity
    content_similarities = cosine_similarity(tfidf_matrix[book_idx], tfidf_matrix).flatten()

    #collaborative similarity
    collaborative_similarities = np.zeros(len(books_df)) #default to zeros for books with no borrowing data
    book_id = books_copy.loc[book_idx, 'book_id']
    if book_id in book_similarity_df.index:
        collaborative_similarities = book_similarity_df.loc[book_id].reindex(books_copy['book_id'], fill_value=0).values

    #combine similarities with equal weights
    hybrid_similarities = (0.8 * content_similarities + 0.2 * collaborative_similarities) / 2

    #filter low similarity results
    hybrid_similarities[hybrid_similarities < threshold] = 0

    #top n recommendations
    top_indices = np.argsort(hybrid_similarities)[-top_n - 1:][::-1]  #exclude the input book
    top_indices = [idx for idx in top_indices if idx != book_idx][:top_n]

    #metadata for recommended books
    recommendations = books_df.iloc[top_indices].copy()
    recommendations['hybrid_score'] = hybrid_similarities[top_indices]

    req_data = ['book_id', 'title', 'author', 'edition', 'pub_year']
    return recommendations[req_data]

#example
book_title = "Discrete Mathematics – A Concept-based approach"
recommended_books = hybrid_recommend_books(book_title, books_df, tfidf_matrix, tfidf, book_similarity_df)
if not recommended_books.empty:
    print(recommended_books)
else:
    print("No recommendations found.")

     book_id                                      title  \
52        53  Discrete Mathematics and its Applications   
55        56     Discrete Mathematics with Applications   
133      134             Programming the World Wide Web   
48        49                Database management systems   
46        47                    Artificial Intelligence   

                    author  edition  pub_year  
52        Kenneth H. Rosen      6.0    2007.0  
55            Thomas Koshy      NaN    2005.0  
133       Robert W Sebesta      8.0    2020.0  
48    Ramakrishnan, Gehrke      3.0    2014.0  
46           Saroj Kaushik      NaN    2014.0  
