In [10]:
import pandas as pd

# Read the three BX CSV files. Each one is semicolon-delimited and Latin-1
# encoded; rows pandas cannot parse are skipped.
books = pd.read_csv('BX_Books.csv', on_bad_lines='skip', encoding='latin-1', sep=';')
users = pd.read_csv('BX-Users.csv', on_bad_lines='skip', encoding='latin-1', sep=';')
ratings = pd.read_csv('BX-Book-Ratings.csv', on_bad_lines='skip', encoding='latin-1', sep=';')

# Report the (rows, columns) of every frame
print("Books:", books.shape)
print("Users:", users.shape)
print("Ratings:", ratings.shape)

# Keep only the ratings written by users who have 200 or more ratings
active_users = (
    ratings['User-ID']
    .value_counts()
    .loc[lambda counts: counts >= 200]
    .index
)
ratings_filtered = ratings.loc[ratings['User-ID'].isin(active_users)]

# Of those, keep only the books that still have 100 or more ratings
popular_books = (
    ratings_filtered['ISBN']
    .value_counts()
    .loc[lambda counts: counts >= 100]
    .index
)
ratings_filtered = ratings_filtered.loc[ratings_filtered['ISBN'].isin(popular_books)]

# Report the size of what is left after both filters
print("Filtered Ratings:", ratings_filtered.shape)

# Pivot to one row per ISBN and one column per User-ID; cells with no rating
# come out of the pivot as NaN and are replaced by 0.
book_user_matrix = ratings_filtered.pivot_table(
    values='Book-Rating',
    index='ISBN',
    columns='User-ID',
).fillna(0)

# Report the size of the resulting matrix
print("Book-User Matrix shape:", book_user_matrix.shape)







Books: (271379, 8)
Users: (278858, 3)
Ratings: (1149780, 3)
Filtered Ratings: (13793, 3)
Book-User Matrix shape: (100, 857)


In [14]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [16]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Convert the ISBN-by-user ratings table into a SciPy CSR sparse matrix.
book_user_sparse_matrix = csr_matrix(book_user_matrix.to_numpy())

# Brute-force nearest-neighbour search using cosine distance. fit() hands back
# the estimator, so construction and fitting are chained into one statement.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_user_sparse_matrix)

print("✅ KNN model trained successfully!")


✅ KNN model trained successfully!


In [21]:
def get_recommends(book_title, model=model, data=book_user_matrix, n_recommendations=5):
    """Return the nearest neighbours of one row of ``data`` according to ``model``.

    Args:
        book_title: Label to look up in ``data.index``.
            NOTE(review): the default ``book_user_matrix`` is pivoted with
            ``index='ISBN'``, so for the default data this must be an ISBN
            rather than a title - confirm the intended key.
        model: Fitted neighbours model exposing ``kneighbors``.
        data: DataFrame queried by row; presumably the same table ``model``
            was fitted on, so that returned positions map onto ``data.index``.
        n_recommendations: Maximum number of neighbours to return (default 5,
            the number the function has always returned).

    Returns:
        ``[book_title, recommendations]`` where ``recommendations`` is a list of
        ``[label, distance]`` pairs in the order ``kneighbors`` returned them.
        The list is empty when ``book_title`` is not in ``data.index`` or when
        ``n_recommendations`` is not positive.
    """
    if book_title not in data.index:
        return [book_title, []]  # book not found

    if n_recommendations <= 0:
        return [book_title, []]

    book_index = data.index.get_loc(book_title)

    # One extra neighbour is requested because the query row is itself part of
    # the fitted data. The request is capped at the number of rows because
    # kneighbors rejects n_neighbors larger than the fitted sample count.
    n_neighbors = min(n_recommendations + 1, len(data.index))

    query = data.iloc[book_index, :].values.reshape(1, -1)
    distances, indices = model.kneighbors(query, n_neighbors=n_neighbors)

    # Drop the query row by position instead of assuming it is the first hit:
    # with tied distances (e.g. identical rating vectors) it can appear later,
    # and blindly skipping element 0 would discard a real recommendation.
    recommended_books = [
        [data.index[neighbor_pos], dist]
        for neighbor_pos, dist in zip(indices[0], distances[0])
        if neighbor_pos != book_index
    ]

    return [book_title, recommended_books[:n_recommendations]]


In [23]:
# Show the first 20 row labels (ISBNs) of the pivot as a plain list
list(book_user_matrix.index[:20])


['0060502258',
 '0060928336',
 '0060930535',
 '0060934417',
 '0060976845',
 '0060987103',
 '0061009059',
 '006101351X',
 '014028009X',
 '0140293248',
 '0142001740',
 '0156027321',
 '0312195516',
 '0312278586',
 '0312924585',
 '0312966091',
 '0312983271',
 '0316096199',
 '0316284955',
 '0316601950']

In [30]:
import pandas as pd

# Read the users table (semicolon-delimited, Latin-1 encoded).
# The path is relative to the working directory, like the first load cell of
# this notebook, so the cell is not tied to one machine's folder layout.
users = pd.read_csv('BX-Users.csv',
                    encoding='latin-1',
                    sep=';')

print(users.head())
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() adds a stray "None" line after the summary.
users.info()


   User-ID                            Location   Age
0        1                  nyc, new york, usa   NaN
1        2           stockton, california, usa  18.0
2        3     moscow, yukon territory, russia   NaN
3        4           porto, v.n.gaia, portugal  17.0
4        5  farnborough, hants, united kingdom   NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB
None


In [32]:
import pandas as pd

# Load the book ratings data (semicolon-delimited, Latin-1 encoded).
# The path is relative to the working directory, like the first load cell of
# this notebook, so the cell is not tied to one machine's folder layout.
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='latin-1', sep=';')

# Show the first few rows
print(ratings.head())

# Show data summary. DataFrame.info() prints its own report and returns None,
# so it is called directly; print(ratings.info()) would append a stray "None".
ratings.info()


   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
None


In [39]:
# Filter users who have rated at least 100 books
active_users = ratings['User-ID'].value_counts()
active_users = active_users[active_users >= 100].index
ratings_filtered = ratings[ratings['User-ID'].isin(active_users)]

# Filter books that have at least 50 ratings
popular_books = ratings_filtered['ISBN'].value_counts()
popular_books = popular_books[popular_books >= 50].index
# .copy() gives an independent frame, so the ISBN clean-up below does not write
# into a boolean-filtered slice of `ratings` (the SettingWithCopyWarning case).
ratings_filtered = ratings_filtered[ratings_filtered['ISBN'].isin(popular_books)].copy()

# Load the book metadata (titles, authors, ...). Malformed rows are reported as
# warnings. The path is relative to the working directory, like the first load
# cell of this notebook, instead of an absolute path on one machine.
books = pd.read_csv('BX_Books.csv',
                    encoding='latin-1', sep=';', on_bad_lines='warn')

# Remove potential whitespace from ISBN so both sides of the join use the same key
ratings_filtered['ISBN'] = ratings_filtered['ISBN'].str.strip()
books['ISBN'] = books['ISBN'].str.strip()

# Merge ratings with books on ISBN to attach the book columns to every rating
ratings_with_books = ratings_filtered.merge(books, on='ISBN')

# Show result
print(ratings_with_books.head())
print(f"Shape of merged data: {ratings_with_books.shape}")


   User-ID        ISBN  Book-Rating  \
0   276925  002542730X           10   
1   276925  0316666343            0   
2   276925  0385504209            8   
3   276925  0804106304            0   
4   276925  0971880107            0   

                                          Book-Title        Book-Author  \
0  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner   
1                          The Lovely Bones: A Novel       Alice Sebold   
2                                  The Da Vinci Code          Dan Brown   
3                                  The Joy Luck Club            Amy Tan   
4                                        Wild Animus       Rich Shapero   

   Year-Of-Publication              Publisher  \
0                 1994  John Wiley & Sons Inc   
1                 2002          Little, Brown   
2                 2003              Doubleday   
3                 1994   Prentice Hall (K-12)   
4                 2004                Too Far   

                   

In [40]:
# Step 1: Clean ISBN columns.
# assign() builds new frames instead of writing a column into the existing
# ones: `ratings_filtered` was produced by boolean filtering, and an in-place
# column write on such a slice can raise SettingWithCopyWarning. Rebinding the
# names also keeps this cell safe to re-run.
ratings_filtered = ratings_filtered.assign(ISBN=ratings_filtered['ISBN'].str.strip())
books = books.assign(ISBN=books['ISBN'].str.strip())

# Step 2: Merge ratings with books to include titles
ratings_with_books = ratings_filtered.merge(books, on='ISBN')

# Step 3: Display a few rows to confirm
print(ratings_with_books[['User-ID', 'ISBN', 'Book-Title', 'Book-Rating']].head())


   User-ID        ISBN                                         Book-Title  \
0   276925  002542730X  Politically Correct Bedtime Stories: Modern Ta...   
1   276925  0316666343                          The Lovely Bones: A Novel   
2   276925  0385504209                                  The Da Vinci Code   
3   276925  0804106304                                  The Joy Luck Club   
4   276925  0971880107                                        Wild Animus   

   Book-Rating  
0           10  
1            0  
2            8  
3            0  
4            0  


In [52]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Step 1: Pivot the merged ratings into one row per user and one column per
# book title; user/title pairs with no rating are filled with 0.
user_item_pivot = ratings_with_books.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='Book-Title',
    fill_value=0,
)

# Step 2: Hold the table as a SciPy CSR sparse matrix
user_item_sparse = csr_matrix(user_item_pivot.to_numpy())

# Step 3: Fit a brute-force, cosine-distance nearest-neighbours model on the
# user rows
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(user_item_sparse)


In [63]:
# Step 1: Pivot the merged ratings into one row per book title and one column
# per user; title/user pairs with no rating are filled with 0.
book_user_pivot = ratings_with_books.pivot_table(
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
    fill_value=0,
)

# Step 2: Hold the table as a SciPy CSR sparse matrix
book_user_sparse = csr_matrix(book_user_pivot.to_numpy())

# Step 3: Fit a brute-force, cosine-distance nearest-neighbours model on the
# book rows
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(book_user_sparse)


In [64]:
def recommend_books(book_title, n_recommendations=5):
    """Return titles whose rating vectors are closest to ``book_title``.

    The title is looked up in the module-level ``book_user_pivot`` table and its
    row is used to query the module-level ``knn_model``; the queried title is
    removed from the neighbours that come back.

    Args:
        book_title: Title to look up in ``book_user_pivot.index`` (exact match).
        n_recommendations: Maximum number of titles to return.

    Returns:
        A list of at most ``n_recommendations`` titles, in the order
        ``knn_model.kneighbors`` returned them, or an error-message string when
        ``book_title`` is not in the index. The list is empty when
        ``n_recommendations`` is not positive.
    """
    if book_title not in book_user_pivot.index:
        return f"❌ '{book_title}' not found in the book list."

    # Nothing to look up; also keeps a zero/negative count away from kneighbors.
    if n_recommendations <= 0:
        return []

    book_vector = book_user_pivot.loc[book_title].values.reshape(1, -1)

    # One extra neighbour is requested because the queried book is itself a row
    # of the fitted data. The request is capped at the number of books because
    # kneighbors raises ValueError when asked for more neighbours than rows.
    n_neighbors = min(n_recommendations + 1, len(book_user_pivot.index))
    _, indices = knn_model.kneighbors(book_vector, n_neighbors=n_neighbors)

    # Filter by title rather than by position, so the queried book is removed
    # even when it is not the first neighbour returned.
    recommended_books = [
        book_user_pivot.index[i]
        for i in indices.flatten()
        if book_user_pivot.index[i] != book_title
    ]

    return recommended_books[:n_recommendations]


In [66]:

# Try the recommender on a single title and print whatever it returns
# (a list of titles, or the "not found" message string).
print(recommend_books("The Lovely Bones: A Novel"))


["Where the Heart Is (Oprah's Book Club (Paperback))", 'Lucky', "The Book of Ruth (Oprah's Book Club (Paperback))", "Suzanne's Diary for Nicholas", 'Nights in Rodanthe']


In [67]:
# Interactive prompt: keep asking for titles until the user types "exit".
while True:
    # Surrounding whitespace in the typed title is ignored
    user_input = input("\n📚 Enter a book title (or type 'exit' to quit): ").strip()

    # "exit" in any letter case ends the session
    if user_input.lower() == 'exit':
        print("👋 Exiting recommendation system. Goodbye!")
        break

    recommendations = recommend_books(user_input)

    # recommend_books returns a plain string (its error message) instead of a
    # list when the title is unknown; show it and prompt again.
    if isinstance(recommendations, str):
        print(recommendations)
        continue

    print("\n🔎 Top recommended books:")
    for idx, title in enumerate(recommendations, start=1):
        print(f"{idx}. {title}")



📚 Enter a book title (or type 'exit' to quit):  The Lovely Bones: A Novel



🔎 Top recommended books:
1. Where the Heart Is (Oprah's Book Club (Paperback))
2. Lucky
3. The Book of Ruth (Oprah's Book Club (Paperback))
4. Suzanne's Diary for Nicholas
5. Nights in Rodanthe



📚 Enter a book title (or type 'exit' to quit):  exit


👋 Exiting recommendation system. Goodbye!


In [68]:
pip install streamlit


Note: you may need to restart the kernel to use updated packages.


2025-05-20 23:28:52.006 
  command:

    streamlit run C:\Users\vishw\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-20 23:28:52.015 Session state does not function when running a script without `streamlit run`


In [72]:
streamlit run 

SyntaxError: invalid syntax (1535616767.py, line 1)