In [None]:
# Book recommendation program.

# Get the data

In [43]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the data: open Colab's file picker so the data set (Books_FINAL.csv, read by the
# next cell) can be uploaded to the runtime.
from google.colab import files

# files.upload() returns the uploaded files' contents. Binding the result keeps the raw
# file bytes from being echoed as this cell's output.
uploaded_files = files.upload()

In [None]:
# Store the data set.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Malformed rows are still skipped, with a warning.
df_books = pd.read_csv('Books_FINAL.csv', encoding='unicode_escape', on_bad_lines='warn')

In [60]:
# Preview the first rows to sanity-check the loaded columns
df_books.head(n=5)

Unnamed: 0,row_id,Title,Vendor,Tags,Description,Location,Category,Date,Download Link,WikiLink
0,0,"Bushido, the Soul of Japan",Inazo Nitobe,19th century,Bushido: The Soul of Japan is a book written b...,Japanese,Philosophy,1899,https://www.gutenberg.org/cache/epub/12096/pg1...,https://en.wikipedia.org/wiki/Bushido:_The_Sou...
1,1,The Beautiful and Damned,F. Scott Fitzgerald,20th century,The Beautiful and Damned is a tragic novel by ...,American,Fiction,1922,https://www.gutenberg.org/cache/epub/9830/pg98...,https://en.wikipedia.org/wiki/The_Beautiful_an...
2,2,The Count of Monte Cristo,Alexandre Dumas,19th century,The Count of Monte Cristo is an adventure nove...,French,Fiction,1845,https://www.gutenberg.org/cache/epub/1184/pg11...,https://en.wikipedia.org/wiki/The_Count_of_Mon...
3,3,The Duty Of Civil Disobedience,Henry David Thoreau,19th century,"Resistance to Civil Government, also known as ...",American,Political Philosophy,1849,https://www.gutenberg.org/cache/epub/205/pg205...,https://en.wikipedia.org/wiki/Civil_Disobedien...
4,4,The Jungle Book,Rudyard Kipling,19th century,The Jungle Book by Rudyard Kipling is a collec...,English,Fiction,1894,https://www.gutenberg.org/cache/epub/236/pg236...,https://en.wikipedia.org/wiki/The_Jungle_Book


# Clean the data

In [61]:
# Rename 'Vendor' column to 'Author'. rename() returns a new frame, so nothing is mutated
# in place and re-running the cell is harmless.
df_books = df_books.rename(columns={'Vendor': 'Author'})

# Clean the data by dropping rows with NA values in critical cells.
# 'Category' is included because it is vectorized later together with the other
# features, and a NaN there cannot be tokenized.
# The index is reset so that row labels and row positions agree after rows are dropped;
# the recommender looks books up both by label and by position.
df_books = df_books.dropna(
    subset=['WikiLink', 'Location', 'Date', 'Tags', 'Title', 'Author', 'Category']
).reset_index(drop=True)




# Feature engineering

In [62]:
# Create a function to combine the important columns / features
def combine_features(data):
    """Collapse each row of ``data`` into one space-separated string.

    Every cell is converted with ``str`` and the pieces are joined in column order.

    Parameters
    ----------
    data : pandas.DataFrame
        The columns to merge.

    Returns
    -------
    The result of the row-wise ``apply``: one joined string per row of ``data``.
    """
    def _join_row(row):
        return ' '.join(str(cell) for cell in row)

    return data.apply(_join_row, axis=1)

In [None]:
# Build the function that will use the wikiLink column to scrape wiki for the description
# of the book to be put in the subject column

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import unicodedata

def summarize_wiki(url, max_words=500, timeout=10):
    """Return up to ``max_words`` ASCII words from the opening paragraphs of a wiki page.

    The page is downloaded, the ``<p>`` elements inside the ``mw-content-text`` div are
    read in order, and their words are collected until the word budget is used up. A
    paragraph that does not fit entirely is truncated to the remaining budget, so a
    long opening paragraph still yields a summary instead of an empty string.

    Parameters
    ----------
    url : str
        Address of the page to summarize.
    max_words : int, optional
        Maximum number of words in the returned summary.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    str or None
        The space-joined summary (possibly empty), or ``None`` if the page could not
        be fetched or parsed. Failures are reported with ``print`` rather than raised,
        so one bad link does not abort a scrape over a whole column.
    """
    try:
        # Without a timeout a single unresponsive server would stall the whole scrape.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Parse the HTML and locate the main article container
        soup = BeautifulSoup(response.text, 'html.parser')
        main_content = soup.find('div', {'id': 'mw-content-text'})
        paragraphs = main_content.find_all('p') if main_content else []

        summary_words = []
        for p in paragraphs:
            remaining = max_words - len(summary_words)
            if remaining <= 0:
                break

            # Strip accents / non-ASCII characters from every word of the paragraph.
            normalized = (
                unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8')
                for word in re.findall(r'\b\w+\b', p.get_text())
            )
            # A word made only of non-ASCII characters collapses to '' above; drop it so
            # it neither consumes the budget nor leaves doubled spaces in the summary.
            words = [word for word in normalized if word]

            summary_words.extend(words[:remaining])

        return ' '.join(summary_words)
    except Exception as e:
        # Deliberately broad: any fetch or parse problem is reported and mapped to None.
        print(f"Error fetching or summarizing the content: {e}")
        return None



# Example usage: requests the Wikipedia article below (needs network access) and shows
# a summary capped at 50 words as the cell output.
wiki_url = 'https://en.wikipedia.org/wiki/Bushido:_The_Soul_of_Japan'
summarize_wiki(wiki_url, max_words=50)

In [None]:
# Scrape a short summary for every book. summarize_wiki returns None when a page cannot
# be fetched or parsed; store an empty string instead, because the 'Subject' column is
# later passed to CountVectorizer, which cannot tokenize None.
df_books['Subject'] = df_books['WikiLink'].apply(
    lambda url: (summarize_wiki(url, max_words=150) or '') if pd.notnull(url) else ''
)
df_books['Subject'].head(1)

In [None]:
# Join 'Location' and 'Date' into a single text feature per book
df_books['Location_Date'] = combine_features(df_books.loc[:, ['Location', 'Date']])
df_books['Location_Date'].head(n=1)

# Recommendations

In [None]:
def _combined_similarity(df_books):
    """Build the weighted book-to-book similarity matrix used by ``recommend_books``."""
    def _cosine(column):
        # Bag-of-words counts for one text column, compared row against row.
        # Missing values are treated as empty text so they cannot break tokenization.
        text = df_books[column].fillna('').astype(str)
        return cosine_similarity(CountVectorizer().fit_transform(text))

    # Weight the values: author 0.18, location/date 0.05, subject 0.72, category 0.05.
    return (0.18 * _cosine('Author') + 0.05 * _cosine('Location_Date')
            + 0.72 * _cosine('Subject') + 0.05 * _cosine('Category'))


def recommend_books(book_id, df_books, top_n=5, combined_scores=None):
    """Return the ``top_n`` books most similar to the book at row position ``book_id``.

    Parameters
    ----------
    book_id : int
        0-based row position of the query book in ``df_books``.
    df_books : pandas.DataFrame
        Must contain 'row_id', 'Title' and 'Author'. When ``combined_scores`` is not
        given it must also contain the text columns 'Location_Date', 'Subject' and
        'Category', from which the similarity matrix is built.
    top_n : int, optional
        Number of recommendations to return.
    combined_scores : 2-D array-like, optional
        Precomputed similarity matrix with one row and one column per book. Pass it
        when asking for recommendations for many books so the matrix is not rebuilt
        on every call.

    Returns
    -------
    list of tuple
        ``(row_id, title, author, score)`` tuples, best match first. The query book
        itself is never included.

    Raises
    ------
    IndexError
        If ``book_id`` is not a valid row position of ``df_books``.
    """
    if not 0 <= book_id < len(df_books):
        raise IndexError(
            f"book_id {book_id} is out of range for a data set of {len(df_books)} books"
        )

    if combined_scores is None:
        combined_scores = _combined_similarity(df_books)

    # Exclude the query book by position rather than assuming it sorts first: ties
    # with an identical book, or an empty feature vector (self-similarity 0), can put
    # another book ahead of it.
    candidates = [
        (position, score)
        for position, score in enumerate(combined_scores[book_id])
        if position != book_id
    ]

    # Sort the similar books in descending order of score
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Extract the top N recommended books
    return [
        (df_books.row_id.iloc[position], df_books.Title.iloc[position],
         df_books.Author.iloc[position], score)
        for position, score in candidates[:top_n]
    ]

# Example usage: put in a book number between 0 and 74. recommend_books uses it to pick
# the row of df_books to find recommendations for; the resulting list is printed.
book_id = 2
recommended_books = recommend_books(book_id, df_books, top_n=5)
print(recommended_books)



In [77]:
# Populate 'Related_Products' with the titles of each book's top 5 recommendations.
#
# recommend_books addresses books by row position, so the index is reset first to make
# labels and positions agree. The loop walks positions instead of trusting the 'row_id'
# column, which stops matching positions as soon as any row was dropped during cleaning.
df_books = df_books.reset_index(drop=True)

related_products = []
for position in range(len(df_books)):
    recommended_books = recommend_books(position, df_books, top_n=5)

    # Each recommendation is a (row_id, title, author, score) tuple; keep the titles.
    related_products.append('; '.join(item[1] for item in recommended_books))

df_books['Related_Products'] = related_products



# Download items

In [78]:
# Save the updated DataFrame to a new CSV file (index=False leaves the row index out)
df_books.to_csv('test.csv', index=False)

# Download the CSV file that was just written, using the Colab files helper
from google.colab import files
files.download('test.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>