In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
df = pd.read_csv('goodreads_data.csv')

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...


In [4]:
df.shape

(10000, 8)

In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
df.columns

Index(['Book', 'Author', 'Description', 'Genres', 'Avg_Rating', 'Num_Ratings',
       'URL'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Book         10000 non-null  object 
 1   Author       10000 non-null  object 
 2   Description  9923 non-null   object 
 3   Genres       10000 non-null  object 
 4   Avg_Rating   10000 non-null  float64
 5   Num_Ratings  10000 non-null  object 
 6   URL          10000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.0+ KB


In [8]:
df.isnull().sum()

Book            0
Author          0
Description    77
Genres          0
Avg_Rating      0
Num_Ratings     0
URL             0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
fig = px.histogram(df, x='Avg_Rating', nbins=50, title='Distribution of Average Ratings', template='plotly_dark')
fig.update_layout(xaxis_title='Average Rating', yaxis_title='Count')
fig.show()

In [11]:
# Get top authors by number of books
author_counts = df['Author'].value_counts().head(10)

# Create a bar plot for top authors
fig = px.bar(
    x=author_counts.index, 
    y=author_counts.values,
    title='Top 10 Authors by Number of Books',
    labels={'x': 'Authors', 'y': 'Number of Books'},
    template='plotly_dark'
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [12]:
df.columns

Index(['Book', 'Author', 'Description', 'Genres', 'Avg_Rating', 'Num_Ratings',
       'URL'],
      dtype='object')

In [13]:
# Find the book with the highest average rating
highest_rated_book = df.loc[df['Avg_Rating'].idxmax()]

# Display the book details
print("Book with the highest average rating:")
print(f"Title: {highest_rated_book['Book']}")
print(f"Author(s): {highest_rated_book['Author']}")
print(f"Average Rating: {highest_rated_book['Avg_Rating']}")

# Show the complete record
print("\nComplete record:")
highest_rated_book


Book with the highest average rating:
Title: Joey Wheeler: The Official Character & Monster Guide
Author(s): Arthur "Sam" Murakami
Average Rating: 5.0

Complete record:


Book           Joey Wheeler: The Official Character & Monster...
Author                                     Arthur "Sam" Murakami
Description    "Check out this official character and monster...
Genres                                                        []
Avg_Rating                                                   5.0
Num_Ratings                                                    2
URL            https://www.goodreads.com/book/show/2114514.Jo...
Name: 3737, dtype: object

In [14]:
# Find the book with the lowest average rating
lowest_rated_book = df.loc[df['Avg_Rating'].idxmin()]

# Display the book details
print("Book with the lowest average rating:")
print(f"Title: {lowest_rated_book['Book']}")
print(f"Author(s): {lowest_rated_book['Author']}")
print(f"Average Rating: {lowest_rated_book['Avg_Rating']}")

# Show the complete record
print("\nComplete record:")
lowest_rated_book

Book with the lowest average rating:
Title: Broken: The Failed Promise of Muslim Inclusion
Author(s): Evelyn Alsultany
Average Rating: 0.0

Complete record:


Book              Broken: The Failed Promise of Muslim Inclusion
Author                                          Evelyn Alsultany
Description    PROSE Award- Media and Cultural Studies Finali...
Genres                                                        []
Avg_Rating                                                   0.0
Num_Ratings                                                    0
URL            https://www.goodreads.com/book/show/75268277-b...
Name: 3747, dtype: object

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Description'])

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)