In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('cleaned_book_store_data.csv', delimiter=',', encoding='ISO-8859-1')

In [5]:
df.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M,User-ID,Age,Book-Rating
0,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,35,5
1,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,35,8
2,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,30,8
3,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,35,9
4,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,35,9


# New Users

## Step 1: Group books by Age and get top rated books

In [59]:
def get_popular_books_by_age(df, n=10):
    # Group by age, calculate average ratings and counts
    age_group = df.groupby('Age').agg({
        'Book-Title': lambda x: list(x),  # List of book titles read by users of that age
        'Book-Rating': 'mean',  # Average rating for the age
        'Year-Of-Publication': 'first'  # To handle recent books
    })
    return age_group[['Book-Title']]


In [61]:
popular_books_by_age = get_popular_books_by_age(df)

## Step 2: Recommend books for new users based on their age

In [73]:
def recommend_books_for_new_user(age, popular_books_by_age, n=10):
    # Ensure age is an integer for comparison
    if age in popular_books_by_age.index:
        # Retrieve and return the book titles for the specified age
        return popular_books_by_age.loc[age, 'Book-Title'][:n]  # Return the list of book titles
    else:
        return "No data available for this age group. Here are the most popular books."


In [78]:
age = 20  # Replace with the user's age
popular_books_by_age = get_popular_books_by_age(df)  # Ensure this is called to get the updated DataFrame
recommendations = recommend_books_for_new_user(age, popular_books_by_age)

if recommendations == "No data available for this age group.":
    national_popular_books = df['Book-Title'].value_counts().head(10).index.tolist()  # Get titles as a list
    recommendations = national_popular_books

# Display the recommendations
print("Recommended Books:")
recommendations

Recommended Books:


['The Testament',
 'New Vegetarian: Bold and Beautiful Recipes for Every Occasion',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus',
 'Wild Animus']

# New Books

## Create a Function to Identify New Books

In [133]:
# Function to get user IDs by author
def get_user_ids_by_author(author_name, df):
    # Retrieve unique user IDs for books by the specified author
    user_ids = df[df['Book-Author'] == author_name]['User-ID'].unique()
    return user_ids.tolist()  # Convert to list for easier handling

In [135]:
# Example author name
author_name = 'frank mccourt'  # Replace with the author's name you want to query
user_ids = get_user_ids_by_author(author_name, df)


In [137]:
# Display user IDs
if user_ids:
    # Ensure user_ids are strings for joining
    user_ids_str = [str(user_id) for user_id in user_ids]
    print(f"Recommend your new book by {author_name} to users: {', '.join(user_ids_str)}")
else:
    print(f"No users found who have rated books by {author_name}.")

Recommend your new book by frank mccourt to users: 1435, 11362, 11676, 14546, 15351, 15408, 17058, 18439, 25253, 26583, 31229, 33816, 37133, 37989, 39345, 47293, 54884, 57234, 57599, 61038, 67402, 70931, 73884, 75306, 75556, 78015, 83698, 105517, 108118, 111656, 112435, 112982, 113270, 120090, 121731, 125774, 126296, 130571, 130950, 136071, 144062, 146266, 152651, 161467, 162917, 166581, 168743, 179189, 183257, 183883, 185087, 187256, 187863, 188057, 189782, 192662, 195904, 206305, 212970, 215388, 219008, 221655, 223605, 225043, 229313, 231210, 237990, 239020, 244657, 250764, 252901, 253310, 256407, 257700, 269349, 270820, 274362, 503, 1025, 4017, 6242, 6251, 6575, 6577, 8937, 14422, 14451, 17950, 23902, 28634, 29041, 30276, 35424, 41620, 45315, 46398, 53946, 55539, 56959, 60244, 64968, 74175, 81207, 85526, 86202, 90646, 94376, 98904, 99832, 101876, 102967, 104125, 111133, 112507, 116395, 116876, 117111, 117308, 136720, 142121, 143587, 149071, 154543, 159720, 165319, 178151, 179906, 18

In [125]:
# Group by 'Book-Author' and count the number of unique 'Book-Title' entries
author_book_counts = df.groupby('Book-Author')['Book-Title'].count().reset_index()

# Rename the columns for clarity
author_book_counts.columns = ['Book-Author', 'Number of Books']

# Sort the authors by the number of books in descending order
author_book_counts = author_book_counts.sort_values(by='Number of Books', ascending=False)

# Display the result
print(author_book_counts.head(50))  # Display the top 50 authors


                     Book-Author  Number of Books
53911               stephen king             4703
28741               john grisham             3670
43375               nora roberts             3001
25061            james patterson             2387
23883              j. k. rowling             1746
39148         mary higgins clark             1713
40401           michael crichton             1705
3104                   anne rice             1630
25686            janet evanovich             1490
12770             dean r. koontz             1475
11398             danielle steel             1452
54544                sue grafton             1404
56994                 tom clancy             1225
42984            nicholas sparks             1131
11027                  dan brown             1115
2571                anita shreve             1027
4417          barbara kingsolver             1008
44409  patricia daniels cornwell              943
1430                alice sebold              928
