In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [15]:
# Per-title summary table, read from a tab-separated file in the TidyTuesday repo.
titles_url = (
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv'
)
titles = pd.read_csv(titles_url, sep='\t')
# Preview the first rows to confirm the columns parsed as expected.
titles.head()


Unnamed: 0,id,title,author,year,total_weeks,first_week,debut_rank,best_rank
0,0,"""H"" IS FOR HOMICIDE",Sue Grafton,1991,15,1991-05-05,1,2
1,1,"""I"" IS FOR INNOCENT",Sue Grafton,1992,11,1992-04-26,14,2
2,10,''G'' IS FOR GUMSHOE,Sue Grafton,1990,6,1990-05-06,4,8
3,100,A DOG'S JOURNEY,W. Bruce Cameron,2012,1,2012-05-27,3,14
4,1000,CHANGING FACES,Kimberla Lawson Roby,2006,1,2006-02-19,11,14


In [75]:
# Week-by-week list table (one row per rank per week), tab-separated,
# from the same TidyTuesday folder as the titles file.
full_url = (
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_full.tsv'
)
full = pd.read_csv(full_url, sep='\t')
# Preview the first rows to confirm the columns parsed as expected.
full.head()

Unnamed: 0,year,week,rank,title_id,title,author
0,1931,1931-10-12,1,6477,THE TEN COMMANDMENTS,Warwick Deeping
1,1931,1931-10-12,2,1808,FINCHE'S FORTUNE,Mazo de la Roche
2,1931,1931-10-12,3,5304,THE GOOD EARTH,Pearl S. Buck
3,1931,1931-10-12,4,4038,SHADOWS ON THE ROCK,Willa Cather
4,1931,1931-10-12,5,3946,SCARMOUCHE THE KING MAKER,Rafael Sabatini


In [76]:
# Parse the textual `week` column into real datetimes so min/max and date
# arithmetic work downstream. Stored in a new column; `week` is left untouched.
full['week2'] = pd.to_datetime(full['week'])

# Spot-check a few random rows. The seed is fixed so that Restart & Run All
# shows the same rows every time instead of a different sample per run.
full.sample(5, random_state=0)

Unnamed: 0,year,week,rank,title_id,title,author,week2
54717,2013,2013-10-06,15,3840,ROBERT B. PARKER’S DAMNED IF YOU DO,Michael Brandman,2013-10-06
4424,1945,1945-12-09,10,142,A LION IS IN THE STREETS,Adria Locke Langley,1945-12-09
53790,2012,2012-08-19,15,6190,THE SANDCASTLE GIRLS,Chris Bohjalian,2012-08-19
42366,1998,1998-04-26,7,3453,PARADISE,Toni Morrison,1998-04-26
7207,1949,1949-04-10,9,4799,THE CHAIN,Paul I. Wellman,1949-04-10


In [77]:
# One row per (author, title): how many weekly rows the title has, plus the
# earliest and latest week it appears in `full`.
per_title = full.groupby(by=['author', 'title']).agg(
    number_of_appearances=("week2", "count"),
    first_week=("week2", "min"),
    last_week=("week2", "max"),
)
# Bring the group keys back as columns and put the most frequent titles first.
authors = per_title.reset_index().sort_values(
    by='number_of_appearances', ascending=False
)
authors.head()

Unnamed: 0,author,title,number_of_appearances,first_week,last_week
1675,Dr. Seuss,"OH, THE PLACES YOU'LL GO!",178,1990-02-25,2000-07-09
3176,James Redfield,THE CELESTINE PROPHECY,165,1994-03-06,1997-06-15
1129,Dan Brown,THE DA VINCI CODE,165,2003-04-06,2006-06-18
6060,Robert James Waller,THE BRIDGES OF MADISON COUNTY,164,1992-08-16,1995-10-08
464,Anthony Doerr,ALL THE LIGHT WE CANNOT SEE,132,2014-05-25,2017-01-15


In [78]:
# Roll the per-title table up to one row per author:
#   number_of_appearances - total weekly rows across all of the author's titles
#   number_of_books       - number of distinct (author, title) rows
#   first_week/last_week  - earliest and latest week across all titles
authors2 = (
    authors.groupby(by='author')
    .agg(
        number_of_appearances=pd.NamedAgg(column="number_of_appearances", aggfunc="sum"),
        number_of_books=pd.NamedAgg(column="number_of_appearances", aggfunc="count"),
        first_week=pd.NamedAgg(column="first_week", aggfunc="min"),
        last_week=pd.NamedAgg(column="last_week", aggfunc="max"),
    )
    .reset_index()
    .sort_values(by='number_of_appearances', ascending=False)
)

# Span between first and last appearance, expressed in years.
# np.timedelta64(1, 'Y') is an ambiguous calendar unit and is rejected by
# pandas 2.x, so divide by an explicit fixed-length year instead. 365.2425 days
# is the mean Gregorian year, the same length older pandas used for 'Y', so
# the resulting values are unchanged.
authors2["active_year"] = (
    (authors2["last_week"] - authors2["first_week"]) / pd.Timedelta(days=365.2425)
)
# Calendar year of the author's first appearance (used to colour the scatter plot).
authors2["First Year in BSL"] = authors2["first_week"].dt.year
authors2


Unnamed: 0,author,number_of_appearances,number_of_books,first_week,last_week,active_year,First Year in BSL
382,Danielle Steel,957,116,1980-11-02,2020-12-06,40.093910,1980
1932,Stephen King,892,54,1977-03-27,2020-09-13,43.467012,1977
1080,John Grisham,789,35,1991-03-17,2020-12-06,29.725456,1991
1997,Taylor Caldwell,524,27,1938-09-26,1981-05-17,42.640164,1938
894,James A. Michener,477,16,1949-02-27,1993-01-31,43.926980,1949
...,...,...,...,...,...,...,...
1260,Lana Del Rey,1,1,2020-10-18,2020-10-18,0.000000,2020
1261,Lara Adrian,1,1,2012-02-12,2012-02-12,0.000000,2012
1269,Lars Kepler,1,1,2011-07-17,2011-07-17,0.000000,2011
1276,Lauren Graham,1,1,2013-05-19,2013-05-19,0.000000,2013


In [79]:
# Summary statistics of the `year` column (count, mean, spread, min/max).
full['year'].describe()

count    60386.000000
mean      1980.291375
std         24.440366
min       1931.000000
25%       1958.000000
50%       1982.000000
75%       2001.000000
max       2020.000000
Name: year, dtype: float64

In [80]:
# Scatter of books vs. total appearances, one point per author, coloured by
# the year of the author's first appearance.
fig = px.scatter(
    authors2,
    x='number_of_books',
    y='number_of_appearances',
    color='First Year in BSL',
    hover_name='author',
    template='none',
    title='Number of Books and Appearances in NYT Best-Sellers List <br><sup>1931-2020</sup>',
)
fig.update_xaxes(title="Number of Books in NYT Best-Sellers list")
fig.update_yaxes(title="Total Appearances in NYT Best-Sellers list")

# custom layout
# `titlefont` is a deprecated axis attribute that newer plotly releases reject;
# the supported spelling is `title.font`. update_layout merges nested dicts, so
# the axis title text set above is preserved and only the font size changes.
axis_style = dict(
    tickfont=dict(size=16),
    title=dict(font=dict(size=16)),
)
fig.update_layout(
    font_family="Oswald",
    title_font_family="Oswald",
    title_font_size=20,
    yaxis=axis_style,
    xaxis=axis_style,
)
fig.show()