In [3]:
import pandas as pd
import numpy as np
import re
import chart_studio.plotly as py
import cufflinks as cf
import seaborn as sns
import plotly.express as px
import requests
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.io as pio
%matplotlib inline

# Enable Plotly rendering inside the Jupyter notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode so figures are drawn locally.
cf.go_offline()

# Load the books dataset; the code below reads its Awards, Genres, Setting,
# Pages, Average_rating, Rating and Year columns.
df_test = pd.read_csv('big_data_books.csv')


def min_max(series, old_min=1.9, old_range=3.09, new_min=1, new_max=10):
    """Linearly rescale rating values onto the ``[new_min, new_max]`` scale.

    ``old_min`` is mapped onto ``new_min`` and ``old_min + old_range`` onto
    ``new_max``. The defaults reproduce the previously hard-coded formula
    ``1 + ((x - 1.9) / 3.09) * 9``; 1.9 and 3.09 are presumably the minimum
    and the spread of the average ratings in the dataset (TODO confirm).

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.
    old_min : float, default 1.9
        Source value that maps onto ``new_min``.
    old_range : float, default 3.09
        Width of the source interval; must be non-zero.
    new_min, new_max : float, default 1 and 10
        Bounds of the target scale.

    Returns
    -------
    pandas.Series
        Rescaled values on the same index as ``series``; NaN stays NaN.

    Raises
    ------
    ValueError
        If ``old_range`` is zero.
    """
    if old_range == 0:
        raise ValueError("old_range must be non-zero")
    # Vectorised arithmetic: same per-element operations as the former lambda.
    return new_min + ((series - old_min) / old_range) * (new_max - new_min)


def norm_series(series, old_range=3.09, new_min=1, new_max=10):
    """Mean-centre rating values and rescale them like ``min_max`` does.

    Each value becomes
    ``new_min + ((x - mean) / old_range) * (new_max - new_min)``, so the mean
    of the series maps onto ``new_min``. The defaults reproduce the previously
    hard-coded formula ``1 + ((x - mean) / 3.09) * 9``.

    The mean is computed with ``Series.mean()``, which skips missing values.
    The former ``series.values.mean()`` returned NaN as soon as one value was
    missing, which turned the whole result into NaN.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to normalise.
    old_range : float, default 3.09
        Width of the source interval; must be non-zero.
    new_min, new_max : float, default 1 and 10
        Offset and upper bound that define the target scale.

    Returns
    -------
    pandas.Series
        Normalised values on the same index as ``series``; NaN stays NaN.

    Raises
    ------
    ValueError
        If ``old_range`` is zero.
    """
    if old_range == 0:
        raise ValueError("old_range must be non-zero")
    centre = series.mean()  # NaN-skipping mean
    return new_min + ((series - centre) / old_range) * (new_max - new_min)


def best_book(name, csv_path='data_set_0.csv'):
    """Return the title(s) of the highest-rated book(s) by one author.

    Parameters
    ----------
    name : str
        Value matched exactly against the ``Author`` column.
    csv_path : str, default 'data_set_0.csv'
        CSV file to read; it must provide the ``Author``, ``Average_rating``
        and ``Title`` columns.

    Returns
    -------
    pandas.Series
        ``Title`` of every row tied for the author's maximum
        ``Average_rating``; empty when no row matches ``name``.
    """
    # Local names avoid shadowing the builtin `filter` and the notebook-level
    # `df_test` frame.
    books = pd.read_csv(csv_path)
    by_author = books[books['Author'] == name]
    is_top_rated = by_author['Average_rating'] == by_author['Average_rating'].max()
    return by_author.loc[is_top_rated, 'Title']


# Strips the list/quote punctuation of a stringified list: [ ] ' "
pattern = re.compile(r'(\[|\]|\'|\")+')
# Matches a literal backslash-n sequence or the literal "...more" marker.
# The dots are escaped: unescaped, `...more` matched ANY three characters
# followed by "more" and so also ate parts of ordinary words ("Baltimore").
pattern2 = re.compile(r'(\\n|\.\.\.more)+')


def _count_awards(raw):
    """Count the comma-separated award entries in one raw ``Awards`` value.

    Parameters
    ----------
    raw : str or missing
        Raw cell value from the ``Awards`` column.

    Returns
    -------
    int or float
        Number of comma-separated pieces after clean-up, or NaN when the value
        is missing or has at most five characters left once brackets and
        quotes are removed.
    """
    # pd.isna is a value test; `raw is np.nan` only matched one specific object.
    if pd.isna(raw):
        return np.nan
    cleaned = pattern.sub('', raw)
    # Presumably filters out empty placeholders — TODO confirm the threshold.
    if len(cleaned) <= 5:
        return np.nan
    # Every "\n" / "...more" run becomes a separator before counting.
    return len(pattern2.sub(',', cleaned).split(','))


awards_no = [_count_awards(raw) for raw in df_test['Awards']]

df_test['Awards_no'] = awards_no
#print(df_test['Awards_no'])

# Parse the stringified genre lists into real lists and tally how often each
# genre occurs across the dataset.
genres_l = []
genres_freq = {}
for raw_genres in df_test['Genres']:
    # A missing value is kept as NaN instead of crashing re.sub, as the Awards
    # and Setting parsing already do.
    if pd.isna(raw_genres):
        genres_l.append(np.nan)
        continue
    tokens = [tok.strip() for tok in re.sub(pattern, '', raw_genres).split(',')]
    # Drop empty pieces (e.g. from "[]") so they are not counted under ''.
    tokens = [tok for tok in tokens if tok]
    for tok in tokens:
        genres_freq[tok] = genres_freq.get(tok, 0) + 1
    # Store the stripped tokens so list entries match the genres_freq keys.
    genres_l.append(tokens)

df_test['Genres'] = genres_l
#print(genres_freq)

# Parse the stringified setting lists into real lists and tally how often each
# place occurs across the dataset.
places = []
places_freq = {}
for raw_setting in df_test['Setting']:
    # Missing values and raw strings of at most five characters are stored as
    # NaN. pd.isna is a value test; `is np.nan` only matched one object.
    if pd.isna(raw_setting) or len(raw_setting) <= 5:
        places.append(np.nan)
        continue
    cleaned = re.sub(pattern, '', re.sub(pattern2, '', raw_setting))
    # Strip every token: the former `.strip('')` removed nothing, so
    # ' Maryland' and 'Maryland' were tallied as two different places.
    tokens = [tok.strip() for tok in cleaned.split(',')]
    tokens = [tok for tok in tokens if tok]
    for tok in tokens:
        places_freq[tok] = places_freq.get(tok, 0) + 1
    places.append(tokens)


df_test['Setting'] = places

# Per-year groupings of the three metrics. These are SeriesGroupBy objects,
# and `new_df` is used again further down the notebook.
new_df = df_test['Pages'].groupby(df_test['Year'])
new_df2 = df_test['Average_rating'].groupby(df_test['Year'])
new_df3 = df_test['Rating'].groupby(df_test['Year'])

# Yearly means as frames with a `Year` column plus the named mean column.
# reset_index(name=...) already returns a DataFrame, so no extra wrapper.
df_new = new_df.mean().reset_index(name='pages_mean')
df_new2 = new_df2.mean().reset_index(name='average_mean')
df_new3 = new_df3.mean().reset_index(name='average_rating_count')
print(df_new3)

# plotly express looks `labels` up by column name; the former 'x'/'y' keys
# matched no column and were ignored.
fig = px.line(df_new, x='Year', y='pages_mean',
              labels={'Year': 'Year', 'pages_mean': 'Mean_pages'},
              title='Average pages per Year')

fig.show()


      Year  average_rating_count
0   1998.0         246361.000000
1   1999.0          13350.000000
2   2000.0         103842.666667
3   2001.0         147298.055556
4   2002.0         155814.480000
5   2003.0         147864.275362
6   2004.0         156587.711340
7   2005.0         128644.613636
8   2006.0         225125.598039
9   2007.0         170989.019737
10  2008.0         206171.568000
11  2009.0         192507.675214
12  2010.0         254057.222222
13  2011.0         260469.125000
14  2012.0         154621.333333
15  2013.0         475209.500000
16  2014.0         168753.500000
17  2015.0         110706.666667
18  2016.0         415120.500000
19  2017.0         125829.666667
20  2018.0         640051.000000


In [4]:
# Rebuild all three yearly frames from the groupby objects before rescaling.
# The columns used to be rescaled in place, so running this cell a second
# time rescaled values that had already been rescaled.
df_new = new_df.mean().reset_index(name='pages_mean')
df_new2 = new_df2.mean().reset_index(name='average_mean')
df_new3 = new_df3.mean().reset_index(name='average_rating_count')

# Mean rating divided by 5 and multiplied by 1000 — presumably to bring it to
# a magnitude comparable with the other plotted series (TODO confirm).
df_new2['average_mean'] = df_new2['average_mean'] / 5 * 1000
# Mean ratings count expressed in thousands, rounded to two decimals.
df_new3['average_rating_count'] = (df_new3['average_rating_count'] / 1000).round(2)

# `labels` keys must be column names; the previous 'x'/'y' keys were ignored
# and the y label ("Mean_pages") belonged to a different chart.
fig = px.line(df_new3, x='Year', y='average_rating_count',
              labels={'Year': 'Year',
                      'average_rating_count': 'Mean ratings count (thousands)'},
              title='Average ratings count per year')

fig.show()

In [8]:
fig = go.Figure()

# One entry per trace: (y values, drawing mode, legend name, extra Scatter
# options). Every trace is plotted against the Year column of df_new.
trace_specs = [
    (df_new['pages_mean'], 'lines', 'Avg_pages_per_year', {}),
    (df_new2['average_mean'], 'lines+markers', 'Avg_rating_per year', {}),
    (df_new3['average_rating_count'], 'lines+markers', 'Avg_rating_count_per_year',
     {'line': {'color': 'firebrick', 'width': 2, 'dash': 'dashdot'}}),
]
for y_values, draw_mode, legend_name, extra_options in trace_specs:
    fig.add_trace(go.Scatter(x=df_new['Year'], y=y_values,
                             mode=draw_mode, name=legend_name, **extra_options))

# Left as the last expression so the notebook renders the returned figure.
fig.update_layout(title='Analysis on various fields in relation to the Year of release',
                  xaxis_title='Year')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a0867bd3-5624-476b-af8a-bd1f917bd510' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>