# GeoTaste Analysis 1

In [None]:
!pip install -qU folium
import folium
from folium.plugins import *
from functools import lru_cache as cache
import pandas as pd
import numpy as np
import warnings
from ipywidgets import *
warnings.filterwarnings('ignore')

In [None]:
@cache
def load_geotaste_df():
    """Load the GeoTaste linked-data Google Sheet into a DataFrame.

    Authenticates the current Colab user, opens the first worksheet of the
    spreadsheet named ``GEOTASTE_LINKED_DATA_20230202`` (creating an empty
    spreadsheet with that name if none is found) and reads it with
    ``gspread_dataframe.get_as_dataframe``.

    Returns:
        pandas.DataFrame: the worksheet contents with all-empty rows and
        all-empty columns removed and any remaining missing values replaced
        by ``''``.  The result is memoised by ``@cache``, so every call
        returns the *same* object -- copy it before modifying it.
    """
    sheet = 'GEOTASTE_LINKED_DATA_20230202'

    # Imported inside the function so that defining it does not require the
    # Colab / Google client libraries to be installed.
    from google.colab import auth
    import gspread
    from google.auth import default
    from gspread import SpreadsheetNotFound
    from gspread_dataframe import get_as_dataframe

    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)

    try:
        worksheet = gc.open(sheet).sheet1
    except SpreadsheetNotFound:
        # Use the handle returned by create() instead of opening it again.
        worksheet = gc.create(sheet).sheet1

    df = get_as_dataframe(worksheet)
    # Remove empty (NA) rows, then empty columns.  `axis` and `how` must be
    # passed by keyword: pandas >= 2.0 rejects the positional form.
    df = df.dropna(axis=0, how='all').dropna(axis=1, how='all')
    return df.fillna('')

In [None]:
@cache
def get_geotaste_df():
    """Return the GeoTaste rows that have a usable start year and coordinates.

    Starting from ``load_geotaste_df()``, the following columns are added:

    * ``book_id`` / ``member_id`` -- the part of ``book_uri`` / ``member_uri``
      after ``/books/`` / ``/members/``, minus its final character.
    * ``start_year`` -- the first four characters of ``start_date`` as a number.
    * ``start_dec`` -- ``start_year`` rounded down to a multiple of 10.
    * ``first_coordinates`` -- the text before the first ``;`` in
      ``coordinates``.
    * ``lat`` / ``lon`` -- the two comma-separated numbers in
      ``first_coordinates``.

    Rows are dropped when ``start_date`` does not begin with digits, when
    ``coordinates`` is empty, or when ``lat``/``lon`` is missing.

    Returns:
        pandas.DataFrame: the filtered, augmented table (memoised by
        ``@cache``).
    """
    # Copy first: load_geotaste_df() is cached, so adding columns in place
    # would silently alter the frame that every other caller receives.
    df = load_geotaste_df().copy()

    # NOTE(review): the [:-1] presumably strips a trailing '/' from the URI.
    df['book_id'] = df['book_uri'].apply(
        lambda uri: uri.split('/books/', 1)[1][:-1])
    df['member_id'] = df['member_uri'].apply(
        lambda uri: uri.split('/members/', 1)[1][:-1])

    # Keep only rows whose start_date begins with a year.
    year_prefix = df['start_date'].astype(str).str[:4]
    has_year = year_prefix.str.isdigit()
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the previous one.
    df = df[has_year].copy()
    df['start_year'] = pd.to_numeric(year_prefix[has_year])
    df['start_dec'] = df['start_year'] // 10 * 10

    # Must have coordinates.
    df = df[df['coordinates'] != ''].copy()

    def _coord_part(coord, position):
        """Return the `position`-th comma-separated number, or NaN if empty."""
        return float(coord.split(',')[position]) if coord else np.nan

    df['first_coordinates'] = df['coordinates'].apply(
        lambda coords: coords.split(';')[0])
    df['lat'] = df['first_coordinates'].apply(lambda c: _coord_part(c, 0))
    df['lon'] = df['first_coordinates'].apply(lambda c: _coord_part(c, 1))

    # Keep only rows where both coordinates were obtained.
    return df.dropna(subset=['lat', 'lon'])

In [None]:
# get data
# Build the filtered events table (get_geotaste_df is cached, so re-running
# this cell does not reload the sheet); later cells read it as the global DF.
DF = get_geotaste_df()

In [None]:
# choices
# Book dropdown: a '*' wildcard followed by every book id, most frequent first.
# value_counts() already returns the ids in descending order of frequency, so
# its index is used directly instead of re-sorting a set of ids.
counts = DF.book_id.value_counts()
books = counts.index.tolist()
book_choice = Dropdown(options=['*'] + books)
# book_choice

In [None]:
# events
# Event dropdown: a '*' wildcard followed by the distinct event types, sorted.
event_choice = Dropdown(options=['*', *sorted(set(DF['event_type']))])
# event_choice

In [None]:
# years
# Year dropdown: a '*' wildcard followed by the distinct decades (start_dec)
# in ascending order.
year_choice = Dropdown(options=['*', *sorted(set(DF['start_dec']))])
# year_choice

In [None]:
@interact
def get_figdf(book=book_choice, event=event_choice, year=year_choice):
    """Filter DF by the dropdown selections and show the result as a heat map.

    Args:
        book: a ``book_id`` to keep, or ``'*'`` for all books.
        event: an ``event_type`` to keep, or ``'*'`` for all event types.
        year: a decade (a ``start_dec`` value such as 1920) to keep, or
            ``'*'`` for all decades.

    Returns:
        pandas.DataFrame: the rows of ``DF`` matching every selection.  As a
        side effect a folium map with a heat-map layer of those rows is
        displayed.
    """
    # Start with the full dataset and narrow it one selection at a time.
    figdf = DF

    if book != '*':
        figdf = figdf[figdf.book_id == book]

    if event != '*':
        figdf = figdf[figdf.event_type == event]

    if year != '*':
        # The dropdown is built from start_dec, so the filter must compare
        # against start_dec as well; comparing against start_year kept only
        # the first year of the chosen decade.
        figdf = figdf[figdf.start_dec == year]

    # Centre on the median point of the whole dataset (not of the selection)
    # so the view stays put while the filters change.
    centroid = DF[['lat', 'lon']].median().tolist()
    # Named `fmap` so the builtin `map` is not shadowed.
    fmap = folium.Map(location=centroid, zoom_start=13, width='90%')
    HeatMap(figdf[['lat', 'lon']].values.tolist()).add_to(fmap)
    display(fmap)
    return figdf

interactive(children=(Dropdown(description='book', options=('*', 'martin-new-statesman-nation', 'mackworth-tim…

In [None]:
# Number of rows per item format, most common first.
DF['format'].value_counts()

Book                 20633
Periodical             927
                       664
Phonograph Record        4
Article                  2
Prints                   1
Name: format, dtype: int64

In [None]:
# Split the book events by arrondissement using vectorised boolean masks
# instead of a Python-level iterrows() loop per frame.
# NOTE(review): this is a plain substring test on the text of
# `arrondissements`, so '6' also matches '16'; rows mentioning '16' are
# therefore excluded from df6 -- confirm that is intended for rows that
# list both.
is_book = DF['format'] == 'Book'
arrondissement_text = DF['arrondissements'].astype(str)
mentions_16 = arrondissement_text.str.contains('16', regex=False)
mentions_6 = arrondissement_text.str.contains('6', regex=False)

df6 = DF[is_book & mentions_6 & ~mentions_16]
df16 = DF[is_book & mentions_16]

In [None]:
# Ten most frequent book ids in df6.
df6['book_id'].value_counts().head(10)

richardson-pointed-roofs              15
lawrence-women-love                   12
stein-autobiography-alice-b           12
joyce-dubliners                       12
mansfield-garden-party-stories        12
joyce-ulysses                         11
mansfield-bliss-short-stories         11
coyle-flock-birds                     10
douglas-south-wind                    10
trotsky-history-russian-revolution     9
Name: book_id, dtype: int64

In [None]:
# Ten most frequent book ids in df16.
df16['book_id'].value_counts().head(10)

joyce-portrait-artist-young                16
richardson-pointed-roofs                   11
mansfield-garden-party-stories              9
lawrence-sons-lovers                        9
joyce-dubliners                             9
hemingway-winner-take-nothing               8
woolf-mrs-dalloway                          8
chaucer-complete-works-geoffrey-chaucer     8
woolf-jacobs-room                           8
stephens-crock-gold                         8
Name: book_id, dtype: int64

In [None]:
# Per-book difference in event counts (df16 minus df6).  A book id present in
# only one of the two frames has no counterpart to subtract and comes out NaN.
df16_minus_6 = df16['book_id'].value_counts().sub(df6['book_id'].value_counts())

In [None]:
# Keep only books that occur in both frames (non-NaN difference) and list them
# from the most df6-leaning to the most df16-leaning.
df16_minus_6[df16_minus_6.notna()].sort_values()

coyle-flock-birds                         -9.0
trotsky-history-russian-revolution        -8.0
stein-autobiography-alice-b               -8.0
woolf-lighthouse                          -8.0
mansfield-bliss-short-stories             -7.0
                                          ... 
garnett-man-zoo                            4.0
proust-swanns-way                          4.0
stephens-crock-gold                        4.0
chaucer-complete-works-geoffrey-chaucer    7.0
joyce-portrait-artist-young                8.0
Name: book_id, Length: 1060, dtype: float64