In [486]:
import os
import json
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import time
from functools import partial

In [866]:
class GeoBookJSONtoCSV:

    def __init__(self, data_dir = 'data/json/'):
        self.data_dir = data_dir
        self.data_files = os.listdir(self.data_dir) # assumes pure directory of just processed book jsons


    def run_all(self):

        self.get_book_df()
        self.geolocate_books()
        self.correct_book_df_errors()
        self.group_by_address()
        self.write_csv()
        
    def get_book_df(self):
        books_list = []
        for data in self.data_files:
            with open(data_dir + data, 'r') as f:
                book = json.load(f)
                books_list.append(book)
        df = pd.DataFrame(books_list)
        df.replace('NA', np.nan, inplace=True)
        df.replace('', np.nan, inplace=True)
        df.replace('None', np.nan, inplace=True)


        # make sure we can index on unique 'page' column
        try:
            len(df) == df.page.nunique()
        except:
            print('page column does not uniquely index rows! dropping rows')
            df = df.drop_duplicates('page')
            pass
        # generate city_country column

        df_city_country = df.loc[~(df.city.isna()) & ~(df.country.isna())] # city and country listed

        df_city_country = df_city_country.assign(city_country=np.nan) # create empty col
        df_city_country.loc[:,'city_country'] = df_city_country.loc[:,'city'] +', '+ df_city_country.loc[:,'country']
        #df_city_country['city_country'] = df_city_country['city'] +', '+ df_city_country['country']

        df_country = df.loc[(df.city.isna()) & ~(df.country.isna())] # country but no city listed
        df_country = df_country.assign(city_country=np.nan)
        df_city = df.loc[~(df.city.isna()) & (df.country.isna())] # city but no country listed
        df_city = df_city.assign(city_country=np.nan)

        self.df_list = [df_city_country, df_city, df_country]
        self.book_df = pd.concat(self.df_list)

        self.df_city_country = df_city_country
        self.df_city = df_city
        self.df_country = df_country

    def geolocate_books(self):
        """
        IMPORTANT: the geolocation call actually gives a location for None or np.nan lol. So, need to avoid passing those entries in!
        Replacing with nan where missing, we want to acquire the city_location, country_location, and city_country_location. 
        """
        geolocator = Nominatim(user_agent="geobooks", timeout=10)
        geocode = partial(geolocator.geocode, exactly_one=True, language="en", addressdetails=True) # defines language as english

        print(time.ctime())
        self.df_city_country = self.df_city_country.assign(city_location=None, country_location=None, city_country_location=None)
        print(time.ctime())
        self.df_city_country['city_location'] = self.df_city_country.apply(lambda x: geocode(x.city), axis=1)
        print(time.ctime())
        self.df_city_country['country_location'] = self.df_city_country.apply(lambda x: geocode(x.country), axis=1)
        print(time.ctime())
        self.df_city_country['city_country_location'] = self.df_city_country.apply(lambda x: geocode(x.city_country), axis=1)
        print(time.ctime())
        self.df_city = self.df_city.assign(city_location=None, country_location=None, city_country_location=None)
        print(time.ctime())
        self.df_city['city_location'] = self.df_city.apply(lambda x: geocode(x.city), axis=1)
        print(time.ctime())
        self.df_country = self.df_country.assign(city_location=None, country_location=None, city_country_location=None)
        print(time.ctime())
        self.df_country['country_location'] = self.df_country.apply(lambda x: geocode(x.country), axis=1)

        print(time.ctime())
        self.df_list = [self.df_city_country, self.df_city, self.df_country]
        self.book_df = pd.concat(self.df_list)

        self.book_df = self.select_best_location(self.book_df)
        # TODO: the other two for this df, and the other two dfs
        
        return 

    def correct_book_df_errors(self):
        """
        In this function we manually correct errors introduced through our pipeline which have been noticed and flagged.
        Just a way of doing this in code and tracking it--not ideal but it will work as a placeholder.
        Note: we have some duplicates because the same book was referenced under different page names on wikipedia
        """
        # drops a near duplicate of Dark Cloud
        self.book_df = self.book_df.loc[~((self.book_df.title=='Dark Cloud') & (self.book_df.author=='N/A'))] 
        # drops a near duplicate of Footsteps with an incorrect address
        self.book_df = self.book_df.loc[~((self.book_df.title=='Footsteps') & (self.book_df.address=='Batavia, Solano County, California, United States'))]
        # drops a near duplicate of The Known World with less accurate address
        self.book_df = self.book_df.loc[~((self.book_df.title=='The Known World') & (self.book_df.address=='United States'))]
        # drops a near duplicate of If Nobody Speaks of Remarkable Things with a less accurate address
        self.book_df = self.book_df.loc[~((self.book_df.title=='If Nobody Speaks of Remarkable Things') & (self.book_df.address=='United Kingdom'))]

        # drops a near duplicate of Marthandavarma with less accurate address
        self.book_df = self.book_df.loc[~((self.book_df.title=='Marthandavarma') & (self.book_df.city=='Travancore (now Trivandrum)'))]
        # drops a near duplicate of Marthandavarma with less accurate address
        self.book_df = self.book_df.loc[~((self.book_df.title=='Marthandavarma') & (self.book_df.city=='Travancore'))]


    def group_by_address(self):
        self.book_df_group = self.book_df.groupby(['geocoded_address','lat','lon'])['title'].apply(list).reset_index()
        self.book_df_group['title_str'] =self.book_df_group.apply(lambda x: [str(t) for t in x.title], axis=1) # sometimes titles are not strings
        self.book_df_group['titles'] = self.book_df_group.apply(lambda x:"<br>".join(x.title_str), axis=1)
        self.book_df_group = self.book_df_group[['geocoded_address','lat','lon','titles']]
        self.book_df_group = self.book_df_group.loc[~(self.book_df_group.geocoded_address=='')] # drops instances where no legitimate address found
        
        return

    def write_csv(self, csv_out_dir = 'data/csv/'):
        self.book_df.to_csv(csv_out_dir+'book_db.csv')
        self.book_df_group.to_csv(csv_out_dir+'books_per_coord_db.csv')

    def select_best_location(self, df):
        df = df.assign(best_location=None)
        df.loc[~df.city_country_location.isna(),'best_location'] =  df.loc[~df.city_country_location.isna(),'city_country_location']
        df.loc[(df.city_country_location.isna()) & (~df.city_location.isna()),'best_location'] = df.loc[(df.city_country_location.isna()) & (~df.city_location.isna()),'city_location']
        df.loc[(df.city_country_location.isna()) & (df.city_location.isna()) & (~df.country_location.isna()),'best_location'] = df.loc[(df.city_country_location.isna()) & (df.city_location.isna())  & (~df.country_location.isna()),'country_location']
    
    
        df = df.assign(lat=None, lon=None, address=None, geocoded_address=None, geocoded_country=None)
        #location.latitude, location.longitude
        df.loc[~df.best_location.isna(), 'lat'] = df.loc[~df.best_location.isna()].apply(lambda x: x.best_location.latitude, axis=1)
        df.loc[~df.best_location.isna(), 'lon'] = df.loc[~df.best_location.isna()].apply(lambda x: x.best_location.longitude, axis=1)
        df.loc[~df.best_location.isna(), 'address'] = df.loc[~df.best_location.isna()].apply(lambda x: x.best_location.address, axis=1)                                                                        
        df.loc[~df.best_location.isna(), 'geocoded_address'] = df.loc[~df.best_location.isna()].apply(lambda x: self.get_geocoded_address(x.best_location), axis=1)                                                                        
        df.loc[~df.best_location.isna(), 'geocoded_country'] = df.loc[~df.best_location.isna()].apply(lambda x: x.best_location.raw['address'].get('country'), axis=1)                                                                        

        return df  

    @staticmethod
    def get_geocoded_address(location):
        """
        function takes in a geolocator.geocode returned object with address_details=True
        returns an address that just has any subset of City, State, Province, or Country values 
        The .get returns None if the city key doesn't exist, and then the filter None drops it if None
        """
        geocoded_address = ', '.join(filter(None, [location.raw['address'].get('city'),location.raw['address'].get('province'), location.raw['address'].get('state'), location.raw['address'].get('country')]))
        return geocoded_address
        
    @staticmethod
    def jitter_duplicate_coords(df):
        df2 = df.copy(deep=True)
        df2['dupe_idx'] = df2.groupby(['lon']).cumcount() # 0 if no duplicates, if not indexes the duplicate
        #df['lon_og'] = df['lon']
        df2['lon'] = df2['lon'] + 0.01*df2['dupe_idx']
        return df2


        

In [867]:
obj = GeoBookJSONtoCSV( './../data/json/')

In [871]:
obj.get_book_df()


In [None]:
obj.geolocate_books()
obj.correct_book_df_errors()

Sun Aug 18 16:21:15 2024
Sun Aug 18 16:21:15 2024


In [862]:
df.loc[df.title=='The Lowland'].geocoded_country

418    United States
Name: geocoded_country, dtype: object

In [861]:
df.loc[df.title=='The Lowland'].best_location[418].raw['address'].get('country')

'United States'

In [837]:
df = obj.book_df

In [844]:
df.loc[~df.best_location.isna(), 'country'] 

1       United States
2              France
4              Brazil
5       United States
7      United Kingdom
            ...      
651           Hungary
654           Germany
663    United Kingdom
672             India
675             Haiti
Name: country, Length: 482, dtype: object

In [860]:
df.loc[~df.best_location.isna(), 'geocoded_country'] = df.loc[~df.best_location.isna()].apply(lambda x: x.best_location.raw['address'].get('country'), axis=1)                                                                        


In [856]:
df.loc[~df.best_location.isna()]

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address,geocoded_country
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room,"San Francisco, United States","(San Francisco, California, United States, (37...","(United States, (39.7837304, -100.445882))","(San Francisco, California, United States, (37...","(San Francisco, California, United States, (37...",37.779259,-122.419329,"San Francisco, California, United States","San Francisco, California, United States",
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl,"Paris, France","(Paris, Ile-de-France, Metropolitan France, Fr...","(France, (46.603354, 1.8883335))","(Paris, Ile-de-France, Metropolitan France, Fr...","(Paris, Ile-de-France, Metropolitan France, Fr...",48.853495,2.348391,"Paris, Ile-de-France, Metropolitan France, France","Paris, Ile-de-France, France",
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red,"Rio de Janeiro, Brazil","(Rio de Janeiro, Região Geográfica Imediata do...","(Brazil, (-10.3333333, -53.2))","(Rio de Janeiro, Região Geográfica Imediata do...","(Rio de Janeiro, Região Geográfica Imediata do...",-22.911014,-43.209373,"Rio de Janeiro, Região Geográfica Imediata do ...","Rio de Janeiro, Brazil",
5,New York: A Novel,Edward Rutherfurd,New York City,United States,New York (novel),"New York City, United States","(New York, United States, (40.7127281, -74.006...","(United States, (39.7837304, -100.445882))","(New York, United States, (40.7127281, -74.006...","(New York, United States, (40.7127281, -74.006...",40.712728,-74.006015,"New York, United States","New York, New York, United States",
7,According to Mark,Penelope Lively,London,United Kingdom,According to Mark,"London, England","(London, Greater London, England, United Kingd...","(England, United Kingdom, (52.5310214, -1.2649...","(London, Greater London, England, United Kingd...","(London, Greater London, England, United Kingd...",51.489334,-0.144055,"London, Greater London, England, United Kingdom","London, England, United Kingdom",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,Under the Frog,Tibor Fischer,,Hungary,Under the Frog,,,"(Hungary, (47.1817585, 19.5060937))",,"(Hungary, (47.1817585, 19.5060937))",47.181759,19.506094,Hungary,Hungary,
654,Lost Children Archive,Valeria Luiselli,,Germany,Lost Children Archive,,,"(U&S, 61, Bahnhofstraße, Wiesenviertel, Witten...",,"(U&S, 61, Bahnhofstraße, Wiesenviertel, Witten...",51.437611,7.330386,"U&S, 61, Bahnhofstraße, Wiesenviertel, Witten-...","North Rhine-Westphalia, Germany",
663,From Scenes Like These,Gordon Williams,,United Kingdom,From Scenes Like These,,,"(Scotland, United Kingdom, (56.7861112, -4.114...",,"(Scotland, United Kingdom, (56.7861112, -4.114...",56.786111,-4.114052,"Scotland, United Kingdom","Scotland, United Kingdom",
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya,,,"(India, (22.3511148, 78.6677428))",,"(India, (22.3511148, 78.6677428))",22.351115,78.667743,India,India,


In [870]:
df.to_csv('./../data/csv/book_db.csv')

In [829]:
obj.book_df.loc[obj.book_df.title=='Dark Cloud']

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address
603,Dark Cloud,,Tenochtítlan,Mexico,Aztec (book),"Tenochtítlan, Mexico","(Tenochtitlan, Durango, Municipio de Durango, ...","(Mexico, (23.6585116, -102.0077097))","(Tenochtitlan, Durango, Municipio de Durango, ...","(Tenochtitlan, Durango, Municipio de Durango, ...",24.076921,-104.584386,"Tenochtitlan, Durango, Municipio de Durango, D...","Durango, Durango, Mexico"


In [830]:
obj.book_df.to_csv('./../data/csv/book_db.csv')

In [831]:
obj.book_df.to_csv('./../data/csv/book_db_backup.csv')

In [832]:
obj.group_by_address()

In [833]:
n=220
obj.book_df_group.iloc[n:n+30]

Unnamed: 0,geocoded_address,lat,lon,titles
222,"Sicily, Italy",37.40894,13.732402,The Lady of the Wheel
223,"Sicily, Italy",37.587794,14.155048,Il Gattopardo
224,Solomon Islands,-8.705394,159.107069,The Islands of Unwisdom
225,South Africa,-28.816624,24.991639,The Covenant<br>An Instant in the Wind<br>In a...
226,Spain,39.326068,-4.837979,El Conde Lucanor<br>The Fencing Master<br>The ...
227,Sri Lanka,7.555494,80.713785,The Seven Moons of Maali Almeida
228,"Srinagar, Jammu and Kashmir, India",34.041102,74.87946,The Ministry of Utmost Happiness
229,"Srinagar, Jammu and Kashmir, India",34.074744,74.820444,Shalimar the Clown
230,"Surat, Gujarat, India",21.209489,72.831706,Karan Ghelo
231,Sweden,56.25,14.866667,The Emigrants


In [834]:
obj.book_df_group.to_csv('./../data/csv/books_per_coord_db.csv')

### Experimental Code

In [None]:
# run something to aggregate up to each unique coordinate

In [672]:
obj.book_df_group.loc[obj.book_df_group.geocoded_address=='Durango, Durango, Mexico']

Unnamed: 0,geocoded_address,lat,lon,titles
60,"Durango, Durango, Mexico",24.076921,-104.584386,Dark Cloud<br>Dark Cloud


In [692]:
obj.book_df.loc[(obj.book_df.title=='Footsteps') & (obj.book_df.address=='Batavia, Solano County, California, United States')]

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address
280,Footsteps,Pramoedya Ananta Toer,Batavia (NA),Dutch East Indies (now Indonesia),Footsteps (novel),"Batavia (NA), Dutch East Indies (now Indonesia)","(Batavia, Solano County, California, United St...",,,"(Batavia, Solano County, California, United St...",38.406298,-121.859685,"Batavia, Solano County, California, United States","California, United States"


In [772]:
obj.book_df.columns

Index(['title', 'author', 'city', 'country', 'page', 'city_country',
       'city_location', 'country_location', 'city_country_location',
       'best_location', 'lat', 'lon', 'address', 'geocoded_address'],
      dtype='object')

In [823]:
obj.book_df.loc[(obj.book_df.title=='Lazarillo de Tormes')].best_location[454].raw

{'place_id': 259617609,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'relation',
 'osm_id': 342999,
 'lat': '39.8560597',
 'lon': '-4.0239356',
 'class': 'boundary',
 'type': 'administrative',
 'place_rank': 16,
 'importance': 0.657462212032881,
 'addresstype': 'city',
 'name': 'Toledo',
 'display_name': 'Toledo, Castile-La Mancha, Spain',
 'address': {'city': 'Toledo',
  'province': 'Toledo',
  'ISO3166-2-lvl6': 'ES-TO',
  'state': 'Castile-La Mancha',
  'ISO3166-2-lvl4': 'ES-CM',
  'country': 'Spain',
  'country_code': 'es'},
 'boundingbox': ['39.8120405', '39.9254339', '-4.1790592', '-3.8147929']}

In [807]:
obj.book_df.loc[(obj.book_df.title=='The English Patient')].best_location[94].raw

{'place_id': 71120942,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 866085534,
 'lat': '41.36259895',
 'lon': '2.1596649904253766',
 'class': 'amenity',
 'type': 'garden',
 'place_rank': 30,
 'importance': 9.99999999995449e-06,
 'addresstype': 'amenity',
 'name': 'North Africa',
 'display_name': 'North Africa, Carrer de Can Valero, Montjuïc, el Poble-sec, Sants-Montjuïc, Barcelona, Barcelonès, Barcelona, Catalonia, 08001, Spain',
 'address': {'amenity': 'North Africa',
  'road': 'Carrer de Can Valero',
  'quarter': 'Montjuïc',
  'suburb': 'Sants-Montjuïc',
  'city': 'Barcelona',
  'county': 'Barcelonès',
  'province': 'Barcelona',
  'ISO3166-2-lvl6': 'ES-B',
  'state': 'Catalonia',
  'ISO3166-2-lvl4': 'ES-CT',
  'postcode': '08001',
  'country': 'Spain',
  'country_code': 'es'},
 'boundingbox': ['41.3621064', '41.3630721', '2.1586385', '2.1604020']}

In [781]:
obj.book_df.loc[obj.book_df.country=='Japan']

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address
124,Gai-Jin,James Clavell,Hiroshima,Japan,Gai-Jin (novel),"Hiroshima, Japan","(Hiroshima, Hiroshima Prefecture, Japan, (34.3...","(Japan, (36.5748441, 139.2394179))","(Hiroshima, Hiroshima Prefecture, Japan, (34.3...","(Hiroshima, Hiroshima Prefecture, Japan, (34.3...",34.391724,132.451759,"Hiroshima, Hiroshima Prefecture, Japan","Hiroshima, Japan"
164,The Thousand Autumns of Jacob de Zoet,David Mitchell,Nagasaki,Japan,The Thousand Autumns of Jacob de Zoet,"Nagasaki, Japan","(Nagasaki Prefecture, Japan, (33.1154683, 129....","(Japan, (36.5748441, 139.2394179))","(Nagasaki Prefecture, Japan, (33.1154683, 129....","(Nagasaki Prefecture, Japan, (33.1154683, 129....",33.115468,129.787434,"Nagasaki Prefecture, Japan",Japan
215,Silk,Alessandro Baricco,Edo Bay,Japan,Silk (novel),"Edo Bay, Japan","(東京湾, Kawasaki Ward, Kawasaki, Kanagawa Prefec...","(Japan, (36.5748441, 139.2394179))","(東京湾, Kawasaki Ward, Kawasaki, Kanagawa Prefec...","(東京湾, Kawasaki Ward, Kawasaki, Kanagawa Prefec...",35.4169,139.771,"東京湾, Kawasaki Ward, Kawasaki, Kanagawa Prefect...","Kawasaki, Japan"
635,A Tale for the Time Being,Ruth Ozeki,Tokyo,Japan,A Tale for the Time Being,"Tokyo, Japan","(Tokyo, Japan, (35.6821936, 139.762221))","(Japan, (36.5748441, 139.2394179))","(Tokyo, Japan, (35.6821936, 139.762221))","(Tokyo, Japan, (35.6821936, 139.762221))",35.682194,139.762221,"Tokyo, Japan",Japan
105,Shōgun,James Clavell,,Japan,Shōgun (novel),,,"(Japan, (36.5748441, 139.2394179))",,"(Japan, (36.5748441, 139.2394179))",36.574844,139.239418,Japan,Japan
192,Silence,Shūsaku Endō,,Japan,Silence (Endō novel),,,"(Japan, (36.5748441, 139.2394179))",,"(Japan, (36.5748441, 139.2394179))",36.574844,139.239418,Japan,Japan
267,The Tale of the Heike,Unknown (Wikipedia article about a book),,Japan,The Tale of the Heike,,,"(Japan, (36.5748441, 139.2394179))",,"(Japan, (36.5748441, 139.2394179))",36.574844,139.239418,Japan,Japan
309,An Artist of the Floating World,Kazuo Ishiguro,,Japan,An Artist of the Floating World,,,"(Japan, (36.5748441, 139.2394179))",,"(Japan, (36.5748441, 139.2394179))",36.574844,139.239418,Japan,Japan
405,Musashi,Eiji Yoshikawa,,Japan,Musashi (novel),,,"(Japan, (36.5748441, 139.2394179))",,"(Japan, (36.5748441, 139.2394179))",36.574844,139.239418,Japan,Japan


In [716]:
obj.book_df[obj.book_df.duplicated(subset=['title'], keep=False)].sort_values('title')[['title','author','city']]

Unnamed: 0,title,author,city


In [700]:
obj.book_df[obj.book_df.duplicated(subset=['title'], keep=False)].sort_values('title')[['title','author','address']].address[628]

'Travancore Medical College Hospital, Salem - Kochi - Kanyakumari Road, Mevaram, Kollam, Kerala, 691589, India'

In [686]:
obj.book_df_group.

Unnamed: 0,geocoded_address,lat,lon,titles
2,"Acapulco, Guerrero, Mexico",16.868050,-99.894018,The Samurai
3,"Adelaide, South Australia, Australia",-34.928181,138.599931,Slow Man
4,Afghanistan,33.768006,66.238514,The Afghan Campaign
5,"Alaska, United States",64.497510,-165.406170,Alaska
6,"Alexandria, Alexandria, Egypt",31.199181,29.895172,Hypatia
...,...,...,...,...
272,"West Bengal, India",27.070287,88.472368,The Inheritance of Loss
273,"West Flanders, Belgium",51.251498,3.281482,Thyl Ulenspiegel
274,"Western, Solomon Islands",-9.050000,158.750000,The Ghost Road
275,"Yukon, Canada",64.060660,-139.431695,Journey


In [685]:
obj.book_df[obj.book_df.duplicated(subset=['title'], keep=False)].sort_values('title')

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address
439,Dark Cloud,,Tenochtítlan,Mexico,Aztec (novel),"Tenochtítlan, Mexico","(Tenochtitlan, Durango, Municipio de Durango, ...","(Mexico, (23.6585116, -102.0077097))","(Tenochtitlan, Durango, Municipio de Durango, ...","(Tenochtitlan, Durango, Municipio de Durango, ...",24.076921,-104.584386,"Tenochtitlan, Durango, Municipio de Durango, D...","Durango, Durango, Mexico"
603,Dark Cloud,,Tenochtítlan,Mexico,Aztec (book),"Tenochtítlan, Mexico","(Tenochtitlan, Durango, Municipio de Durango, ...","(Mexico, (23.6585116, -102.0077097))","(Tenochtitlan, Durango, Municipio de Durango, ...","(Tenochtitlan, Durango, Municipio de Durango, ...",24.076921,-104.584386,"Tenochtitlan, Durango, Municipio de Durango, D...","Durango, Durango, Mexico"
280,Footsteps,Pramoedya Ananta Toer,Batavia (NA),Dutch East Indies (now Indonesia),Footsteps (novel),"Batavia (NA), Dutch East Indies (now Indonesia)","(Batavia, Solano County, California, United St...",,,"(Batavia, Solano County, California, United St...",38.406298,-121.859685,"Batavia, Solano County, California, United States","California, United States"
666,Footsteps,Pramoedya Ananta Toer,Batavia (today Indonesia),Indonesia,Jejak Langkah,"Batavia (today Indonesia), Indonesia",,"(Indonesia, (-2.4833826, 117.8902853))",,"(Indonesia, (-2.4833826, 117.8902853))",-2.483383,117.890285,Indonesia,Indonesia
46,If Nobody Speaks of Remarkable Things,Jon McGregor,Bradford,,If Nobody Speaks of Remarkable Things,,"(Bradford, West Yorkshire, England, United Kin...",,,"(Bradford, West Yorkshire, England, United Kin...",53.794423,-1.751919,"Bradford, West Yorkshire, England, United Kingdom","Bradford, England, United Kingdom"
320,If Nobody Speaks of Remarkable Things,Jon McGregor,,UK,Reservoir 13,,,"(United Kingdom, (54.7023545, -3.2765753))",,"(United Kingdom, (54.7023545, -3.2765753))",54.702354,-3.276575,United Kingdom,United Kingdom
544,Marthandavarma,C. V. Raman Pillai,Travancore (Thiruvananthapuram),India,Ramarajabahadur (novel),"Travancore (Thiruvananthapuram), India","(Mall of Travancore, Eanchakkal, Thiruvanantha...","(India, (22.3511148, 78.6677428))","(Travancore Titanium Products (TTP), ISRO Road...","(Travancore Titanium Products (TTP), ISRO Road...",8.499807,76.89964,"Travancore Titanium Products (TTP), ISRO Road ...","Thiruvananthapuram, Kerala, India"
560,Marthandavarma,C. V. Raman Pillai,Travancore (now Trivandrum),India,Dharmaraja (novel),"Travancore (now Trivandrum), India",,"(India, (22.3511148, 78.6677428))",,"(India, (22.3511148, 78.6677428))",22.351115,78.667743,India,India
628,Marthandavarma,C. V. Raman Pillai,Travancore,India,Marthandavarma (novel),"Travancore, India","(Travancore, Melbourne, City of Moonee Valley,...","(India, (22.3511148, 78.6677428))","(Travancore Medical College Hospital, Salem - ...","(Travancore Medical College Hospital, Salem - ...",8.87503,76.644859,"Travancore Medical College Hospital, Salem - K...","Kollam, Kerala, India"
243,The Known World,Edward P. Jones,Virginia,United States,The Known World,"Virginia, United States","(Virginia, United States, (37.1232245, -78.492...","(United States, (39.7837304, -100.445882))","(Virginia, United States, (37.1232245, -78.492...","(Virginia, United States, (37.1232245, -78.492...",37.123224,-78.492772,"Virginia, United States","Virginia, United States"


In [683]:
obj.book_df[obj.book_df.duplicated(subset=['title'], keep=False)].sort_values('title')[['title','author','address']]

Unnamed: 0,title,author,address
439,Dark Cloud,,"Tenochtitlan, Durango, Municipio de Durango, D..."
603,Dark Cloud,,"Tenochtitlan, Durango, Municipio de Durango, D..."
280,Footsteps,Pramoedya Ananta Toer,"Batavia, Solano County, California, United States"
666,Footsteps,Pramoedya Ananta Toer,Indonesia
46,If Nobody Speaks of Remarkable Things,Jon McGregor,"Bradford, West Yorkshire, England, United Kingdom"
320,If Nobody Speaks of Remarkable Things,Jon McGregor,United Kingdom
544,Marthandavarma,C. V. Raman Pillai,"Travancore Titanium Products (TTP), ISRO Road ..."
560,Marthandavarma,C. V. Raman Pillai,India
628,Marthandavarma,C. V. Raman Pillai,"Travancore Medical College Hospital, Salem - K..."
243,The Known World,Edward P. Jones,"Virginia, United States"


In [681]:
obj2 = GeoBookJSONtoCSV( './../data/json/')
obj2.get_book_df()
obj2.book_df.loc[obj2.book_df.title=='Dark Cloud']

Unnamed: 0,title,author,city,country,page,city_country
439,Dark Cloud,,Tenochtítlan,Mexico,Aztec (novel),"Tenochtítlan, Mexico"
603,Dark Cloud,,Tenochtítlan,Mexico,Aztec (book),"Tenochtítlan, Mexico"


In [610]:
obj.book_df.geocoded_address

1      San Francisco, United States
2                     Paris, France
4                            Brazil
5           New York, United States
7            London, United Kingdom
                   ...             
651                         Hungary
654                         Germany
663                  United Kingdom
672                           India
675                           Haiti
Name: geocoded_address, Length: 496, dtype: object

In [637]:
def group_by_address(df):
    df_group = df.groupby(['geocoded_address','lat','lon'])['title'].apply(list).reset_index()
    df_group['title_str'] =df_group.apply(lambda x: [str(t) for t in x.title], axis=1) # sometimes titles are not strings
    df_group['titles'] = df_group.apply(lambda x:"<br>".join(x.title_str), axis=1)
    df_group = df_group.loc[~(df_group.geocoded_address=='')] # drops instances where no legitimate address found

    return df_group[['geocoded_address','lat','lon','titles']]

In [638]:
df1 = group_by_address(obj.book_df)

In [643]:
df1.loc[df1.geocoded_address=='Mexico']


Unnamed: 0,geocoded_address,lat,lon,titles
148,Mexico,17.999929,-92.668166,The Power and the Glory
149,Mexico,23.658512,-102.00771,Hunger's Brides<br>Cartucho<br>Caballero: A Hi...
150,Mexico,24.800798,-104.464406,Mexico


In [657]:
obj.book_df.loc[obj.book_df.geocoded_address=='Germany'][['title','best_location','lat','lon','address','geocoded_address']]

Unnamed: 0,title,best_location,lat,lon,address,geocoded_address
73,The Book Thief,"(Görlitz, Saxony, Germany, (51.1563185, 14.991...",51.156318,14.991018,"Görlitz, Saxony, Germany",Germany
159,The Element of Water,"(Lake House Plön, 1, Fegetasche, Plön, Schlesw...",54.153189,10.450517,"Lake House Plön, 1, Fegetasche, Plön, Schleswi...",Germany
358,Come Rack! Come Rope!,"(England, Buch am Erlbach, Landkreis Landshut,...",48.428501,12.056571,"England, Buch am Erlbach, Landkreis Landshut, ...",Germany
654,Lost Children Archive,"(U&S, 61, Bahnhofstraße, Wiesenviertel, Witten...",51.437611,7.330386,"U&S, 61, Bahnhofstraße, Wiesenviertel, Witten-...",Germany


In [659]:
obj.book_df.loc[obj.book_df.geocoded_address=='Germany'][['title','best_location','lat','lon','address','geocoded_address']].best_location[159].raw['address']


{'tourism': 'Lake House Plön',
 'house_number': '1',
 'road': 'Fegetasche',
 'neighbourhood': 'Fegetasche',
 'town': 'Plön',
 'county': 'Plön',
 'state': 'Schleswig-Holstein',
 'ISO3166-2-lvl4': 'DE-SH',
 'postcode': '24306',
 'country': 'Germany',
 'country_code': 'de'}

In [645]:
obj.book_df.loc[obj.book_df.geocoded_address=='Mexico'][['title','best_location','lat','lon','address','geocoded_address']]

Unnamed: 0,title,best_location,lat,lon,address,geocoded_address
41,Mexico,"(Toledo, San Juan del Río, Durango, 34480, Mex...",24.800798,-104.464406,"Toledo, San Juan del Río, Durango, 34480, Mexico",Mexico
402,The Power and the Glory,"(Tabasco, Mexico, (17.9999288, -92.6681659))",17.999929,-92.668166,"Tabasco, Mexico",Mexico
114,Hunger's Brides,"(Mexico, (23.6585116, -102.0077097))",23.658512,-102.00771,Mexico,Mexico
160,Cartucho,"(Mexico, (23.6585116, -102.0077097))",23.658512,-102.00771,Mexico,Mexico
592,Caballero: A Historical Novel,"(Mexico, (23.6585116, -102.0077097))",23.658512,-102.00771,Mexico,Mexico


In [652]:
obj.book_df.loc[obj.book_df.geocoded_address=='Mexico'][['title','best_location','lat','lon','address','geocoded_address']].best_location[41].raw['address']



{'village': 'Toledo',
 'county': 'San Juan del Río',
 'state': 'Durango',
 'ISO3166-2-lvl4': 'MX-DUR',
 'postcode': '34480',
 'country': 'Mexico',
 'country_code': 'mx'}

In [None]:
obj.book_df.loc[obj.book_df.geocoded_address=='Mexico'][['title','best_location','lat','lon','address','geocoded_address']].best_location[41].raw['address']


In [654]:
obj.book_df.loc[obj.book_df.geocoded_address=='Mexico'][['title','best_location','lat','lon','address','geocoded_address']].best_location[402].raw['address']


{'state': 'Tabasco',
 'ISO3166-2-lvl4': 'MX-TAB',
 'country': 'Mexico',
 'country_code': 'mx'}

In [655]:
obj.book_df.loc[obj.book_df.geocoded_address=='Mexico'][['title','best_location','lat','lon','address','geocoded_address']].best_location[114].raw['address']


{'country': 'Mexico', 'country_code': 'mx'}

In [623]:
obj.book_df.loc[obj.book_df.title=='Royal Flash']

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address,geocoded_address
379,Royal Flash,George MacDonald Fraser,,Asia,Flashman (novel),,,"(Asia, (51.2086975, 89.2343748))",,"(Asia, (51.2086975, 89.2343748))",51.208697,89.234375,Asia,


In [641]:
df1.to_csv('./../data/csv/books_per_coord_db.csv')

In [640]:
df1

Unnamed: 0,geocoded_address,lat,lon,titles
2,"Acapulco, Mexico",16.868050,-99.894018,The Samurai
3,"Adelaide, Australia",-34.928181,138.599931,Slow Man
4,Afghanistan,33.768006,66.238514,The Afghan Campaign
5,"Alexandria, Egypt",31.199181,29.895172,Hypatia
6,"Algeciras, Spain",36.131172,-5.447399,Night Boat to Tangier
...,...,...,...,...
272,Vietnam,15.926666,107.965086,The Lotus Eaters<br>The Sorrow of War
273,"Volgograd, Russia",48.708191,44.515335,Life and Fate
274,"Warsaw, Poland",52.231958,21.006725,The Faithful River<br>Push Not the River<br>Ka...
275,"Washington, United States",38.895037,-77.036543,Unburnable<br>Lincoln


In [631]:
df1.loc[df1.titles=='The Baroque Cycle'].geocoded_address==''

0    True
Name: geocoded_address, dtype: bool

In [524]:
df1.to_csv('./../data/csv/books_per_coord_db.csv')

In [620]:
df1.loc[df1.address=='Mexico'].titles

AttributeError: 'DataFrame' object has no attribute 'address'

In [519]:
df1.iloc[0:10]

Unnamed: 0,address,lat,lon,titles
0,"Abergwyngregyn, Gwynedd, Wales, LL33 0LF, Unit...",53.234242,-4.014731,The Reckoning
1,"Acapulco, Acapulco de Juárez, Guerrero, 39300,...",16.86805,-99.894018,The Samurai
2,"Adelaide, Adelaide City Council, South Austral...",-34.928181,138.599931,Slow Man
3,Afghanistan,33.768006,66.238514,The Afghan Campaign
4,"Agincourt, Nancy, Meurthe-et-Moselle, Grand Es...",48.732347,6.236384,Know Ye Not Agincourt?
5,"Alexandria, 21519, Egypt",31.199181,29.895172,Hypatia
6,"Algeciras, Campo de Gibraltar, Cádiz, Andalusi...",36.131172,-5.447399,Night Boat to Tangier
7,"Amsterdam, North Holland, Netherlands",52.37308,4.892453,The Miniaturist\nThe Coffee Trader
8,"Andersonville, Sumter County, Georgia, 31711, ...",32.195995,-84.139909,Andersonville
9,"Annapolis, Anne Arundel County, Maryland, Unit...",38.97864,-76.492786,Richard Carvel


In [468]:
df3=pd.DataFrame(obj.book_df.groupby(['address','lat','lon'])['title'].apply(list)).reset_index()

In [472]:
df3

Unnamed: 0,address,lat,lon,title
0,"Abergwyngregyn, Gwynedd, Cymru / Wales, LL33 0...",53.234242,-4.014731,[The Reckoning]
1,"Acapulco, Acapulco de Juárez, Guerrero, 39300,...",16.868050,-99.894018,[The Samurai]
2,"Adelaide, Adelaide City Council, South Austral...",-34.928181,138.599931,[Slow Man]
3,"Agincourt, Nancy, Meurthe-et-Moselle, Grand Es...",48.732347,6.236384,[Know Ye Not Agincourt?]
4,"Alba / Scotland, United Kingdom",56.786111,-4.114052,"[The Testament of Gideon Mack, Grace Notes, Re..."
...,...,...,...,...
269,日本,36.574844,139.239418,"[Shōgun, Silence, The Tale of the Heike, An Ar..."
270,"東京湾, 川崎区, 川崎市, 神奈川県, 日本",35.416900,139.771000,[Silk]
271,"東京都, 日本",35.682194,139.762221,[A Tale for the Time Being]
272,"長崎県, 日本",33.115468,129.787434,[The Thousand Autumns of Jacob de Zoet]


In [482]:
df3['title_str'] =df3.apply(lambda x: [str(t) for t in x.title], axis=1)

In [484]:
df3['titles'] = df3.apply(lambda x:" ".join(x.title_str), axis=1)

In [485]:
df3

Unnamed: 0,address,lat,lon,title,title_str,titles
0,"Abergwyngregyn, Gwynedd, Cymru / Wales, LL33 0...",53.234242,-4.014731,[The Reckoning],[The Reckoning],The Reckoning
1,"Acapulco, Acapulco de Juárez, Guerrero, 39300,...",16.868050,-99.894018,[The Samurai],[The Samurai],The Samurai
2,"Adelaide, Adelaide City Council, South Austral...",-34.928181,138.599931,[Slow Man],[Slow Man],Slow Man
3,"Agincourt, Nancy, Meurthe-et-Moselle, Grand Es...",48.732347,6.236384,[Know Ye Not Agincourt?],[Know Ye Not Agincourt?],Know Ye Not Agincourt?
4,"Alba / Scotland, United Kingdom",56.786111,-4.114052,"[The Testament of Gideon Mack, Grace Notes, Re...","[The Testament of Gideon Mack, Grace Notes, Re...",The Testament of Gideon Mack Grace Notes Redga...
...,...,...,...,...,...,...
269,日本,36.574844,139.239418,"[Shōgun, Silence, The Tale of the Heike, An Ar...","[Shōgun, Silence, The Tale of the Heike, An Ar...",Shōgun Silence The Tale of the Heike An Artist...
270,"東京湾, 川崎区, 川崎市, 神奈川県, 日本",35.416900,139.771000,[Silk],[Silk],Silk
271,"東京都, 日本",35.682194,139.762221,[A Tale for the Time Being],[A Tale for the Time Being],A Tale for the Time Being
272,"長崎県, 日本",33.115468,129.787434,[The Thousand Autumns of Jacob de Zoet],[The Thousand Autumns of Jacob de Zoet],The Thousand Autumns of Jacob de Zoet


In [465]:
tl = ['a','b','c']

print("\n".join(tl))

a
b
c


In [440]:
obj.book_df.loc[obj.book_df.city=='Paris'][['title','lon']]

Unnamed: 0,title,lon
2,The Missing Italian Girl,2.348391
191,La Reine Margot,2.348391
271,Scaramouche,2.348391
343,The Four Horsemen of The Apocalypse,2.348391
425,An Officer and a Spy,2.348391
444,The Lymond Chronicles,2.348391
464,The Prague Cemetery,2.348391
597,The Hunchback of Notre Dame,2.348391
354,The Count of Monte Cristo,2.348391


In [446]:
def jitter_duplicate_coords(df):
    df2 = df.copy(deep=True)
    df2['dupe_idx'] = df2.groupby(['lon']).cumcount() # 0 if no duplicates, if not indexes the duplicate
    #df['lon_og'] = df['lon']
    df2['lon'] = df2['lon'] + 0.01*df2['dupe_idx']
    return df2

In [447]:
df_2 = jitter_duplicate_coords(obj.book_df)

In [448]:
df_2.to_csv('./../data/csv/book_db.csv')

In [449]:
df_2.loc[df_2.city=='Paris'][['title','lon']]

Unnamed: 0,title,lon
2,The Missing Italian Girl,2.348391
191,La Reine Margot,2.358391
271,Scaramouche,2.368391
343,The Four Horsemen of The Apocalypse,2.378391
425,An Officer and a Spy,2.388392
444,The Lymond Chronicles,2.398391
464,The Prague Cemetery,2.408392
597,The Hunchback of Notre Dame,2.418391
354,The Count of Monte Cristo,2.428392


In [417]:
obj.book_df = df_old

In [432]:
df_old = pd.read_csv('./../data/csv/book_db_backup.csv')

In [433]:
df_old.loc[df_old.city=='Paris'][['title','lon','lat','address','city','country','city_country']]

Unnamed: 0,title,lon,lat,address,city,country,city_country
1,The Missing Italian Girl,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
86,La Reine Margot,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
120,Scaramouche,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
152,The Four Horsemen of The Apocalypse,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
195,An Officer and a Spy,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
204,The Lymond Chronicles,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
214,The Prague Cemetery,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
275,The Hunchback of Notre Dame,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,France,"Paris, France"
332,The Count of Monte Cristo,2.348391,48.853495,"Paris, Île-de-France, France métropolitaine, F...",Paris,,


In [393]:
df_old['lon'] = df_old['lon_og']

In [401]:
obj.book_df[['lon','lon_og','country']].sort_values('country')

Unnamed: 0,lon,lon_og,country
284,69.177684,69.177684,Afghanistan
361,66.238514,66.238514,Afghanistan
114,69.177684,69.177684,Afghanistan
198,-58.443285,-58.443285,Argentina
470,-64.967282,-64.967282,Argentina
...,...,...,...
342,20.587999,20.587999,
343,-0.144055,-0.144055,
344,-4.114052,-4.114052,
345,-0.144055,-0.144055,


In [394]:
df_old.loc[df_old.country=='India'][['title','city', 'lat', 'lon']]

Unnamed: 0,title,city,lat,lon
40,Shalimar the Clown,Srinagar,34.074744,74.820444
42,Sea of Poppies,Calcutta,22.572646,88.363895
61,The Best of Times,Bombay,19.081577,72.886628
73,The Inheritance of Loss,Kalimpong,27.070287,88.472368
96,Veyi Padagalu,Subbannapeta,18.020226,83.548237
102,Durgeshnandini,"Mandaran (Hooghly district, West Bengal)",22.87888,87.664153
110,Clear Light of Day,Delhi,28.627393,77.171695
117,Sleeping on Jupiter,Jarmuli,22.351115,78.667743
126,Animal's People,Bhopal,23.258486,77.401989
141,The Moor's Last Sigh,Bombay and Cochin,22.351115,78.667743


In [395]:
obj.book_df = df_old

In [390]:
obj.book_df.loc[obj.book_df.country=='India'][['title','city', 'lat', 'lon']]

Unnamed: 0,title,city,lat,lon
87,Shalimar the Clown,Srinagar,34.074744,20.75296
98,Sea of Poppies,Calcutta,22.572646,-0.144055
145,The Best of Times,Bombay,19.081577,12.482932
166,The Inheritance of Loss,Kalimpong,27.070287,19.294097
218,Veyi Padagalu,Subbannapeta,18.020226,7.053979
229,Durgeshnandini,"Mandaran (Hooghly district, West Bengal)",22.87888,78.667743
241,Clear Light of Day,Delhi,28.627393,-4.114052
257,Sleeping on Jupiter,Jarmuli,22.351115,7.268391
284,Animal's People,Bhopal,23.258486,69.177684
321,The Moor's Last Sigh,Bombay and Cochin,22.351115,-0.112738


In [383]:
df_2 = jitter_duplicate_coords(obj.book_df)

In [384]:
df_2.loc[df_2.country=='India'][['title','city', 'lat', 'lon']]

Unnamed: 0,title,city,lat,lon
87,Shalimar the Clown,Srinagar,34.074744,74.820444
98,Sea of Poppies,Calcutta,22.572646,88.363895
145,The Best of Times,Bombay,19.081577,72.886628
166,The Inheritance of Loss,Kalimpong,27.070287,88.472368
218,Veyi Padagalu,Subbannapeta,18.020226,83.548237
229,Durgeshnandini,"Mandaran (Hooghly district, West Bengal)",22.87888,87.664153
241,Clear Light of Day,Delhi,28.627393,77.171695
257,Sleeping on Jupiter,Jarmuli,22.351115,78.667743
284,Animal's People,Bhopal,23.258486,77.401989
321,The Moor's Last Sigh,Bombay and Cochin,22.351115,78.667744


In [378]:
df_2.to_csv('./../data/csv/book_db.csv')

In [373]:
data = {'value': [0.3, 0.3, 0.3, 0.2,0.6, 0.6]}
df = pd.DataFrame(data)
df = df.assign(dupe=None)
df.dupe = df.value
df["POS"] = df.groupby(['value']).cumcount()
df['p_value'] = df['value'] + df.POS*0.00001
df
#result = df.groupby('value')['dupe'].transform(lambda x: x +0.0001*POS))


Unnamed: 0,value,dupe,POS,p_value
0,0.3,0.3,0,0.3
1,0.3,0.3,1,0.30001
2,0.3,0.3,2,0.30002
3,0.2,0.2,0,0.2
4,0.6,0.6,0,0.6
5,0.6,0.6,1,0.60001


In [365]:
result

0    0.30001
1    0.30002
2    0.30003
3    0.60004
4    0.60005
Name: dupe, dtype: float64

In [321]:
tdf2.loc[(tdf2.city.isna())]

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location,lat,lon,address
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road,,,,,,,,
6,Bring Larks and Heroes,Thomas Keneally,,Australia,Bring Larks and Heroes,,,,,,,,
10,The Colour of Blood,Brian Moore,,Poland,The Colour of Blood,,,,,,,,
14,The Last Empress,Anchee Min,,China,The Last Empress (novel),,,,,,,,
15,"Fasting, Feasting",Anita Desai,,India,"Fasting, Feasting",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,Under the Frog,Tibor Fischer,,Hungary,Under the Frog,,,,,,,,
654,Lost Children Archive,Valeria Luiselli,,U.S.,Lost Children Archive,,,,,,,,
663,From Scenes Like These,Gordon Williams,,Scotland,From Scenes Like These,,,,,,,,
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya,,,,,,,,


In [286]:
tdf.

[Location(Newburyport, Essex County, Massachusetts, 01950, United States, (42.8132845, -70.8819188, 0.0)),
 Location(Venezia, Veneto, 30121-30176, Italia, (45.4371908, 12.3345898, 0.0)),
 Location(Berwick-upon-Tweed, Northumberland, North of Tyne, England, United Kingdom, (55.7692442, -2.0026472, 0.0)),
 Location(Bradford, West Yorkshire, England, United Kingdom, (53.7944229, -1.7519186, 0.0)),
 Location(London, Greater London, England, United Kingdom, (51.5074456, -0.1277653, 0.0)),
 Location(Cardiff, Cymru / Wales, CF10 2AF, United Kingdom, (51.4816546, -3.1791934, 0.0)),
 None,
 Location(Piccadilly Circus, Saint Giles, Bloomsbury, City of Westminster, Greater London, England, W1J 9HT, United Kingdom, (51.5097922, -0.13442883634595293, 0.0)),
 Location(London, Greater London, England, United Kingdom, (51.5074456, -0.1277653, 0.0)),
 Location(Palmers Green, London Borough of Enfield, London, Greater London, England, N13 4PX, United Kingdom, (51.6222544, -0.1127378, 0.0)),
 Location(Cu

In [287]:
tdf.iloc[0:20]

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location,best_location
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport",,"(Newburyport, Essex County, Massachusetts, 019...",,,"(Newburyport, Essex County, Massachusetts, 019..."
20,The Comfort of Strangers,Ian McEwan,Venice,,The Comfort of Strangers,,"(Venezia, Veneto, 30121-30176, Italia, (45.437...",,,"(Venezia, Veneto, 30121-30176, Italia, (45.437..."
29,The Unlikely Pilgrimage of Harold Fry,Rachel Joyce,Berwick,,The Unlikely Pilgrimage of Harold Fry,,"(Berwick-upon-Tweed, Northumberland, North of ...",,,"(Berwick-upon-Tweed, Northumberland, North of ..."
46,If Nobody Speaks of Remarkable Things,Jon McGregor,Bradford,,If Nobody Speaks of Remarkable Things,,"(Bradford, West Yorkshire, England, United Kin...",,,"(Bradford, West Yorkshire, England, United Kin..."
68,The Birds on the Trees,Nina Bawden,London,,The Birds on the Trees,,"(London, Greater London, England, United Kingd...",,,"(London, Greater London, England, United Kingd..."
90,The Fortune Men,Nadifa Mohamed,Cardiff,,The Fortune Men,,"(Cardiff, Cymru / Wales, CF10 2AF, United King...",,,"(Cardiff, Cymru / Wales, CF10 2AF, United King..."
95,Nice Work,David Lodge,Rummidge,,Nice Work,,,,,
117,Breakfast on Pluto,Patrick McCabe,Piccadilly Circus,,Breakfast on Pluto,,"(Piccadilly Circus, Saint Giles, Bloomsbury, C...",,,"(Piccadilly Circus, Saint Giles, Bloomsbury, C..."
130,Bruno's Dream,Iris Murdoch,London,,Bruno's Dream,,"(London, Greater London, England, United Kingd...",,,"(London, Greater London, England, United Kingd..."
173,I'll Go to Bed at Noon,Gerard Woodward,Palmers Green,,I'll Go to Bed at Noon,,"(Palmers Green, London Borough of Enfield, Lon...",,,"(Palmers Green, London Borough of Enfield, Lon..."


In [249]:
k = 200
obj.df_city_country.iloc[k:k+20]

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location
437,The Eagle's Conquest,,"Gesoriacum, Isle of Thanet",Roman Empire,Under the Eagle,"Gesoriacum, Isle of Thanet, Roman Empire",,"(New Lotus House, 485, Roman Road, Old Ford, B...",
439,Dark Cloud,,Tenochtítlan,Mexico,Aztec (novel),"Tenochtítlan, Mexico","(Tenochtitlan, Durango, Municipio de Durango, ...","(Ciudad de México, Cuauhtémoc, Ciudad de Méxic...","(Tenochtitlan, Durango, Municipio de Durango, ..."
440,The Stars in the Bright Sky,Alan Warner,London,United Kingdom,The Stars in the Bright Sky,"London, United Kingdom","(London, Greater London, England, United Kingd...","(United Kingdom, (54.7023545, -3.2765753))","(London, Greater London, England, United Kingd..."
443,The Athenian Murders,José Carlos Somoza,Athens,Greece,The Athenian Murders,"Athens, Greece","(Athens-Clarke County Unified Government, Athe...","(Ελλάς, (38.9953683, 21.9877132))","(Αθήνα, Δήμος Αθηναίων, Περιφερειακή Ενότητα Κ..."
444,The Lymond Chronicles,[Dorothy Dunnett],Paris,France,Lymond Chronicles,"Paris, France","(Paris, Île-de-France, France métropolitaine, ...","(France, (46.603354, 1.8883335))","(Paris, Île-de-France, France métropolitaine, ..."
446,The Jade Peony,Wayson Choy,Vancouver,Canada,The Jade Peony,"Vancouver, Canada","(Vancouver, Metro Vancouver Regional District,...","(Canada, (61.0666922, -107.991707))","(Vancouver, Metro Vancouver Regional District,..."
447,Life and Fate,Vasily Grossman,Stalingrad,Soviet Union,Life and Fate,"Stalingrad, Soviet Union","(Волгоград, Волгоградская область, Южный федер...","(Soviet Union, Strada Lacului, Slobozia, Ialom...",
448,Doctor Zhivago,Boris Pasternak,Yuriatin (Perm),Russia,Doctor Zhivago (novel),"Yuriatin (Perm), Russia",,"(Россия, (64.6863136, 97.7453061))",
449,Frankissstein: A Love Story,Jeanette Winterson,Geneva,Switzerland,Frankissstein,"Geneva, Switzerland","(Genève, Schweiz/Suisse/Svizzera/Svizra, (46.2...","(Schweiz/Suisse/Svizzera/Svizra, (46.7985624, ...","(Genève, Schweiz/Suisse/Svizzera/Svizra, (46.2..."
450,Heat and Dust,Ruth Prawer Jhabvala,Satipur,India,Heat and Dust,"Satipur, India",,"(India, (22.3511148, 78.6677428))",


In [245]:
obj.df_list[0]

Unnamed: 0,title,author,city,country,page,city_country
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room,"San Francisco, United States"
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl,"Paris, France"
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red,"Rio de Janeiro, Brazil"
5,New York: A Novel,Edward Rutherfurd,New York City,United States,New York (novel),"New York City, United States"
7,According to Mark,Penelope Lively,London,England,According to Mark,"London, England"
...,...,...,...,...,...,...
665,Journey,James Michener,"Dawson, Yukon",Canada,Journey (novel),"Dawson, Yukon, Canada"
666,Footsteps,Pramoedya Ananta Toer,Batavia (today Indonesia),Indonesia,Jejak Langkah,"Batavia (today Indonesia), Indonesia"
667,The Rotters' Club,Jonathan Coe,Birmingham,United Kingdom,The Rotters' Club (novel),"Birmingham, United Kingdom"
669,Disgrace,J. M. Coetzee,Cape Town,South Africa,Disgrace,"Cape Town, South Africa"


In [244]:
obj.book_df

Unnamed: 0,title,author,city,country,page,city_country,city_location,country_location,city_country_location
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport",,"(Newburyport, Essex County, Massachusetts, 019...",,
20,The Comfort of Strangers,Ian McEwan,Venice,,The Comfort of Strangers,,"(Venezia, Veneto, 30121-30176, Italia, (45.437...",,
29,The Unlikely Pilgrimage of Harold Fry,Rachel Joyce,Berwick,,The Unlikely Pilgrimage of Harold Fry,,"(Berwick-upon-Tweed, Northumberland, North of ...",,
46,If Nobody Speaks of Remarkable Things,Jon McGregor,Bradford,,If Nobody Speaks of Remarkable Things,,"(Bradford, West Yorkshire, England, United Kin...",,
68,The Birds on the Trees,Nina Bawden,London,,The Birds on the Trees,,"(London, Greater London, England, United Kingd...",,
...,...,...,...,...,...,...,...,...,...
665,Journey,James Michener,"Dawson, Yukon",Canada,Journey (novel),"Dawson, Yukon, Canada","(Dawson City, Yukon, Y0B 1G0, Canada, (64.0606...","(Canada, (61.0666922, -107.991707))","(Dawson City, Yukon, Y0B 1G0, Canada, (64.0606..."
666,Footsteps,Pramoedya Ananta Toer,Batavia (today Indonesia),Indonesia,Jejak Langkah,"Batavia (today Indonesia), Indonesia",,"(Indonesia, (-2.4833826, 117.8902853))",
667,The Rotters' Club,Jonathan Coe,Birmingham,United Kingdom,The Rotters' Club (novel),"Birmingham, United Kingdom","(Birmingham, West Midlands Combined Authority,...","(United Kingdom, (54.7023545, -3.2765753))","(Birmingham, West Midlands Combined Authority,..."
669,Disgrace,J. M. Coetzee,Cape Town,South Africa,Disgrace,"Cape Town, South Africa","(Cape Town, City of Cape Town, Western Cape, 8...","(South Africa, (-28.8166236, 24.991639))","(Cape Town, City of Cape Town, Western Cape, 8..."


In [243]:
obj.book_df.to_csv('book_db.csv')

In [209]:
df = df.assign(a=np.nan, b=np.nan)
df

Unnamed: 0,title,author,city,country,page,test,x,y,z,a,b
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport",,yyy,yyys,yyyyyys,,
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room,,yyy,yyys,yyyyyys,,
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl,,yyy,yyys,yyyyyys,,
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road,,yyy,yyys,yyyyyys,,
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red,,yyy,yyys,yyyyyys,,
...,...,...,...,...,...,...,...,...,...,...,...
671,The Essence of the Thing,Madeleine St John,London,,The Essence of the Thing,,yyy,yyys,yyyyyys,,
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya,,yyy,yyys,yyyyyys,,
673,Witiko,Adalbert Stifter,,,Witiko of Prčice,,yyy,yyys,yyyyyys,,
674,A Fairly Honourable Defeat,Iris Murdoch,,,A Fairly Honourable Defeat,,yyy,yyys,yyyyyys,,


In [13]:
data_dir = './../data/json/'

In [192]:
df['z'] = df['x'] + df['y']

In [193]:
df

Unnamed: 0,title,author,city,country,page,test,x,y,z
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport",,yyy,yyys,yyyyyys
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room,,yyy,yyys,yyyyyys
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl,,yyy,yyys,yyyyyys
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road,,yyy,yyys,yyyyyys
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red,,yyy,yyys,yyyyyys
...,...,...,...,...,...,...,...,...,...
671,The Essence of the Thing,Madeleine St John,London,,The Essence of the Thing,,yyy,yyys,yyyyyys
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya,,yyy,yyys,yyyyyys
673,Witiko,Adalbert Stifter,,,Witiko of Prčice,,yyy,yyys,yyyyyys
674,A Fairly Honourable Defeat,Iris Murdoch,,,A Fairly Honourable Defeat,,yyy,yyys,yyyyyys


In [15]:
data_files = os.listdir(data_dir)
data_files

['Ducks, Newburyport.json',
 'The Mars Room.json',
 'The Missing Italian Girl.json',
 'The Famished Road.json',
 'Brazil Red.json',
 'New York (novel).json',
 'Bring Larks and Heroes.json',
 'According to Mark.json',
 'Briefing for a Descent into Hell.json',
 'Alaska (novel).json',
 'The Colour of Blood.json',
 "Mrs Eckdorf in O'Neill's Hotel.json",
 'Aubrey–Maturin series.json',
 "St. Urbain's Horseman.json",
 'The Last Empress (novel).json',
 'Fasting, Feasting.json',
 'Bewilderment.json',
 'Parrot and Olivier in America.json',
 'This Thing of Darkness.json',
 'Empress (novel).json',
 'The Comfort of Strangers.json',
 'The Playmaker.json',
 'Noble House (book).json',
 'Such a Fun Age.json',
 'The Rover (novel).json',
 'Sivagamiyin Sabadham.json',
 'The Emigrants (novels).json',
 'La Celestina.json',
 'Martin Dressler: The Tale of an American Dreamer.json',
 'The Unlikely Pilgrimage of Harold Fry.json',
 'Love and Summer.json',
 "The Handmaid's Tale.json",
 'La regenta.json',
 'Elizab

In [26]:
books_list = []
for data in data_files:
    with open(data_dir + data, 'r') as f:
        book = json.load(f)
        books_list.append(book)


        

In [131]:
df.page.nunique()

676

In [110]:
df = pd.DataFrame(books_list)

In [111]:
df.replace('NA', None, inplace=True)
df.replace('', None, inplace=True)

In [None]:
def str_concat(str1, str2):
    if str1 is None:
        return None

In [128]:
df.loc[df.city.isna()].apply(lambda x: geolocator.geocode(x.city), axis=1)

KeyboardInterrupt: 

In [68]:
geolocator = Nominatim(user_agent="geobooks")

In [None]:
df['city_location_obj'] = .apply(lambda x: geolocator.geocode(x.city), axis=1)

In [None]:
df.apply(lambda x: geolocator.geocode(x.city), axis=1)

In [132]:
df['test'] = np.nan
df

Unnamed: 0,title,author,city,country,page,test
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport",
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room,
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl,
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road,
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red,
...,...,...,...,...,...,...
671,The Essence of the Thing,Madeleine St John,London,,The Essence of the Thing,
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya,
673,Witiko,Adalbert Stifter,,,Witiko of Prčice,
674,A Fairly Honourable Defeat,Iris Murdoch,,,A Fairly Honourable Defeat,


In [704]:
location = geolocator.geocode('Travancore, India', exactly_one=True, language="en", addressdetails=True)

In [705]:
location

Location(Travancore Medical College Hospital, Salem - Kochi - Kanyakumari Road, Mevaram, Kollam, Kerala, 691589, India, (8.875029699999999, 76.64485857994526, 0.0))

In [586]:
', '.join(filter(None, [location.raw['address'].get('city'), location.raw['address'].get('country')]))


'India'

In [587]:
', '.join(filter(None, ['kolkata', location.raw['address'].get('country')]))


'kolkata, India'

In [579]:
None + 'hi'

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [562]:
y = geolocator.geocode('India')

In [566]:
y.raw['address']

KeyError: 'address'

In [208]:
print(location.address)

Nanno, Ville d'Anaunia, Comunità della Val di Non, Provincia di Trento, Trentino-Alto Adige/Südtirol, 38093, Italia


In [93]:
location is None

True

In [79]:
print((location.latitude, location.longitude))

(5.4065013, 100.2559077)


In [301]:
print((location.latitude, location.longitude))

(40.7127281, -74.0060152)


In [106]:
df.loc[df.page=='The House of Doors']

Unnamed: 0,title,author,city,country,page
153,The House of Doors,Tan Twan Eng,Penang,Federated Malay States,The House of Doors


In [112]:
df.apply(lambda x: print(x.city + ', ' + x.country), axis=1)

TypeError: can only concatenate str (not "NoneType") to str

In [108]:
df.apply(lambda x: geolocator.geocode(x.city), axis=1)

KeyboardInterrupt: 

In [87]:
df.loc[df.page=='The House of Doors'].apply(lambda x: print(x.city), axis=1)

Penang


153    None
dtype: object

In [47]:
df # 676 books total

Unnamed: 0,title,author,city,country,page
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport"
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red
...,...,...,...,...,...
671,The Essence of the Thing,Madeleine St John,London,,The Essence of the Thing
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya
673,Witiko,Adalbert Stifter,,,Witiko of Prčice
674,A Fairly Honourable Defeat,Iris Murdoch,,,A Fairly Honourable Defeat


In [58]:
df_city_country = df.loc[~(df.city.isna()) & ~(df.country.isna())] # got...only 312 books with both?
df_city_country

Unnamed: 0,title,author,city,country,page
1,The Mars Room,Rachel Kushner,San Francisco,United States,The Mars Room
2,The Missing Italian Girl,Barbara Corrado Pope,Paris,France,The Missing Italian Girl
4,Brazil Red,Jean-Christophe Rufin,Rio de Janeiro,Brazil,Brazil Red
5,New York: A Novel,Edward Rutherfurd,New York City,United States,New York (novel)
7,According to Mark,Penelope Lively,London,England,According to Mark
...,...,...,...,...,...
665,Journey,James Michener,"Dawson, Yukon",Canada,Journey (novel)
666,Footsteps,Pramoedya Ananta Toer,Batavia (today Indonesia),Indonesia,Jejak Langkah
667,The Rotters' Club,Jonathan Coe,Birmingham,United Kingdom,The Rotters' Club (novel)
669,Disgrace,J. M. Coetzee,Cape Town,South Africa,Disgrace


In [61]:
df_city_country.country.unique()

array(['United States', 'France', 'Brazil', 'England', 'United Kingdom',
       'Ireland', 'Canada', 'China', 'Australia',
       'Hong Kong (British Crown Colony)', 'Sweden', 'Spain', 'Poland',
       'Mexico', 'Great Britain', 'Sicily', 'Pakistan', 'USA', 'Israel',
       'Scotland', 'Italy', 'Germany', 'Global, various countries',
       'Greece', 'Nigeria', 'India', 'Dominican Republic', 'Belgium',
       'Portugal', 'Libya', 'Japan', 'Roman Empire', 'Sri Lanka',
       'Ukraine', 'Persia/Iran', 'Britannia', 'Egypt',
       'Federated Malay States', 'Philippines', 'Northern Ireland',
       'Haiti', 'Zambia/Democratic Republic of Congo', 'Morocco',
       'Tanzania', 'Roman Britain', 'England and France', 'Wales',
       'Hungary', 'Afghanistan', 'Dutch East Indies (now Indonesia)',
       'Russia', 'Malaysia', 'England, Ukraine', 'Roman Republic',
       'Austria-Hungary', 'Netherlands', 'Dorset', 'South Africa',
       'Georgia', 'Bosnia and Herzegovina', 'Kent', 'Sicily, Italy',

In [62]:
df.country.unique()

array([nan, 'United States', 'France', 'Nigeria', 'Brazil', 'Australia',
       'England', 'United Kingdom', 'Poland', 'Ireland', 'Canada',
       'China', 'India', 'Hong Kong (British Crown Colony)', 'Sweden',
       'Spain', 'Pakistan', 'Mexico', 'Italy', 'Great Britain',
       'Northern Ireland', 'Sicily', 'Egypt', 'Europe', 'Philippines',
       'Afghanistan', 'USA', 'Israel', 'Scotland', 'Danu', 'Britain',
       'Vietnam', 'Germany', 'Global, various countries', 'Greece', 'UK',
       'Zimbabwe', 'Dominican Republic', 'North Africa',
       'Solomon Islands', 'Netherlands', 'South Africa', 'Belgium',
       'Japan', 'Portugal', 'Soviet Union', 'Wales', 'Libya',
       'West Africa', 'Roman Empire', 'Sri Lanka', 'Ukraine',
       'Persia/Iran', 'Britannia', 'Federated Malay States', 'Haiti',
       'Greenland', 'Zambia/Democratic Republic of Congo', 'Morocco',
       'Russia', 'Tanzania', 'Near East', 'Roman Britain', 'Tasmania',
       'England and France', 'Hungary', 'Argentina

In [63]:
df.city.unique()

array(['Newburyport', 'San Francisco', 'Paris', nan, 'Rio de Janeiro',
       'New York City', 'London', 'Nome', 'Dublin', 'London and Montreal',
       'Beijing', 'Venice', 'Sydney Cove', 'Hong Kong', 'Hyères',
       'Karlshamn', 'Berwick', 'Rathmoye', 'Vetusta', 'Warsaw',
       'Kilnalough (fictional)', 'Toledo', 'Ashby-De-La-Zouch',
       'Bradford', 'Guildford', 'Virginia', 'Racalmuto', 'Santa Fé',
       'Lahore', 'California', 'Makor', 'Glasgow', 'Palermo',
       'Germany (Görlitz)',
       'London- proper and Marin County, California (and other fictional locations)',
       'Valladolid', 'Salisbury', 'Syracuse', 'Enugu', 'Adelaide',
       'Srinagar', 'Cardiff', 'Santo Domingo', 'Rummidge', 'Calcutta',
       'Kortrijk', 'Lisbon', 'Arfon', 'Shanghai', 'Piccadilly Circus',
       'Tripoli', 'Hiroshima', 'Ostia Antica', 'Colombo',
       'Appomattox Court House', 'Masada', 'Kiev', 'Canada', 'Esfahan',
       'Londinium', 'Nanjing', 'Bombay', 'Rio', 'Corrigan', 'Cairo',
       

In [129]:
df_country = df.loc[(df.city.isna()) & ~(df.country.isna())] # country but no city...
df_country

Unnamed: 0,title,author,city,country,page
3,The Famished Road,Ben Okri,,Nigeria,The Famished Road
6,Bring Larks and Heroes,Thomas Keneally,,Australia,Bring Larks and Heroes
10,The Colour of Blood,Brian Moore,,Poland,The Colour of Blood
14,The Last Empress,Anchee Min,,China,The Last Empress (novel)
15,"Fasting, Feasting",Anita Desai,,India,"Fasting, Feasting"
...,...,...,...,...,...
651,Under the Frog,Tibor Fischer,,Hungary,Under the Frog
654,Lost Children Archive,Valeria Luiselli,,U.S.,Lost Children Archive
663,From Scenes Like These,Gordon Williams,,Scotland,From Scenes Like These
672,Aag Ka Darya,Qurratulain Hyder,,India,Aag Ka Darya


In [60]:
df_city = df.loc[~(df.city.isna()) & (df.country.isna())]
df_city.head()

Unnamed: 0,title,author,city,country,page
0,"Ducks, Newburyport",,Newburyport,,"Ducks, Newburyport"
20,The Comfort of Strangers,Ian McEwan,Venice,,The Comfort of Strangers
29,The Unlikely Pilgrimage of Harold Fry,Rachel Joyce,Berwick,,The Unlikely Pilgrimage of Harold Fry
46,If Nobody Speaks of Remarkable Things,Jon McGregor,Bradford,,If Nobody Speaks of Remarkable Things
68,The Birds on the Trees,Nina Bawden,London,,The Birds on the Trees
