In [1]:
import pandas as pd
import sqlalchemy as sa
import pywikidata as wp

In [2]:
# Load the Wikidata sample (about 39,000 cities with population and GPS
# columns) and show the first rows as a sanity check.
wiki_csv_path = 'sample_data/query_cities_population_gps.csv'
df_wiki = pd.read_csv(wiki_csv_path)
df_wiki.head(5)

Unnamed: 0,cityLabel,population,gps
0,Edo,http://www.wikidata.org/.well-known/genid/87cd...,Point(139.774444444 35.683888888)
1,Barxelas,0,Point(-7.845761111 42.370038888)
2,A Garduñeira,0,Point(-7.968938888 42.340813888)
3,Q61379203,0,Point(-7.481878 42.713333)
4,Q20546571,0,Point(-7.679743 42.339763)


In [3]:
# Connection settings for the MusicBrainz PostgreSQL database on localhost.
# Only a user name is placed in the URL; no password is included.
HOST, PORT = "localhost", 5432
DATABASE, USER = "musicbrainz_db", "musicbrainz"

# SQLAlchemy URL for the psycopg2 driver: dialect+driver://user@host:port/database
sa_conn_str = f"postgresql+psycopg2://{USER}@{HOST}:{PORT}/{DATABASE}"

# The engine is shared by the cells below that open connections from it.
engine = sa.create_engine(sa_conn_str)

In [11]:
# Every MusicBrainz area (~120k rows) paired with the name of its area type.
# The LEFT JOIN keeps areas whose type has no match in area_type.
query = '''
    SELECT a.name as name, at.name as type
    FROM area AS a 
    LEFT JOIN area_type as at 
    ON a.type = at.id;
    '''

# Hold the connection only for the duration of the read.
with engine.connect() as conn:
    df_musicbrainz = pd.read_sql(sa.text(query), conn)

df_musicbrainz

Unnamed: 0,name,type
0,Greccio,Municipality
1,Canada,Country
2,Chile,Country
3,China,Country
4,Cambodia,Country
...,...,...
118959,Fairview Beach,City
118960,Sherman Oaks,District
118961,Islay,Island
118962,Bowmore,City


In [12]:
# Lowercase both name columns so the comparison below is case-insensitive.
# Note: this overwrites the original-cased values in place.
df_wiki['cityLabel'] = df_wiki['cityLabel'].str.lower()
df_musicbrainz['name'] = df_musicbrainz['name'].str.lower()

In [13]:
# True where a Wikidata city label also occurs as a MusicBrainz area name
# (exact match on the lowercased strings).
found_in_musicbrainz = df_wiki['cityLabel'].isin(df_musicbrainz['name'])

# Split the Wikidata cities into matched and unmatched subsets.
is_in_MeB = df_wiki[found_in_musicbrainz]
not_in_MeB = df_wiki[~found_in_musicbrainz]

In [18]:
# First ten Wikidata cities with no matching name in the MusicBrainz areas
not_in_MeB.iloc[:10]

Unnamed: 0,cityLabel,population,gps
1,barxelas,0,Point(-7.845761111 42.370038888)
2,a garduñeira,0,Point(-7.968938888 42.340813888)
3,q61379203,0,Point(-7.481878 42.713333)
4,q20546571,0,Point(-7.679743 42.339763)
5,lousadela,0,Point(-7.238972222 42.604722222)
6,q12398919,0,Point(-6.965963888 42.451605555)
7,q12393950,0,Point(-6.984194444 42.439966666)
8,q12381764,0,Point(-7.69 42.316111)
9,puerto navarino,0,Point(-68.3239 -54.9256)
20,mundobriga,0,Point(-1.7 41.25)


In [19]:
# Only preview unmatched cities with population > 1000, as entries with a
# smaller population are most likely fake in our case.
#
# The population column is not purely numeric: the sample contains URL
# placeholders (e.g. http://www.wikidata.org/.well-known/genid/...), on which
# astype(float) would raise ValueError. Coerce such values to NaN instead;
# NaN > 1000 is False, so those rows are simply left out of the preview.
population = pd.to_numeric(not_in_MeB['population'], errors='coerce')
not_in_MeB[population > 1000]

Unnamed: 0,cityLabel,population,gps
9813,vetrino,1001,Point(27.435355 43.317169)
9814,bârza,1001,Point(24.144543 44.324595)
9815,dracea,1001,Point(25.01666667 43.86666667)
9816,măceșu de jos,1001,Point(23.71666667 43.88333333)
9820,nușeni,1002,Point(24.2 47.1)
...,...,...,...
38874,saint petersburg,5384342,Point(30.316666666 59.95)
38889,pudong,5681512,Point(121.539722222 31.223055555)
38944,new york city,8804190,Point(-74.0 40.7)
38953,mexico city,9209944,Point(-99.145555555 19.419444444)
