In [1]:
# !pip install haversine

In [280]:
import pandas as pd
import homebrew as hb

In [281]:
%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_json('artists_shows_shard.json', lines=True)

In [4]:
print(df.shape)
df.head(3)

(15729, 3)


Unnamed: 0,artist,date,loc
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US"
1,2 Chainz,2019-07-12 11:00:00,"London, UK"
2,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US"


The column named loc should be renamed to prevent confusion  
with the 'pandas.DataFrame.loc()' method

In [5]:
df.columns = ['artist', 'date', 'location']
df.head(3)

Unnamed: 0,artist,date,location
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US"
1,2 Chainz,2019-07-12 11:00:00,"London, UK"
2,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US"


Trim shows outside US

In [6]:
domestic_df = df[df['location'].str.endswith('US')]
print(domestic_df.shape)
domestic_df.head(3)

(9608, 3)


Unnamed: 0,artist,date,location
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US"
2,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US"
3,2 Chainz,2019-08-24 22:00:00,"Atlanta, GA, US"


Lets only make decisions based on shows with in the past five years.  
Trim any concerts before 2015.

In [7]:
recent_domestic_df = domestic_df[domestic_df.date >= '2015'].copy()
print(recent_domestic_df.shape)

(8642, 3)


### Lets merge/join some lat long info to these locations so we can compare them!

With some help from our friends at https://simplemaps.com/data/us-cities:  
We can get a csv of cities and their latititude and longitude from some census.

In [8]:
cities_df = pd.read_csv('uscitiesv1.5.csv')

In [9]:
print(cities_df.shape)
cities_df.head(2)

(37842, 16)


Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,population_proper,density,source,incorporated,timezone,zips,id
0,Prairie Ridge,Prairie Ridge,WA,Washington,53053,Pierce,47.1443,-122.1408,,,1349.8,polygon,False,America/Los_Angeles,98360 98391,1840037882
1,Edison,Edison,WA,Washington,53057,Skagit,48.5602,-122.4311,,,127.4,polygon,False,America/Los_Angeles,98232,1840017314


Let's just grab city, state id, lat and lng.

In [10]:
cities_df = cities_df[['city', 'state_id', 'lat', 'lng']]
cities_df.head(2)

Unnamed: 0,city,state_id,lat,lng
0,Prairie Ridge,WA,47.1443,-122.1408
1,Edison,WA,48.5602,-122.4311


Lets adjust these city and state values so they will match

In [11]:
cities_df.loc[:,'comb_city_state'] = cities_df['city'] + ', ' + cities_df['state_id'] + ', US'

In [12]:
cities_df.head()

Unnamed: 0,city,state_id,lat,lng,comb_city_state
0,Prairie Ridge,WA,47.1443,-122.1408,"Prairie Ridge, WA, US"
1,Edison,WA,48.5602,-122.4311,"Edison, WA, US"
2,Packwood,WA,46.6085,-121.6702,"Packwood, WA, US"
3,Wautauga Beach,WA,47.5862,-122.5482,"Wautauga Beach, WA, US"
4,Harper,WA,47.5207,-122.5196,"Harper, WA, US"


In [13]:
combined = pd.merge(left=recent_domestic_df, right=cities_df,
             left_on='location', right_on='comb_city_state', how='left')

In [14]:
combined.head()

Unnamed: 0,artist,date,location,city,state_id,lat,lng,comb_city_state
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US"
1,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US",Anaheim,CA,33.839,-117.8572,"Anaheim, CA, US"
2,2 Chainz,2019-08-24 22:00:00,"Atlanta, GA, US",Atlanta,GA,33.7626,-84.4228,"Atlanta, GA, US"
3,2 Chainz,2019-05-27 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US"
4,2 Chainz,2019-05-19 00:00:00,"Detroit, MI, US",Detroit,MI,42.3834,-83.1024,"Detroit, MI, US"


Confirm shape matches old shape, no DUPs

In [15]:
recent_domestic_df.shape[0] == combined.shape[0]

True

Are there any missing values?

In [16]:
print(combined.shape)
combined.isna().sum()

(8642, 8)


artist               0
date                 0
location             0
city               276
state_id           276
lat                276
lng                276
comb_city_state    276
dtype: int64

Where is it failing to connect?

In [17]:
combined[combined['city'].isna()][:18]

Unnamed: 0,artist,date,location,city,state_id,lat,lng,comb_city_state
56,2 Chainz,2017-09-16 00:00:00,"St Petersburg, FL, US",,,,,
68,2 Chainz,2017-08-30 01:00:00,"St Louis, MO, US",,,,,
71,2 Chainz,2017-08-26 02:30:00,"St. Paul, MN, US",,,,,
86,2 Chainz,2017-04-23 23:00:00,"Amherst, MA, US",,,,,
87,2 Chainz,2017-04-22 19:00:00,"Pozo, CA, US",,,,,
108,21 Savage,2019-07-31 01:00:00,"St. Paul, MN, US",,,,,
147,21 Savage,2018-05-27 00:00:00,"Darien Center, NY, US",,,,,
148,21 Savage,2018-05-24 23:00:00,"Mansfield, MA, US",,,,,
176,21 Savage,2017-12-09 02:00:00,"St Louis, MO, US",,,,,
198,21 Savage,2017-09-15 15:45:00,"Flushing, NY, US",,,,,


Won't get to all of these but probably can fix the 'saint's.  
How are they spelled in the csv?

In [18]:
cities_df[cities_df['city'].str.contains('Saint ')][:3]

Unnamed: 0,city,state_id,lat,lng,comb_city_state
657,Saint John,WA,47.0915,-117.5887,"Saint John, WA, US"
1232,Saint Charles,VA,36.8052,-83.0575,"Saint Charles, VA, US"
1592,Saint Paul,VA,36.9069,-82.3164,"Saint Paul, VA, US"


In [19]:
print(cities_df[cities_df['comb_city_state'].str.contains('Saint ')].shape)
print(cities_df[cities_df['comb_city_state'].str.contains('St. ')].shape)
print(cities_df[cities_df['comb_city_state'].str.contains('St ')].shape)

(249, 5)
(1, 5)
(0, 5)


In [20]:
cities_df[cities_df['city'].str.contains('St. ')]

Unnamed: 0,city,state_id,lat,lng,comb_city_state
33971,St. John,KS,38.0,-98.7611,"St. John, KS, US"


In the census csv, almost all the cities with 'Saint' in the name have it spelled out, not abbreviated ('St' or 'St.')  
Lets adjust the St John from the csv and then regularize these in our concert data

In [21]:
# Regularize cities csv "St. John Kansas"
cities_df.loc[:,'comb_city_state'] = cities_df['comb_city_state'].str.replace('St. ', 'Saint ')

In [22]:
cities_df[cities_df['comb_city_state'].str.contains('St. ')]

Unnamed: 0,city,state_id,lat,lng,comb_city_state


In [23]:
print(recent_domestic_df[recent_domestic_df['location'].str.contains('Saint ')].shape)
print(recent_domestic_df[recent_domestic_df['location'].str.contains('St. ')].shape)
print(recent_domestic_df[recent_domestic_df['location'].str.contains('St ')].shape)

(20, 3)
(58, 3)
(73, 3)


In [24]:
recent_domestic_df.loc[:, 'location'] = recent_domestic_df['location'].str.replace('St. ', 'St ')
recent_domestic_df.loc[:, 'location'] = recent_domestic_df['location'].str.replace('St ', 'Saint ')

In [25]:
display(recent_domestic_df[recent_domestic_df['location'].str.contains('St. ')])
display(recent_domestic_df[recent_domestic_df['location'].str.contains('St ')])

Unnamed: 0,artist,date,location


Unnamed: 0,artist,date,location


In [26]:
combined = pd.merge(left=recent_domestic_df, right=cities_df,
             left_on='location', right_on='comb_city_state', how='left')

In [27]:
print(combined.shape)
combined.isna().sum()

(8642, 8)


artist               0
date                 0
location             0
city               145
state_id           145
lat                145
lng                145
comb_city_state    145
dtype: int64

In [28]:
n = 145
N = 8642 + 145
print(f'{n} out of {N}, or {1- round(n/N,2)}%... Not bad')

145 out of 8787, or 0.98%... Not bad


In [29]:
combined.dropna(inplace=True)   # Getting rid of those nan's

In [30]:
combined.head()

Unnamed: 0,artist,date,location,city,state_id,lat,lng,comb_city_state
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US"
1,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US",Anaheim,CA,33.839,-117.8572,"Anaheim, CA, US"
2,2 Chainz,2019-08-24 22:00:00,"Atlanta, GA, US",Atlanta,GA,33.7626,-84.4228,"Atlanta, GA, US"
3,2 Chainz,2019-05-27 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US"
4,2 Chainz,2019-05-19 00:00:00,"Detroit, MI, US",Detroit,MI,42.3834,-83.1024,"Detroit, MI, US"


In [31]:
combined['comb_lat_long'] = combined.lat.astype(str) + ', ' + combined.lng.astype(str)
# combined['comb_lat_long'] = combined['comb_lat_long'].str.split(', ')

In [32]:
combined.head()

Unnamed: 0,artist,date,location,city,state_id,lat,lng,comb_city_state,comb_lat_long
0,2 Chainz,2019-06-30 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US","36.2291, -115.2607"
1,2 Chainz,2019-08-10 00:00:00,"Anaheim, CA, US",Anaheim,CA,33.839,-117.8572,"Anaheim, CA, US","33.839, -117.8572"
2,2 Chainz,2019-08-24 22:00:00,"Atlanta, GA, US",Atlanta,GA,33.7626,-84.4228,"Atlanta, GA, US","33.7626, -84.4228"
3,2 Chainz,2019-05-27 05:30:00,"Las Vegas, NV, US",Las Vegas,NV,36.2291,-115.2607,"Las Vegas, NV, US","36.2291, -115.2607"
4,2 Chainz,2019-05-19 00:00:00,"Detroit, MI, US",Detroit,MI,42.3834,-83.1024,"Detroit, MI, US","42.3834, -83.1024"


### Can we iterate over two artists?

In [43]:
from sklearn.neighbors import  NearestNeighbors

In [146]:
artist_df.loc[62]

location    Cincinnati, OH, US
lat                    39.1412
lng                   -84.5059
Name: 62, dtype: object

In [207]:
artists = combined['artist'].unique()
artists

'3 Doors Down'

In [209]:
main_artist = artists[4]
sim_artist = artists[6]
print('main: ', main_artist)
print('sim: ', sim_artists)

main:  5 Seconds of Summer
sim:  6LACK


In [237]:
main_artist_df = combined.loc[combined['artist']==main_artist,
                              ['location', 'lat', 'lng']]
main_artist_df.drop_duplicates(subset='location', inplace=True)
main_artist_df.reset_index(drop=True, inplace=True)    
                                                                        # concerts dataframe for main artist
print(main_artist_df.shape)
main_artist_df.head(2), main_artist_df.tail(2)

(59, 3)


(             location      lat      lng
 0  Cincinnati, OH, US  39.1412 -84.5059
 1      Boston, MA, US  42.3188 -71.0846,
                  location      lat      lng
 57  Silver Spring, MD, US  39.0028 -77.0207
 58  The Woodlands, TX, US  30.1738 -95.5134)

In [238]:
main_artist_places = main_artist_df['location'].unique()       
                                    # These are the unique string locations eg. 'Manhattan, NY, US'
# main_artist_places

In [265]:
nn = NearestNeighbors(n_neighbors=1, radius=1, n_jobs=-1,)     
nn.fit(main_artist_df[['lat', 'lng']],)                                  # Train nn to remember 2chainz lat longs

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=1, p=2, radius=1)

In [266]:
sim_artists_places = combined.loc[combined['artist']==sim_artist,'location'].unique()
# sim_artists_places

In [278]:
for place in sim_artists_places[:10]:                                                     # Iterate through 21 Savages unique cities
    if place not in main_artist_places:                                               # If town 2Chanz hasn't played there
                                                                                             # Get the lat lng and assign it to var
        place_latlong = combined.loc[combined['location']==place, ['lat', 'lng']] # keep it 2D array
        place_latlong.drop_duplicates(inplace=True)                                                                 # for nn to predict on 
        print(place, '\n', place_latlong)


        distances, indices = nn.kneighbors(place_latlong, n_neighbors=5)
        for idx in indices:
            print(main_artist_df.loc[idx])
        print(distances, indices, '\n\n')
#         nn_distance, nn_index = distances[0][0], indices[0][0]
# #         print(nn_index)
# #         print(main_artist_df.reset_index(drop=True))
#         neighbor = main_artist_df.loc[nn_index]
        
#         print('\t-', place in main_artist_places, '-')
#         print('_Place_ : ', place)
#         print('_Place_latlong_ : ', place_latlong, '\n')
#         print('dist_to_nn: ', round(nn_distance, 4))
        
# #         print('nn_idx: ', nn_index)
#         print(neighbor.values)
#         print('\n')

None
Memphis, TN, US 
         lat      lng
85  35.1047 -89.9773
               location      lat      lng
11    Nashville, TN, US  36.1714 -86.7844
22  Saint Louis, MO, US  38.6358 -90.2451
45       Rogers, AR, US  36.3173 -94.1514
16  New Orleans, LA, US  30.0687 -89.9288
20   Louisville, KY, US  38.1662 -85.6488
[[3.36637183 3.54124047 4.34666649 5.03623354 5.30176334]] [[11 22 45 16 20]] 


None
Huntington, WV, US 
          lat      lng
595  38.4109 -82.4344
              location      lat      lng
8     Columbus, OH, US  39.9860 -82.9852
0   Cincinnati, OH, US  39.1412 -84.5059
6   Pittsburgh, PA, US  40.4396 -79.9763
20  Louisville, KY, US  38.1662 -85.6488
47   Charlotte, NC, US  35.2079 -80.8303
[[1.66862837 2.19646314 3.18714281 3.22370058 3.58222638]] [[ 8  0  6 20 47]] 


None
Landover, MD, US 
          lat      lng
596  38.9241 -76.8875
                 location      lat      lng
9      Washington, DC, US  38.9047 -77.0163
57  Silver Spring, MD, US  39.0028 -77.0207
53   

In [279]:
for place in sim_artists_places[:8]:                                                     # Iterate through 21 Savages unique cities
    if place not in main_artist_places:                                               # If town 2Chanz hasn't played there
                                                                                             # Get the lat lng and assign it to var
        place_latlong = combined.loc[combined['location']==place, ['lat', 'lng']]
        place_latlong.drop_duplicates(inplace=True)                                        # keep it 2D array
                                                                                            # for nn to predict on         
        distances, indices = nn.kneighbors(place_latlong, n_neighbors=5)
        

        nn_distance, nn_index = distances[0][0], indices[0][0]

        neighbor = main_artist_df.loc[nn_index]
        
        print('\t-', place in main_artist_places, '-')
        print('_Place_ : ', place)
        print('_Place_latlong_ : ', place_latlong.values, '\n')
        print('dist_to_nn: ', round(nn_distance, 4))
        
#         print('nn_idx: ', nn_index)
        print(neighbor.values)
        print('\n')

	- False -
_Place_ :  Memphis, TN, US
_Place_latlong_ :  [[ 35.1047 -89.9773]] 

dist_to_nn:  3.3664
['Nashville, TN, US' 36.1714 -86.7844]


	- False -
_Place_ :  Huntington, WV, US
_Place_latlong_ :  [[ 38.4109 -82.4344]] 

dist_to_nn:  1.6686
['Columbus, OH, US' 39.986 -82.9852]


	- False -
_Place_ :  Landover, MD, US
_Place_latlong_ :  [[ 38.9241 -76.8875]] 

dist_to_nn:  0.1303
['Washington, DC, US' 38.9047 -77.0163]


	- False -
_Place_ :  Chapel Hill, NC, US
_Place_latlong_ :  [[ 35.9269 -79.039 ]] 

dist_to_nn:  1.9302
['Charlotte, NC, US' 35.2079 -80.8303]


	- False -
_Place_ :  Raleigh, NC, US
_Place_latlong_ :  [[ 35.8323 -78.6439]] 

dist_to_nn:  2.2738
['Charlotte, NC, US' 35.2079 -80.8303]




Euclidean Distance does not account for the curvature of the Earth and is therefore not truly accurate, but will only be meaningfully off on rare occasions.  Also the resulting distances are surprisingly close to driving times between cities in hours

In [287]:
potential_new_locations = hb.pilfer_similar_artist(ref_df=combined,
                                                   main_artist='2 Chainz',
                                                   similar_artist='21 Savage')

In [289]:
sorted(potential_new_locations, key=lambda x: x['distance'], reverse=True)

[{'new': 'Albuquerque, NM, US',
  'nearest_old': 'Denver, CO, US',
  'distance': 4.98201329685098},
 {'new': 'Dallas, TX, US',
  'nearest_old': 'Austin, TX, US',
  'distance': 2.6808129102941916},
 {'new': 'Irving, TX, US',
  'nearest_old': 'Oklahoma City, OK, US',
  'distance': 2.665184458907105},
 {'new': 'Clive, IA, US',
  'nearest_old': 'Cedar Rapids, IA, US',
  'distance': 2.1487280167578167},
 {'new': 'Portland, OR, US',
  'nearest_old': 'Seattle, WA, US',
  'distance': 2.1091833893713474},
 {'new': 'Tulsa, OK, US',
  'nearest_old': 'Oklahoma City, OK, US',
  'distance': 1.739777284597078},
 {'new': 'Cleveland, OH, US',
  'nearest_old': 'Detroit, MI, US',
  'distance': 1.686388003989599},
 {'new': 'Indianapolis, IN, US',
  'nearest_old': 'Louisville, KY, US',
  'distance': 1.6858255574050298},
 {'new': 'Panama City Beach, FL, US',
  'nearest_old': 'Tallahassee, FL, US',
  'distance': 1.6393726391519343},
 {'new': 'Simpsonville, SC, US',
  'nearest_old': 'Charlotte, NC, US',
  'di