## Relation Between USA Cities by COVID-19 positive cases

First, we import our libraries

In [1]:
from sklearn.cluster import KMeans
import matplotlib.colors as colors
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import requests
import folium

Now, load the dataset via pandas and show the firts five rows

In [2]:
df_covid = pd.read_csv('time_series_covid_19_confirmed_US.csv')
df_covid.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/14/20,9/15/20,9/16/20,9/17/20,9/18/20,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,1447,1463,1619,1624,1664,1673,1690,1691,1714,1715
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,4800,4812,5003,5021,5033,5047,5061,5087,5124,5141
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,626,629,809,809,824,830,835,838,848,851
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,581,580,612,617,619,628,632,636,635,638
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,1128,1139,1487,1504,1527,1542,1551,1560,1573,1580


Now, we can see how many cities we have in our dataset per state

In [3]:
print(len(df_covid['Admin2'].unique()))
df_covid['Province_State'].value_counts()

1979


Texas                       256
Georgia                     161
Virginia                    135
Kentucky                    122
Missouri                    118
Kansas                      107
Illinois                    104
North Carolina              102
Iowa                        101
Tennessee                    97
Nebraska                     95
Indiana                      94
Ohio                         90
Minnesota                    89
Michigan                     87
Mississippi                  84
Puerto Rico                  80
Oklahoma                     79
Arkansas                     77
Wisconsin                    74
Florida                      69
Alabama                      69
Pennsylvania                 69
South Dakota                 68
Colorado                     66
Louisiana                    66
New York                     64
California                   60
Montana                      58
West Virginia                57
North Dakota                 55
South Ca

And, using *Folium* we can see every city in a map

In [4]:
map_texas = folium.Map(location=[37.6, -95.665], zoom_start=4)

for lat, lng, neighborhood in zip(df_covid['Lat'], df_covid['Long_'], df_covid['Admin2']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_texas)
map_texas

To carry out this project, we want to use the 200 main cities with the most cases in the last date (cumulative cases)

In [5]:
top_covid_cities = df_covid.sort_values('9/23/20', ascending=False)[0:200]
top_covid_cities.head(10)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/14/20,9/15/20,9/16/20,9/17/20,9/18/20,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20
213,84006037,US,USA,840,6037.0,Los Angeles,California,US,34.308284,-118.228241,...,254656,255049,256148,257271,258516,259817,260797,261446,262133,263333
382,84012086,US,USA,840,12086.0,Miami-Dade,Florida,US,25.611236,-80.551706,...,164299,164688,165147,165595,165980,166516,166881,167153,167515,167880
640,84017031,US,USA,840,17031.0,Cook,Illinois,US,41.841448,-87.816588,...,135274,135690,136246,136993,137580,138576,139088,139583,140078,140623
108,84004013,US,USA,840,4013.0,Maricopa,Arizona,US,33.348359,-112.491815,...,137589,137870,138151,139051,139245,139586,139856,140006,140314,140409
2798,84048201,US,USA,840,48201.0,Harris,Texas,US,29.858649,-95.393395,...,117568,120336,120771,121628,122445,123264,123817,137946,138473,139017
2754,84048113,US,USA,840,48113.0,Dallas,Texas,US,32.766706,-96.77796,...,75648,75838,76149,76607,76962,77118,77889,78205,78205,78723
345,84012011,US,USA,840,12011.0,Broward,Florida,US,26.151847,-80.487256,...,74525,74689,74832,75048,75266,75499,75682,75801,75944,76146
1941,84036081,US,USA,840,36081.0,Queens,New York,US,40.710881,-73.816847,...,71180,71254,71309,71392,71474,71577,71682,71752,71818,71881
1923,84036047,US,USA,840,36047.0,Kings,New York,US,40.636183,-73.949356,...,66274,66381,66468,66586,66737,66971,67136,67227,67391,67510
1813,84032003,US,USA,840,32003.0,Clark,Nevada,US,36.214589,-115.013024,...,62812,62952,63077,63301,63603,63877,64160,64288,64470,64895


Now, we just use the State, City Name, Latitude, Longitude and last date, so we take them and change the column names to make the job easier

In [6]:
top_covid_cities = top_covid_cities[['Province_State', 'Admin2', 'Lat', 'Long_', '9/23/20']]
top_covid_cities.rename(columns={'Admin2': 'City', 'Long_': 'Long', '9/23/20': 'Total', 'Province_State': 'State'}, inplace=True)

In [7]:
top_covid_cities

Unnamed: 0,State,City,Lat,Long,Total
213,California,Los Angeles,34.308284,-118.228241,263333
382,Florida,Miami-Dade,25.611236,-80.551706,167880
640,Illinois,Cook,41.841448,-87.816588,140623
108,Arizona,Maricopa,33.348359,-112.491815,140409
2798,Texas,Harris,29.858649,-95.393395,139017
...,...,...,...,...,...
244,California,Sonoma,38.527464,-122.886251,7225
2169,Ohio,Lucas,41.621012,-83.654686,7205
439,Georgia,Clayton,33.541872,-84.355942,7086
1587,Missouri,Jackson,39.010022,-94.347245,7064


In [8]:
top_covid_cities['State'].value_counts()[0:5]

Florida       23
California    23
Texas         20
New York      11
New Jersey    11
Name: State, dtype: int64

It's time to define our *Forusquare* credentials

In [9]:
CLIENT_ID = 'F5WVH1A0TECURMUPJHJC33ZBNSKR2PFXRWDP2L4HLZGCFLUC' # your Foursquare ID
CLIENT_SECRET = 'JAHBVM44TQE4MK1BOO1CFKZH0BXUXDTS3CX2OTSB0EHEJMWI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 10 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F5WVH1A0TECURMUPJHJC33ZBNSKR2PFXRWDP2L4HLZGCFLUC
CLIENT_SECRET:JAHBVM44TQE4MK1BOO1CFKZH0BXUXDTS3CX2OTSB0EHEJMWI


Now, we use a function to create a pandas Dataframe to storage the 10 principal venuees of each city if the *Foursquare API* gives us an answer

In [10]:
def getNearTrendingbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except:
            continue

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City_Lat', 
                  'City_Long', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now, we call the function with the cities and coordinates

In [11]:
top_venues_per_city = getNearTrendingbyVenues(names=top_covid_cities['City'], 
                                   latitudes=top_covid_cities['Lat'],
                                   longitudes=top_covid_cities['Long'])
top_venues_per_city.head()

Los Angeles
Miami-Dade
Cook
Maricopa
Harris
Dallas
Broward
Queens
Kings
Clark
Riverside
Bexar
San Bernardino
Bronx
Orange
Tarrant
Nassau
Suffolk
Palm Beach
San Diego
Hillsborough
Orange
Westchester
Philadelphia
Wayne
New York
Kern
Hidalgo
Shelby
Duval
Salt Lake
Prince George's
Travis
Fresno
Mecklenburg
Milwaukee
Fulton
Middlesex
Gwinnett
Franklin
Hennepin
Davidson
Pima
Suffolk
St. Louis
El Paso
Cameron
Bergen
Montgomery
Sacramento
Pinellas
King
Marion
Essex
Alameda
Santa Clara
Hudson
Fairfax
Nueces
Oakland
Lee
San Joaquin
Fairfield
Polk
Jefferson
Essex
Cobb
Middlesex
Passaic
Providence
DeKalb
Baltimore
Union
Jefferson
Out of GA
Cuyahoga
Oklahoma
Wake
DuPage
Lake
Utah
Stanislaus
Tulsa
Contra Costa
Fort Bend
Richmond
Tulare
Charleston
Polk
Douglas
Baltimore City
District of Columbia
East Baton Rouge
Rockland
Jefferson
Richland
Hartford
New Haven
Macomb
Worcester
Greenville
Will
Webb
Mobile
Collin
Hamilton
Kane
Denver
Yuma
Collier
Ada
Ventura
Orleans
Ocean
Osceola
Prince William
Escambia


Unnamed: 0,City,City_Lat,City_Long,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cook,41.841448,-87.816588,Chef Shangri-La,41.843112,-87.822079,Asian Restaurant
1,Cook,41.841448,-87.816588,Super H Mart,41.841987,-87.821078,Grocery Store
2,Cook,41.841448,-87.816588,Komb's Beef,41.837933,-87.815256,Hot Dog Joint
3,Cook,41.841448,-87.816588,Veterans Park,41.843621,-87.812093,Baseball Field
4,Maricopa,33.348359,-112.491815,Ak-Chin Southern Dunes Golf Club,33.349017,-112.49102,Golf Driving Range


And we show the count of each values by City

In [12]:
top_venues_per_city.groupby('City').count()

Unnamed: 0_level_0,City_Lat,City_Long,Venue,Venue Latitude,Venue Longitude,Venue Category
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alachua,2,2,2,2,2,2
Alameda,1,1,1,1,1,1
Allegheny,4,4,4,4,4,4
Anne Arundel,2,2,2,2,2,2
Baltimore,10,10,10,10,10,10
...,...,...,...,...,...,...
Tulsa,10,10,10,10,10,10
Union,10,10,10,10,10,10
Utah,1,1,1,1,1,1
Wake,10,10,10,10,10,10


In [13]:
top_venues_per_city.groupby('City').count().sum()

City_Lat           583
City_Long          583
Venue              583
Venue Latitude     583
Venue Longitude    583
Venue Category     583
dtype: int64

To apply clustering, we need to change the format, so we can use the *get_dummies* pandas method to make *One-hot enconding*

In [14]:
top_venues_per_city_onehot = pd.get_dummies(top_venues_per_city[['Venue Category']])

top_venues_per_city_onehot['City'] = top_venues_per_city['City']

fixed_columns = [top_venues_per_city_onehot.columns[-1]] + list(top_venues_per_city_onehot.columns[:-1])
top_venues_per_city_onehot = top_venues_per_city_onehot[fixed_columns]

top_venues_per_city_onehot.head()

Unnamed: 0,City,Venue Category_ATM,Venue Category_Accessories Store,Venue Category_Advertising Agency,Venue Category_African Restaurant,Venue Category_American Restaurant,Venue Category_Antique Shop,Venue Category_Art Gallery,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,...,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Video Store,Venue Category_Warehouse Store,Venue Category_Waterfall,Venue Category_Wine Bar,Venue Category_Wings Joint,Venue Category_Yoga Studio,Venue Category_Zoo,Venue Category_Zoo Exhibit
0,Cook,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Cook,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Cook,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cook,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Maricopa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And we group this venues by City and get the mean of everyone

In [15]:
top_venues_grouped_by_city = top_venues_per_city_onehot.groupby('City').mean().reset_index()

In [16]:
top_venues_grouped_by_city.head()

Unnamed: 0,City,Venue Category_ATM,Venue Category_Accessories Store,Venue Category_Advertising Agency,Venue Category_African Restaurant,Venue Category_American Restaurant,Venue Category_Antique Shop,Venue Category_Art Gallery,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,...,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Video Store,Venue Category_Warehouse Store,Venue Category_Waterfall,Venue Category_Wine Bar,Venue Category_Wings Joint,Venue Category_Yoga Studio,Venue Category_Zoo,Venue Category_Zoo Exhibit
0,Alachua,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alameda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Allegheny,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Anne Arundel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Baltimore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now, we create a function to get the 10 top venues per City to visualize each place

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append(f'{ind+1}{indicators[ind]} Most Common Venue')
    except:
        columns.append(f'{ind+1}th Most Common Venue')

# create a new dataframe
top_venues_sorted = pd.DataFrame(columns=columns)
top_venues_sorted['City'] = top_venues_grouped_by_city['City']

for ind in np.arange(top_venues_grouped_by_city.shape[0]):
    top_venues_sorted.iloc[ind, 1:] = return_most_common_venues(top_venues_grouped_by_city.iloc[ind, :], num_top_venues)

top_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alachua,Venue Category_Seafood Restaurant,Venue Category_Intersection,Venue Category_Dog Run,Venue Category_Food,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Farm
1,Alameda,Venue Category_Trail,Venue Category_Zoo Exhibit,Venue Category_Dog Run,Venue Category_Food,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Farm
2,Allegheny,Venue Category_New American Restaurant,Venue Category_Check Cashing Service,Venue Category_Gym,Venue Category_Café,Venue Category_Zoo Exhibit,Venue Category_Dry Cleaner,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service,Venue Category_Fast Food Restaurant
3,Anne Arundel,Venue Category_Campground,Venue Category_Bike Trail,Venue Category_Dry Cleaner,Venue Category_Food Truck,Venue Category_Food,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market
4,Baltimore,Venue Category_Italian Restaurant,Venue Category_Mobile Phone Shop,Venue Category_Asian Restaurant,Venue Category_Lighting Store,Venue Category_Salon / Barbershop,Venue Category_Sushi Restaurant,Venue Category_Dessert Shop,Venue Category_Pizza Place,Venue Category_Shopping Mall,Venue Category_Farmers Market


And we are ready to start the clustering process! Let's cluster by City, searching 10 clusters and using KMeans

In [19]:

# set number of clusters
kclusters = 10

top_venues_clustered = top_venues_grouped_by_city.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(top_venues_clustered)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 4, 1, 1, 1, 1, 1, 1, 1, 1])

And we get the cluster labels and put them in ur principal Dataframe to show that clusters in a map

In [20]:
# add clustering labels
top_venues_sorted.insert(0, 'Cluster', kmeans.labels_)

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
top_covid_cities_final = pd.merge(top_covid_cities, top_venues_sorted, on='City')

top_covid_cities_final.head() # check the last columns!

Unnamed: 0,State,City,Lat,Long,Total,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Illinois,Cook,41.841448,-87.816588,140623,1,Venue Category_Hot Dog Joint,Venue Category_Asian Restaurant,Venue Category_Baseball Field,Venue Category_Grocery Store,Venue Category_Electronics Store,Venue Category_Food Truck,Venue Category_Food,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service
1,Arizona,Maricopa,33.348359,-112.491815,140409,1,Venue Category_Golf Driving Range,Venue Category_Farm,Venue Category_Zoo Exhibit,Venue Category_Dry Cleaner,Venue Category_Food Truck,Venue Category_Food,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Financial or Legal Service,Venue Category_Fast Food Restaurant
2,Texas,Harris,29.858649,-95.393395,139017,1,Venue Category_Food,Venue Category_Financial or Legal Service,Venue Category_Zoo Exhibit,Venue Category_French Restaurant,Venue Category_Food Truck,Venue Category_Flower Shop,Venue Category_Flea Market,Venue Category_Fast Food Restaurant,Venue Category_Farmers Market,Venue Category_Farm
3,Texas,Dallas,32.766706,-96.77796,78723,1,Venue Category_Playground,Venue Category_Rental Car Location,Venue Category_College Academic Building,Venue Category_Art Gallery,Venue Category_Coffee Shop,Venue Category_Dog Run,Venue Category_Zoo Exhibit,Venue Category_Donut Shop,Venue Category_Flower Shop,Venue Category_Flea Market
4,New York,Queens,40.710881,-73.816847,71881,1,Venue Category_Pizza Place,Venue Category_Shipping Store,Venue Category_Bank,Venue Category_Donut Shop,Venue Category_Deli / Bodega,Venue Category_Sushi Restaurant,Venue Category_Sandwich Place,Venue Category_Diner,Venue Category_Bus Stop,Venue Category_Farm


We're ready to show our results in a map!

In [21]:
# create map
map_clusters = folium.Map(location=[37.6, -95.665], zoom_start=4)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.inferno(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(top_covid_cities_final['Lat'], top_covid_cities_final['Long'], top_covid_cities_final['City'], top_covid_cities_final['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters