In [None]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl
import pandas as pd

In [None]:
# Empty dataframe whose columns name the fields kept for every recording

column_names = [
    'ID',
    'English_name',
    'Locality',
    'Date',
    'Time',
    'Latitude',
    'Longitude',
]
df = pd.DataFrame(columns=column_names)

In [None]:
# Downloading all United Kingdom recordings from the xeno-canto API (v2),
# one result page at a time.

# Default SSL context: certificate and hostname verification stay enabled so
# the downloaded data cannot be silently tampered with in transit.
ctx = ssl.create_default_context()

xeno_canto__API_v2 = 'https://www.xeno-canto.org/api/2/recordings?query=cnt:united_kingdom'

numPages = 1   # placeholder; the real page count is read from each response
page = 0
records = []   # one dict per recording; turned into a dataframe once at the end

while page < numPages:
    page += 1
    url = xeno_canto__API_v2 + '&page=' + str(page)

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url, context=ctx) as uh:
        status = uh.getcode()
        data = uh.read().decode()

    # Stop loudly instead of carrying on with a partial or invalid page.
    if status != 200:
        raise RuntimeError('Error on downloading: {} {}'.format(url, status))
    try:
        js = json.loads(data)
    except json.JSONDecodeError as err:
        raise RuntimeError('Error on loading JSON from ' + url) from err

    numPages = int(js['numPages'])
    page = int(js['page'])
    numRecordings = js['numRecordings']

    for item in js['recordings']:
        records.append({'ID': item['id'],
                        'English_name': item['en'],
                        'Locality': item['loc'],
                        'Latitude': item['lat'],
                        'Longitude': item['lng'],
                        'Date': item['date'],
                        'Time': item['time']})

# Building the frame once avoids the quadratic row-by-row DataFrame.append
# (which no longer exists in pandas 2.0); column order follows column_names.
df = pd.DataFrame(records, columns=column_names)

In [None]:
# Saving the downloaded recordings to df.csv (row index not written)

df.to_csv('df.csv', index=False)

Data wrangling

In [None]:
import numpy as np

In [None]:
# Reload the saved recordings
df = pd.read_csv('df.csv')

# Report, column by column, how many values are missing (True) or present (False)
missing_data = df.isnull()
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print("")

# Treat "?" as missing, drop rows that have no latitude, then renumber the rows
df = (
    df.replace("?", np.nan)
      .dropna(subset=['Latitude'], axis=0)
      .reset_index(drop=True)
)

Adding a county column to the dataframe by reverse geocoding the coordinates through ArcGIS (geocoder package)

In [None]:
!conda install -c conda-forge geocoder --yes 
import geocoder

In [None]:
# Reverse-geocode every recording's coordinates and keep the "Subregion" field
# of the ArcGIS address as the county.
County = []

for lat, lng in zip(df['Latitude'], df['Longitude']):
    try:
        g = geocoder.arcgis([lat, lng], method='reverse')
        County.append(g.json["raw"]["address"]["Subregion"])
    except Exception:
        # A failed lookup or missing field is recorded as 'Error'.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt this long loop.
        County.append('Error')

df['County'] = County

In [None]:
# Saving the dataframe with the geocoded County column (row index not written)
df.to_csv('df_arcgis.csv', index=False)

Filling in the county from the xeno-canto locality text where the geocoded county is not a known county

In [None]:
# Building the list of county names from the geojson file (https://data.gov.uk/dataset/d6f97a1a-25dc-485c-9af3-0e5681465d77/counties-and-unitary-authorities-december-2016-full-clipped-boundaries-in-england-and-wales)

with open('Counties_and_Unitary_Authorities_December_2016_Full_Clipped_Boundaries_in_England_and_Wales.geojson') as f:
    data = json.load(f)

# One name per geojson feature, taken from its 'ctyua16nm' property
Counties = [item['properties']['ctyua16nm'] for item in data['features']]

In [None]:
# Decide the final county of every recording:
#   1. keep the geocoded county when it is one of the known counties;
#   2. otherwise use the first of the first three ', '-separated parts of
#      Locality that is a known county;
#   3. otherwise None.
known_counties = set(Counties)   # set membership instead of scanning the list
County = []

for county, locality in zip(df['County'], df['Locality']):
    if county in known_counties:
        County.append(county)
        continue
    # An empty Locality becomes NaN (a float) after the CSV round-trip, so it
    # has no .split; treat anything that is not a string as "no parts".
    parts = locality.split(', ') if isinstance(locality, str) else []
    County.append(next((part for part in parts[:3] if part in known_counties), None))

df['County'] = County

In [None]:
# Saving the dataframe with the cleaned County column (row index not written)
df.to_csv('df_agg.csv', index=False)

Creating map with initial results

In [None]:
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
!conda install -c conda-forge folium=0.5.0 --yes 
import folium
import json
!conda install -c conda-forge geocoder --yes 
import geocoder

In [None]:
# Reading the per-recording data saved earlier as df_agg.csv
df = pd.read_csv('df_agg.csv')

In [None]:
# Number of observations per county as a two-column frame (County, Observations).
# rename_axis / reset_index(name=...) set both column names explicitly, so the
# result does not depend on how value_counts() names its index and values
# (that naming changed in pandas 2.0 and broke the previous rename mapping).
agg = (
    df['County']
    .value_counts()
    .rename_axis('County')
    .reset_index(name='Observations')
)
df = agg

In [None]:
# Saving the per-county observation counts (the row index is written as well)
df.to_csv('counties.csv')

In [None]:
# Loading the county boundary geoJSON; its features are matched to the data
# through their 'ctyua16nm' property when the choropleth is drawn
with open('Counties_and_Unitary_Authorities_December_2016_Full_Clipped_Boundaries_in_England_and_Wales.geojson') as f:
    ew_geo = json.load(f)

In [None]:
# Base map centred on the geocoded position of 'United Kingdom'
g = geocoder.arcgis('United Kingdom')
ew_map = folium.Map(location=[g.lat, g.lng], zoom_start=6, tiles='Mapbox Bright')

# Colour bins: 0..700 in steps of 100, closed by a bound just above the largest count
threshold_scale = list(range(0, 800, 100))
threshold_scale.append(int(df['Observations'].max()) + 1)

# Shade every county polygon by its number of observations
ew_map.choropleth(
    geo_data=ew_geo,
    data=df,
    columns=['County', 'Observations'],
    key_on='feature.properties.ctyua16nm',
    fill_color='Blues',
    fill_opacity=0.7,
    threshold_scale=threshold_scale,
    line_opacity=0.2,
    legend_name='Observations in England and Wales Counties',
)


In [None]:
# Saving the choropleth map as a standalone HTML file
m = ew_map
m.save("ew_map2.html")

Generating the base dataframe for further clustering of counties

In [None]:
import pandas as pd
!conda install -c conda-forge geocoder --yes 
import geocoder

In [None]:
# Empty frame that will hold one row per county together with its coordinates
column_names = [
    'Country',
    'County',
    'Latitude',
    'Longitude',
]
ew_data = pd.DataFrame(columns=column_names)

In [None]:
# Loading the per-recording data (with the cleaned County column) from df_agg.csv
df_agg = pd.read_csv('df_agg.csv')

In [None]:
# Coordinate lists, filled by the geocoding loop
Latitude, Longitude = [], []

# One row per distinct County value found in df_agg
ew_data['County'] = df_agg['County'].unique()

In [None]:
# Geocode every county name (with ', UK' appended) through ArcGIS.
# The lists are reset here so that re-running this cell does not append a
# second copy of the coordinates.
Latitude = []
Longitude = []

for county in ew_data['County']:
    try:
        g = geocoder.arcgis(county + ', UK')
        lat, lng = g.lat, g.lng
    except Exception:
        # Covers a missing county (NaN + str raises TypeError) and failed
        # lookups; `Exception` keeps Ctrl-C working, unlike a bare except.
        lat, lng = '', ''
    # Append both values together so the two lists always stay the same length.
    Latitude.append(lat)
    Longitude.append(lng)

In [None]:
# Attaching the geocoded coordinates; both lists are expected to hold one entry per row of ew_data
ew_data['Latitude']=Latitude
ew_data['Longitude']=Longitude

In [None]:
# Saving the county coordinates to ew_data.csv (row index not written)
ew_data.to_csv('ew_data.csv', index=False)

Preparation for k-means clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes 
import folium
!conda install -c conda-forge geocoder --yes 
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
# Load the recordings, mark every missing value as '?', and keep only rows with
# a known county. reset_index() keeps the old row numbers as an 'index' column.
df = pd.read_csv('df_agg.csv').replace(np.nan, '?')
df = df.loc[df['County'] != '?'].reset_index()

In [None]:
# Counting, for each county, the number of entries in every column
df_sum = df.groupby('County').count()

In [None]:
# Number of distinct values of 'English_name' (species) across all retained counties

print('There are {} uniques species.'.format(len(df['English_name'].unique())))

Analyzing Each County


In [None]:
# One-hot encoding of the species name: one indicator column per species,
# named after the species itself (empty prefix and separator)
ew_onehot = pd.get_dummies(df[['English_name']], prefix="", prefix_sep="")

In [None]:
# Adding the county of each recording back next to its one-hot species columns
ew_onehot['County'] = df['County']

In [None]:
# Moving the last column (County, added in the previous step) to the front
fixed_columns = [ew_onehot.columns[-1]] + list(ew_onehot.columns[:-1])
ew_onehot = ew_onehot[fixed_columns]

In [None]:
# Grouping rows by county and averaging the one-hot columns, which gives the
# share of each species among that county's recordings

ew_grouped = ew_onehot.groupby('County').mean().reset_index()

In [None]:
# Printing each county along with its 5 most frequent species

num_top_species = 5

for county in ew_grouped['County']:
    print("----"+county+"----")
    # Transpose the county's single row so that every species becomes a row
    temp = ew_grouped[ew_grouped['County'] == county].T.reset_index()
    temp.columns = ['species','freq']
    # Skip the first row, which holds the 'County' entry rather than a species
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_species))
    print('\n')

In [None]:
# Sorting the species in descending order to use this later on dataframe.

def return_most_common_species(row, num_top_species):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (in this notebook it holds the
    county name); the remaining entries are sorted by value in descending
    order and the index labels of the first ``num_top_species`` are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_species]

In [None]:
# Settings for the table of the top 20 species per county.
    
num_top_species = 20
# Ordinal suffixes for ranks 1-3; later ranks get 'th' when the labels are built
indicators = ['st', 'nd', 'rd']

In [None]:
# Column labels: 'County' followed by '1st', '2nd', '3rd', '4th', ... Most Common Species
columns = ['County']
for ind in range(num_top_species):
    rank = ind + 1
    if ind < len(indicators):
        # The first ranks take their suffix from `indicators`
        columns.append('{}{} Most Common Species'.format(rank, indicators[ind]))
    else:
        columns.append('{}th Most Common Species'.format(rank))

In [None]:
# Creating a new dataframe: one row per county, followed by its top species
counties_species_sorted = pd.DataFrame(columns=columns)
counties_species_sorted['County'] = ew_grouped['County']

# Fill every column after 'County' with that county's species, most frequent first
for ind in np.arange(ew_grouped.shape[0]):
    counties_species_sorted.iloc[ind, 1:] = return_most_common_species(ew_grouped.iloc[ind, :], num_top_species)

Clustering Counties

In [None]:
# Setting number of clusters
kclusters = 5
# Feature matrix for k-means: the per-county species shares without the label column.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# second positional argument in pandas 2.0.
ew_grouped_clustering = ew_grouped.drop(columns='County')

In [None]:
# Running k-means clustering on the per-county species shares
# (random_state is fixed at 0, presumably so repeated runs give the same clusters)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ew_grouped_clustering)

Creating a new dataframe that includes the cluster as well as the top 20 species for each county.

In [None]:
# Reading the county coordinates saved earlier as ew_data.csv
ew_data = pd.read_csv('ew_data.csv')

In [None]:
# Adding the k-means label of each county as the first column.
# NOTE(review): insert() modifies the frame in place, so re-running this cell
# likely fails because the column already exists - confirm.
counties_species_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
ew_merged = ew_data

In [None]:
# Joining on County so every county's coordinates sit next to its cluster
# label and top species
ew_merged = ew_merged.join(counties_species_sorted.set_index('County'), on='County')

In [None]:
# Data cleaning: drop rows without a county, renumber the rows, and store the
# cluster label as an integer

ew_merged = ew_merged.dropna(subset=['County'], axis=0).reset_index(drop=True)
ew_merged['Cluster Labels'] = ew_merged['Cluster Labels'].astype(int)

Visualizing the resulting clusters

In [None]:
# Creating the base map, centred on the geocoded position of 'United Kingdom'
g = geocoder.arcgis('United Kingdom')
map_clusters = folium.Map(location=[g.lat, g.lng], zoom_start=6)

In [None]:
# Setting the color scheme: one hex colour per cluster, evenly spaced on the
# rainbow colormap
x = np.arange(kclusters)
# Only len(ys) (== kclusters) is used below; the values themselves are not read
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [None]:
# Adding one circle marker per county, coloured by its cluster
markers_colors = []  # NOTE(review): never appended to in this cell
for lat, lon, poi, cluster in zip(ew_merged['Latitude'], ew_merged['Longitude'], ew_merged['County'], ew_merged['Cluster Labels']):
    # Popup text: county name followed by its cluster number
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Index is cluster-1, so cluster 0 takes rainbow[-1] (the last colour)
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)


In [None]:
# Saving the cluster map as a standalone HTML file
m = map_clusters
m.save("map_clusters.html")

 Examining Clusters

In [None]:
# One frame per cluster label (0-4), keeping the column at position 1 (County)
# and every column from position 5 onward (the ranked species columns)
cluster_columns = ew_merged.columns[[1] + list(range(5, ew_merged.shape[1]))]
k1, k2, k3, k4, k5 = (
    ew_merged.loc[ew_merged['Cluster Labels'] == label, cluster_columns]
    for label in range(5)
)

In [None]:
# Saving results: one CSV file per cluster
for filename, cluster_frame in (
    ('k1.csv', k1),
    ('k2.csv', k2),
    ('k3.csv', k3),
    ('k4.csv', k4),
    ('k5.csv', k5),
):
    cluster_frame.to_csv(filename)

Generating figures for the report

In [None]:
import numpy as np  
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

Generating the histogram of observations per county for the report

In [None]:
# Load the recordings, mark every missing value as '?', and keep only rows with
# a known county. reset_index() keeps the old row numbers as a leading 'index'
# column.
df = pd.read_csv('df_agg.csv').replace(np.nan, '?')
df = df.loc[df['County'] != '?'].reset_index()

In [None]:
# Observations per county: count the IDs in each county, leave out
# 'North Yorkshire', order from most to fewest and name the column 'Observations'
df_sum = (
    df.groupby('County')[['ID']]
      .count()
      .drop(['North Yorkshire'])
      .sort_values(['ID'], ascending=False, axis=0)
      .rename(columns={'ID': 'Observations'})
)

In [None]:
# Bin edges from numpy are reused as x ticks of the histogram
count, bin_edges = np.histogram(df_sum)

ax = df_sum.plot(kind='hist', figsize=(8, 5), xticks=bin_edges)

ax.set_title('Histogram of Observations per County')
ax.set_ylabel('Number of Counties')
ax.set_xlabel('Number of Observations')

# Remove the legend and the top/right frame lines
ax.get_legend().remove()
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

In [None]:
# Save through the Axes' own Figure. With the inline notebook backend the
# figure is closed when its plotting cell finishes, so plt.savefig() in a
# later cell would write a new, empty figure instead of the histogram.
ax.get_figure().savefig('histogram.png')

Generating the figure of the distribution of observations over the year (per month) for the report

In [None]:
# Number of observations per calendar month, taken from the 'Date' column.
# The column is selected by name, so the result does not depend on column positions.
df_date = df[['Date']].copy()

Month = []

for date in df_date['Date'].astype(str):
    # The month is the second field of a '-' or '.' separated date
    if '-' in date:
        Month.append(date.split('-')[1])
    elif '.' in date:
        Month.append(date.split('.')[1])
    else:
        Month.append('')   # no recognisable separator: month unknown

# Unknown or non-numeric months become 0 instead of crashing the int
# conversion; month 0 is removed below together with dates whose month is '00'.
df_date['Month'] = (
    pd.to_numeric(pd.Series(Month, index=df_date.index), errors='coerce')
      .fillna(0)
      .astype(int)
)

df_date = df_date.groupby('Month')[['Date']].count()

# errors='ignore': no failure when there is no month-0 group to remove
df_date = df_date.drop(index=0, errors='ignore')
df_date = df_date.rename(columns={'Date': 'Observations'})

In [None]:
# Bar chart of the number of observations in each month
ax = df_date.plot(kind='bar', figsize=(8, 5))

ax.set_title('Observations per Month')
ax.set_ylabel('Number of Observations')
ax.set_xlabel('Month')
plt.xticks(rotation=0)   # keep the month labels horizontal

# Remove the legend and the top/right frame lines
ax.get_legend().remove()
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

In [None]:
# Save through the Axes' own Figure. With the inline notebook backend the
# figure is closed when its plotting cell finishes, so plt.savefig() in a
# later cell would write a new, empty figure instead of the bar chart.
ax.get_figure().savefig('months.png')

Generating the figure of the distribution of observations over the day (per hour) for the report

In [None]:
# Number of observations per hour of the day, taken from the 'Time' column.
# The column is selected by name, so the result does not depend on column positions.
df_time = df[['Time']].copy()

# Keep the two-digit hour of well-formed 'HH:MM' values only. Anything else
# ('?', '??:??', free text, ...) becomes NaN and is left out by groupby, so no
# empty or non-numeric "hour" group has to be dropped afterwards.
df_time['Hour'] = (
    df_time['Time']
    .astype(str)
    .str.extract(r'^(\d{2}):\d{2}$', expand=False)
)

df_time = df_time.groupby('Hour')[['Time']].count()

# The counted column is 'Time' (the previous rename looked for 'Date' and
# therefore did nothing)
df_time = df_time.rename(columns={'Time': 'Observations'})

In [None]:
# Bar chart of the number of observations in each hour of the day
ax = df_time.plot(kind='bar', figsize=(8, 5))

ax.set_title('Observations per Hour')
ax.set_ylabel('Number of Observations')
ax.set_xlabel('Hour')
plt.xticks(rotation=0)   # keep the hour labels horizontal

# Remove the legend and the top/right frame lines
ax.get_legend().remove()
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

In [None]:
# Save through the Axes' own Figure. With the inline notebook backend the
# figure is closed when its plotting cell finishes, so plt.savefig() in a
# later cell would write a new, empty figure instead of the bar chart.
ax.get_figure().savefig('hour.png')