In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import folium # plotting library
from folium import plugins

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# Introduction

### Firstly, we will take a quick look at our data in order to understand what we are working with! Then, we will clean/filter it!

### Afterwards, we will apply k-Means clustering in order to identify similar airports based on the number of weather events of each type that occurred at each airport! Plus, we are going to use Principal Component Analysis to visualise the high-dimensional data, so that we can see how our clusters are related in the original space.

### Finally, the clustered airports will be illustrated using the Seaborn library and Folium maps.

# Data Overview

In [None]:
# Load the weather events CSV into `df` and preview its first rows.
weather_events_csv = '../input/us-weather-events/US_WeatherEvents_2016-2019.csv'
df = pd.read_csv(weather_events_csv)

df.head()

In [None]:
# How many events were recorded for each weather type?
event_type_column = df['Type']
event_type_column.value_counts()

In [None]:
# How many events were recorded for each severity label?
severity_column = df['Severity']
severity_column.value_counts()

## Data prep for k-Means clustering

### Let's filter our data, discarding the events that have a severity of 'UNK' or 'Other'.

In [None]:
# Keep only the events whose severity is neither 'UNK' nor 'Other'.
excluded_severities = ['UNK', 'Other']
df = df[~df['Severity'].isin(excluded_severities)]

df.head()

In [None]:
# Only the airport identifier and the event type are needed for clustering.
clustering_columns = ['AirportCode', 'Type']
df_types = df[clustering_columns]

df_types.head()

### Here, we are going to count the occurrences of each event type for each airport!

In [None]:
# One-hot encode the event type, attach the airport code to every event row,
# then sum the indicator columns per airport. The result has one row per
# airport and one count column per event type.
types = (
    pd.get_dummies(df_types['Type'])
    .assign(AirportCode=df_types['AirportCode'])
    .groupby('AirportCode')
    .sum()
    .reset_index()
)

types.head()

# k-Means Clustering

In [None]:
# Keep the airport codes in their own frame so that cluster labels can be
# attached to them later. `.copy()` makes `codes` an independent DataFrame
# instead of a slice of `types`, so adding a column to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
codes = types[['AirportCode']].copy()

# Leave only the per-event-type count columns in `types`; these are the
# features passed to k-Means and PCA below.
types = types.drop(columns='AirportCode')

### In order to identify the optimal number of clusters, we need to use the Elbow Method! When the slope of the tangent line starts to be almost horizontal, that is the optimal number of clusters!

In [None]:
# Fit k-Means for every k in 1..19 and record each model's inertia, which is
# the "distortion" plotted by the elbow method.
K = range(1, 20)
distortions = [
    KMeans(n_clusters=n_clusters, random_state=0, n_init=50, max_iter=500)
    .fit(types)
    .inertia_
    for n_clusters in K
]

In [None]:
# Plot distortion against k; the "elbow" of this curve suggests how many
# clusters to use.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method')
plt.show()

### The elbow method seems to suggest 4 or 5 clusters!

In [None]:
# Run k-means clustering with 4 clusters.
# n_init is pinned to 10, the historical scikit-learn default. Newer releases
# default to 'auto' (a single initialisation for k-means++), which could
# produce different clusters from the ones discussed in the text below.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10).fit(types)

# Attach each airport's cluster label to its code. `assign` returns a new
# frame, so this never writes into a slice of another DataFrame (no
# SettingWithCopyWarning) and the cell can be re-run safely.
codes = codes.assign(cluster=kmeans.labels_)
codes.head()

### I usually apply a dimensionality reduction technique in order to visualise how our clusters are related in the original high-dimensional space! Moreover, it lets us see whether the features of our data are linearly related to each other.

In [None]:
# Fit a PCA with all components on the per-airport event counts and project
# the airports into principal-component space (used for plotting below).
pca = PCA().fit(types)
pca_types = pca.transform(types)

# Report how much of the total variance each component explains.
print("Variance explained by each component (%): ")
for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print("\n", i, "º:", ratio * 100)
print("Total sum (%): ", sum(pca.explained_variance_ratio_) * 100)
# Slice [:2] covers the first TWO components; the previous [0:1] slice only
# included the first one, so the printed figure did not match its label.
print("Explained variance of the first two components (%): ", sum(pca.explained_variance_ratio_[:2]) * 100)

### Since the number of samples is larger than the number of features, we are able to solve for all 5 principal components (PCs), which together explain 100% of the original information.

### We can see that using the first two components we are able to preserve 63,65% of the original information, therefore, reducing the dimensionality of our data.

### Let's use these PC to visualise our clusters!

In [None]:
# Scatter the airports on the first two principal components, one colour per
# k-Means cluster. The list position of each colour is the cluster label it
# is used for.
cluster_colors = ['red', 'blue', 'green', 'black']

plt.figure(figsize=(7, 7))
for cluster_id, color in enumerate(cluster_colors):
    # Boolean-mask selection keeps the result 2-D even when a cluster has no
    # members, so the column indexing below cannot fail on an empty cluster.
    members = pca_types[kmeans.labels_ == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], c=color, label=f'Cluster {cluster_id}')
plt.legend()
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Low dimensional visualization (PCA) - Airports');

### We see that 4 clusters seem to be reasonable for identifying similar samples within our data!

### Let's take a look at the particularities of each cluster using the seaborn library!

In [None]:
# Add each airport's k-Means label as a `cluster` column next to its
# event-type counts.
types = types.assign(cluster=kmeans.labels_)

types.head()

In [None]:
# Mean number of events of each type per cluster.
cluster_profile = types.groupby('cluster').mean()
cluster_profile

In [None]:
# Draw one bar plot per weather event type showing its level in each cluster.
# A single loop replaces five copy-pasted cells that differed only in the
# `y` column.
for event_type in ['Cold', 'Fog', 'Rain', 'Snow', 'Storm']:
    sns.catplot(x='cluster', y=event_type, data=types, kind='bar')

### Looking at these plots, we can see that cluster 0 is the most affected by snow and cold, and cluster 3 is the most affected by rain!

# Folium Maps Visualisation by Number of Occurrences and Clustering

### Firstly, we need to create a map of USA

### We are going to plot two maps: the first one will display each airport with a marker whose size varies according to the number of weather events that occurred at that airport! The second map will show us the clusters that we obtained through k-Means!


In [None]:
# Centre point used for both folium maps in this notebook.
latitude, longitude = 38.5, -95.665

# Base map that the occurrence markers are added to further down.
map_center = [latitude, longitude]
map_USA = folium.Map(location=map_center, zoom_start=4)

map_USA

In [None]:
# Columns needed to place and label each airport on the maps.
airport_columns = ['AirportCode', 'LocationLat', 'LocationLng', 'City', 'State']
airports = df[airport_columns]

airports.head()

In [None]:
# Count how many event rows each airport has and turn the result into a
# two-column frame: AirportCode, Count.
number_of_occurences = (
    airports['AirportCode']
    .value_counts()
    .rename_axis('AirportCode')
    .reset_index(name='Count')
)
number_of_occurences.head()

In [None]:
# Enrich the per-airport counts with the de-duplicated airport rows
# (coordinates, city, state) and then with the cluster labels. Both merges
# join on the columns the two frames have in common.
number_of_occurences = (
    number_of_occurences
    .merge(airports.drop_duplicates())
    .merge(codes)
)

number_of_occurences.head()

In [None]:
# Build a feature group with one circle marker per row of
# `number_of_occurences`.
occurences = folium.map.FeatureGroup()
n_mean = number_of_occurences['Count'].mean()

marker_fields = number_of_occurences[['LocationLat', 'LocationLng', 'Count', 'City', 'State']]
for lat, lng, number, city, state in marker_fields.itertuples(index=False, name=None):
    marker = folium.vector_layers.CircleMarker(
        [lat, lng],
        # Radius is the row's count relative to the mean count, scaled by 5.
        radius=number / n_mean * 5,
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=f'{number},{city},{state}',
    )
    occurences.add_child(marker)

map_USA.add_child(occurences)

### We can see that the airports that registered the greatest number of occurrences are in the north of the West Coast!

### But, in general, the airports that are located far away from the coast suffered less from weather events! However, the state of Colorado seems to be an exception to that :)

### Finally, let's see our clusters!

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=4)

# Build one distinct hex colour per cluster, sized from the fitted model so
# the palette follows the number of clusters actually used.
n_clusters = kmeans.n_clusters
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

# Add one fixed-size marker per row, coloured by its cluster.
for lat, lng, cluster, city, state in zip(number_of_occurences['LocationLat'],
                                          number_of_occurences['LocationLng'],
                                          number_of_occurences['cluster'],
                                          number_of_occurences['City'],
                                          number_of_occurences['State']):
    # Index the palette directly with the 0-based cluster label. The previous
    # `cluster - 1` only worked because -1 wraps to the last colour, and it
    # would give two clusters the same colour as soon as the palette size and
    # the number of clusters differ.
    cluster_color = rainbow[cluster]
    folium.vector_layers.CircleMarker(
        [lat, lng],
        radius=5,
        tooltip=str(city) + ',' + str(state) + '- Cluster ' + str(cluster),
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.9).add_to(map_clusters)

map_clusters

### We can see that the airports in cluster 0, the one most affected by snow events, are mainly located along the northern border of the USA, very close to Canada!

### And here I conclude this notebook by suggesting some future work! Maybe using 5 clusters would give us a better look into our data. It seems to me that cluster 1 is not well defined! Furthermore, a closer investigation of each cluster may show us other differences among them!

### Thank you,
### Lucas