# 05 Clustering

Due to a non-disclosure agreement (NDA), no data can be displayed.

In [None]:
# loading packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NOTE: a trailing comma after the imported name is only valid inside parentheses
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px

The data was saved in a CSV file.

In [None]:
# Load the feature-selection dataset, then report its dimensions and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/Featureselection03.csv')
print(df.shape)
df.head()

So that everyone is on track with the feature selection, we created another csv file to rate the importance and only use important features for training our models and further analysis.

In [None]:
# Load the table that rates the importance of each feature and preview it
data_log = pd.read_csv(filepath_or_buffer='../data/Capstone_features_Features.csv')
data_log.head()

### Create data frame with important features

Only Features with a feature importance smaller than 3 were selected.

In [None]:
# Collect the names (VarName) of all features rated with F_Imp_new below 3
# NOTE(review): in the cells visible here this list is never used to subset df - confirm whether that is intended
list_imp_feat = data_log.loc[data_log['F_Imp_new'] < 3, 'VarName'].tolist()
len(list_imp_feat)

The feature passage_type is added to the list of important features so that the clusters can be compared with the passage type.

In [None]:
# Add passage_type so the clusters can later be compared with the passage type.
# Guarded so that re-running this cell (or a list that already contains the name)
# does not produce a duplicate entry.
if 'passage_type' not in list_imp_feat:
    list_imp_feat.append('passage_type')

In [None]:
# Preview the first rows of df
df.head()

### Fill and drop NaN

The features V.SLPOG.act.PRC and ME.SFCI.act.gPkWh contain missing values. The EDA showed that it makes sense to fill these with 0.

In [None]:
# Replace missing values in these two columns with 0.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# df[col]: the chained in-place form operates on a temporary object and does not
# update df when pandas Copy-on-Write is active (it raises a FutureWarning in pandas 2.2).
df['V.SLPOG.act.PRC'] = df['V.SLPOG.act.PRC'].fillna(0)
df['ME.SFCI.act.gPkWh'] = df['ME.SFCI.act.gPkWh'].fillna(0)

All other rows with missing values were dropped.

In [None]:
# Remove, in place, every row that still has at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Print a summary of df (columns, non-null counts, dtypes) after filling and dropping missing values
df.info()

For the clustering we wanted to get a better idea of the differences in the Atlantic. Therefore only values with the passage_type Atlantic were taken.

In [None]:
# Keep only the rows whose passage_type is 'Atlantic'
df_atlantic = df.loc[df['passage_type'].eq('Atlantic')]

To be able to use the Entry Date later on, it is saved in a separate pandas Series.

In [None]:
# Keep the EntryDate column aside before it is dropped from df_atlantic.
# Despite its name this is a pandas Series (not a Python list), so it carries
# df_atlantic's row index and is aligned on that index when assigned back later.
list_entrydate = df_atlantic['EntryDate']

All features with object data types are dropped.

In [None]:
# Drop the columns that are not used as clustering features
df_atlantic = df_atlantic.drop(
    columns=['passage_type', 'EntryDate', 'Date_daily', 'Type_daily', 'trip_id']
)

### Perform KMeans clustering

KMeans clustering is based on distance measures. Therefore it is necessary to scale the data.

In [None]:
# Fit a MinMaxScaler on the Atlantic data and rescale every feature with it
scaler = MinMaxScaler()
scaler.fit(df_atlantic)
df_scaled = scaler.transform(df_atlantic)
df_scaled

In [None]:
# Elbow plot: fit KMeans for 1 to 10 clusters and record the
# within-cluster sum of squares (WCSS, exposed by sklearn as inertia_) for each fit
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=150,
        n_init=10,
        random_state=0,
    )
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters to look for the "elbow"
plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In the elbow plot and from the EDA it seems like 5 is a good number of clusters. Therefore a KMeans Clustering with 5 clusters is trained.

In [None]:
# Create the final KMeans model with 5 clusters and fit it to the min-max scaled data (df_scaled)
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=20,
    max_iter=150,
    random_state=0,
    verbose=0,
)
kmeans.fit(df_scaled)

In [None]:
# Cluster label assigned by the fitted model to each row of df_scaled
kmeans.labels_

### Analyze the Clustering

The cluster labels are added to the dataframe.

In [None]:
# Attach the KMeans label of every row as a new 'Cluster' column and preview the result
df_atlantic = df_atlantic.assign(Cluster=kmeans.labels_)
df_atlantic.head()

In [None]:
# Store the cluster label as a string so it is handled as a category rather than a number
df_atlantic = df_atlantic.astype({'Cluster': str})

To get an idea of the differences between the clusters, the mean value of each feature is analysed.

In [None]:
# Mean of every numeric feature per cluster, with 'Cluster' returned as a regular column.
# numeric_only=True keeps this cell runnable when it is re-executed after non-numeric
# columns (such as EntryDate) have been added back to df_atlantic.
cluster_mean = df_atlantic.groupby('Cluster').mean(numeric_only=True).reset_index()

Entry Date is again added to the dataframe.

In [None]:
# Re-attach the previously saved EntryDate values (matched on the row index)
df_atlantic = df_atlantic.assign(EntryDate=list_entrydate)

The clusters were analyzed by comparing them with their positions on the map.

In [None]:
# Plot every Atlantic observation at its GPS position, coloured by KMeans cluster,
# with the entry date shown as text.
# 'Cluster' holds strings, so plotly express treats it as a discrete colour and takes the
# colours from color_discrete_sequence. range_color only affects continuous colour scales
# and was previously built from the max of a string column, so it is not passed here.
fig = px.scatter_mapbox(df_atlantic,
                        lat='V.GPSLAT.act.deg', lon='V.GPSLON.act.deg', color='Cluster', text='EntryDate',
                        width=1000, height=600,
                        title='GUAYAQUIL EXPRESS during observation period',
                        zoom=2,
                        labels={'V.GPSLAT.act.deg': 'Latitude', 'V.GPSLON.act.deg': 'Longitude', 'Cluster': 'Cluster', 'EntryDate': 'Date'},
                        color_discrete_sequence=px.colors.qualitative.Safe)
# Use the OpenStreetMap tiles and apply the title/legend styling
fig.update_layout(mapbox_style="open-street-map",
                  title_font_family="Arial",
                  title_font_color="grey",
                  title_font_size=24,
                  title_x=0.5,
                  legend=dict(title_font_family="Arial",
                              title_font_size=20,
                              title_font_color="grey",
                              font=dict(family="Arial",
                                        size=18,
                                        color="grey")))