# Exploratory data analysis and visualizations

### Tasks:
<table width="100%" height="90%">
    <tr>
        <td width="10%">
        <img src="https://img.icons8.com/plasticine/2x/task.png" alt="Tasks" width="100" height="100" align="left">
        </td>
        <td style="text-align: left">
        <font size="3">1. Run this jupyter notebook step by step and try to understand what the script does.</font><br>
        <font size="3">2. Find one or more new data sets (e.g. on Kaggle) and replace the car data set.</font><br>
        <font size="3">3. Repeat the exploratory data analysis and visualizations based on the new data.</font><br>
        <font size="3">4. For spatial data analysis, replace the attribute 'residents_per_km2' by a new attribute.</font><br>
        <font size="3">5. Repeat the spatial data exploration based on the new attribute.</font><br>
        <br>
        <font size="3">Save the Jupyter notebook with your solutions as an HTML file and upload it to Moodle.</font>
        </td>
    </tr>
</table>

### Importing the required libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
# Plotting
import seaborn as sns                       # visualisation
import matplotlib.pyplot as plt             # visualisation
# Apply seaborn's theme settings (with color_codes enabled)
sns.set(color_codes=True)

import warnings
# Silence ALL warnings for the rest of the notebook (this also hides deprecation notices)
warnings.filterwarnings("ignore")


### Loading the data into a data frame

In [2]:
# Read the car data set from CSV
# (source: https://www.kaggle.com/CooperUnion/cardataset)
df = pd.read_csv("data.csv")

# Report the dimensions as (rows, columns)
print(df.shape)

# Preview the first five records (df.tail() would show the last five)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

### Checking the data types

In [None]:
# Data type of every column
df.dtypes


### Dropping the irrelevant columns

In [None]:
# Remove the columns that are not used in the analysis below
df = df.drop(
    columns=[
        'Engine Fuel Type',
        'Market Category',
        'Popularity',
        'Number of Doors',
        'Vehicle Size',
    ]
)
df.head()


### Renaming the columns

In [None]:
# Give the remaining columns shorter names (old name -> new name)
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "Vehicle Style": "Vehicle_Style",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
df.head()

### Dropping the duplicate rows

In [None]:
# Dimensions of the data frame as (rows, columns)
df.shape


In [None]:
# Collect every row that repeats an earlier row and report how many there are
duplicate_rows_df = df.loc[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)


In [None]:
# Number of non-missing values per column (before duplicates are removed)
df.count()


In [None]:
# Remove duplicate rows and preview the result
df = df.drop_duplicates()
df.head(5)


In [None]:
# Number of non-missing values per column (after duplicates were removed)
df.count()


### Counting and dropping the missing values

In [None]:
# Number of missing values per column
print(df.isna().sum())


In [None]:
# Drop every row that contains at least one missing value,
# then show the remaining non-missing count per column
df = df.dropna()
df.count()


In [None]:
# Missing values per column after dropping incomplete rows
print(df.isna().sum())


### Showing summary statistics of (cleaned up) variables

In [None]:
# Summary statistics of the cleaned data
df.describe()


### Using boxplots for outlier detection 

In [None]:
# Horizontal boxplot of car prices; 'plain' tick labels avoid scientific notation
plt.figure(figsize=(8, 2))
plt.ticklabel_format(style='plain')
sns.boxplot(data=df, x='Price', color="green")


In [None]:
# Inspect the very expensive cars (price of 500000 or more)
df[df['Price'] >= 500000]


In [None]:
# Horizontal boxplot of horse power (HP)
plt.figure(figsize=(8, 2))
sns.boxplot(data=df, x='HP', color="orange")


In [None]:
# Inspect the cars with very high horse power (600 HP or more)
df[df['HP'] >= 600]


### Calculate quantiles to obtain information about the distribution of each variable 

In [None]:
# Quartiles (25 %, 50 %, 75 %) of every numeric column.
# numeric_only=True skips the text columns (e.g. 'Make'); without it,
# pandas >= 2.0 raises a TypeError instead of silently ignoring them.
Q1 = df.quantile(0.25, numeric_only=True)
Q2 = df.quantile(0.50, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
print(Q1)
print("---------------------------")
print(Q2)
print("---------------------------")
print(Q3)


### Plotting a histogram to show the distribution of a variable

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean and median of HP, truncated to whole numbers
print(f"Mean of HP: {int(np.mean(df['HP']))}")
print(f"Median of HP: {int(np.median(df['HP']))}")

# Histogram of HP with 20 bins
fig = plt.figure(figsize=(10, 5))
plt.xticks(fontsize=14, rotation=0)
plt.yticks(fontsize=14, rotation=0)
n, bins, patches = plt.hist(x=df['HP'], bins=20, color='#42AD12', alpha=1, rwidth=0.95)

# Grid and tick formatting
plt.grid(True)
plt.ticklabel_format(style='plain')
plt.grid(axis='y', alpha=0.75)

# Axis labels and title
plt.xlabel('HP', fontsize=14, labelpad=10)
plt.ylabel('Frequency', fontsize=14, labelpad=10)
plt.title('Histogram of Horse Power', fontsize=16, pad=10)


### Using a density plot to show the distribution of a variable

In [None]:
import seaborn as sns

# Density-scaled histogram of HP with a kernel density estimate on top.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat='density' and kde=True is its replacement. alpha=0.4 and cut=3
# reproduce distplot's default bar transparency and KDE extent.
plt.figure(figsize=(10,5))
sns.histplot(df['HP'],
             bins=20,
             stat='density',
             kde=True,
             color='darkred',
             alpha=0.4,
             edgecolor='black',
             kde_kws={'cut': 3},
             line_kws={'linewidth': 2},
            )
plt.title('Density plot of HP', fontsize=16, pad=10)
plt.xlabel('HP', fontsize=14, labelpad=10)
plt.ylabel('Density', fontsize=14, labelpad=10)
plt.grid(True)
plt.show()


### Plotting a barchart to show the number of observations per category

In [None]:
# Horizontal bar chart of the 20 most frequent makes (largest bar on top)
(
    df['Make']
    .value_counts()
    .nlargest(20)
    .sort_values(ascending=True)
    .plot(kind='barh', width=0.9, figsize=(9, 6), color="darkorange")
)
plt.title("Number of cars by make", fontsize=18, pad=20)
plt.xlabel('Number of cars', fontsize=14, labelpad=10)
plt.ylabel('Make', fontsize=14, labelpad=10)


### Using a scatterplot to explore the relationship between two variables

In [None]:
# Scatterplot of price against horse power, with plain (non-scientific) tick labels
fig, ax = plt.subplots(figsize=(10, 6))
ax.ticklabel_format(style='plain')
ax.scatter(df['HP'], df['Price'], color="green", alpha=1.0)
ax.set(xlabel='HP', ylabel='Price')
plt.show()


In [None]:
# Scatterplot of horse power against the number of cylinders
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Cylinders'], df['HP'], color="darkblue", alpha=1.0)
ax.set(xlabel='Cylinders', ylabel='HP')
plt.show()


### Using a scatterplot-matrix to explore the relationships between more than two variables

In [None]:
import seaborn as sns
sns.set(style="ticks")

# Scatterplot matrix: scatterplots off the diagonal, histograms on the diagonal
g = sns.PairGrid(
    df[['Price', 'HP', 'Cylinders', 'MPG-H', 'MPG-C']],
    height=1.9,
    aspect=1,
)
g.map_upper(sns.scatterplot, color='darkblue')
g.map_lower(sns.scatterplot, color='darkblue')
g.map_diag(plt.hist, color='brown')


### Using a heat map to show the relationships between more than two variables

In [None]:
plt.figure(figsize=(10,5))
# Correlation matrix of the numeric columns only. The frame still contains
# text columns (e.g. 'Make'); pandas >= 2.0 raises on them instead of
# silently dropping them, so they are filtered out explicitly.
c = df.select_dtypes(include="number").corr()
# Annotated heat map of the correlation coefficients
sns.heatmap(c,cmap="BrBG",annot=True)
c


### Using bubble plots to visualize data

In [None]:
# Bubble plot: price against horse power, marker size scaled by HP and
# marker colour given by the number of cylinders.
# (The previously built cubehelix colour map was never passed to the plot
# and has been removed; the 'Set2' palette below is what colours the points.)
plt.figure(figsize=(12,5))
plt.ticklabel_format(style='plain')
ax = sns.scatterplot(x="HP", 
                     y="Price",
                     size="HP",
                     hue="Cylinders",
                     palette="Set2",
                     data=df)


### Radar chart to visualize data

In [None]:
# Libraries
import plotly.express as px
import pandas as pd

# Mean horse power per vehicle style. Grouping by the column label keeps the
# grouping column out of the aggregated values and makes it the index.
df_sub    = df[['Vehicle_Style','HP']]
df_sub_02 = df_sub.groupby('Vehicle_Style').mean()

# Closed polar line chart: one spoke per vehicle style, radius = mean HP
fig = px.line_polar(df_sub_02, r='HP', 
                    theta=df_sub_02.index, 
                    line_close=True
                   )
fig.update_traces(fill='toself')
fig.update_layout(width=500,height=500)
fig.show()


## Exploring spatial data

### Importing and exploring polygon-map in geojson format

In [None]:
import os
# Import geopandas under its own alias; binding it to 'pd' would overwrite
# the pandas alias used throughout the rest of the notebook.
import geopandas as gpd

# Polygonmap as .json-File
polys = gpd.read_file("GEN_A4_GEMEINDEN_2019_epsg4326.json")

# Show data structure and first records
print("nrows, ncols", polys.shape)
print("-------------------------------------------------------")
print("Type:", type(polys))


In [None]:
# Display the polygon table (a GeoDataFrame, which includes the polygon geometry)
polys


#### Plotting the map

In [None]:
import folium

# Base map centred on [47.44, 8.65]
m = folium.Map(location=[47.44, 8.65], zoom_start=10)

# Overlay all polygons as a single yellow layer
folium.Choropleth(geo_data=polys, name='polys', fill_color='yellow').add_to(m)

# Layer switcher
folium.LayerControl().add_to(m)

# Display the map
m


#### Plotting a subset of the map

In [None]:
# Index label of the first polygon named 'Winterthur'
idx = polys[polys['NAME'] == 'Winterthur'].index[0]
# idx is an index label, so select with label-based .loc (positional .iloc
# only gives the same row while the index is the default 0..n-1 range)
polys.loc[[idx]]


In [None]:
# Plot subset of map
import folium 
import numpy as np

# Initializing the map
m     = folium.Map(location=[47.44, 8.65], zoom_start=10)

# Map settings: draw only the selected polygon.
# idx is an index label, so the row is selected with label-based .loc
# rather than positional .iloc.
folium.Choropleth(
    geo_data=polys.loc[[idx]],
    name='polys',
    fill_color='yellow'
).add_to(m)

folium.LayerControl().add_to(m)

# Plot map
m


### Importing and exploring attribute data

In [None]:
import pandas as pd

# Read the municipality attributes from the Excel workbook
data = pd.read_excel('municipalities_kt_zh_data.xlsx')

# Confirm the object type, then preview the first five records
print(type(data))

data.head()


In [None]:
# Summary statistics of the attribute data
data.describe()


### Scatterplot matrix of attribute data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

# Scatterplot matrix of three municipality attributes:
# scatterplots off the diagonal, histograms on the diagonal
g = sns.PairGrid(
    data[['residents', 'percentage_foreigners', 'residents_per_km2']],
    height=2.5,
    aspect=1,
)
g.map_upper(sns.scatterplot, color='darkgreen')
g.map_lower(sns.scatterplot, color='darkgreen')
g.map_diag(plt.hist, color='orange')


### Using a choropleth map to explore the spatial pattern of a variable

In [None]:
import folium 
from folium.folium import Map
import pandas as pd
import branca.colormap as cm
from IPython.display import HTML

# Function for hiding original legend
def folium_del_legend(choropleth: "folium.Choropleth"):
    """Remove the colour-map legend entries from a choropleth layer.

    Every entry of ``choropleth._children`` whose key starts with
    ``'color_map'`` is deleted in place.

    Args:
        choropleth: The choropleth layer to strip the legend from.

    Returns:
        The same ``choropleth`` object, also when it had no legend entry.
    """
    # Collect the keys first so the mapping is not modified while iterating
    legend_keys = [key for key in choropleth._children if key.startswith('color_map')]
    for key in legend_keys:
        del choropleth._children[key]
    return choropleth

# Geojson and data
polys = 'GEN_A4_GEMEINDEN_2019_epsg4326.json'
data  = pd.read_excel('municipalities_kt_zh_data.xlsx')

# Bins for color-range of the map: minimum, quartiles and maximum of the attribute
bins = list(data['residents_per_km2'].quantile([0.00, 0.25, 0.50, 0.75, 1.00]))

# Initializing the map
m = folium.Map(tiles='OpenStreetMap', location=[47.44, 8.65], zoom_start=10)

# Map-Settings: polygons are joined to the data via the 'BFS' key.
# The legend names the attribute that is actually mapped (population density),
# not the absolute number of residents.
folium.Choropleth(
        geo_data=polys,
        name='choropleth',
        data=data,
        columns=['BFS', 'residents_per_km2'],
        key_on='feature.properties.BFS',
        fill_color='RdGy',
        fill_opacity=0.7,
        line_opacity=0.5,
        legend_name='Residents per km2',
        bins=bins,
        reset=True
).add_to(m)

# Layer controls
folium.LayerControl(collapsed=True).add_to(m)

# Plot map
m

# Save map as HTML
# m.save('map.html')


### Further readings and tips

#### Seaborn:
https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html

#### Matplotlib:
https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

#### Exploratory data analysis
https://towardsdatascience.com/15-data-exploration-techniques-to-go-from-data-to-insights-93f66e6805df

#### Spatial data and maps:
https://ipyleaflet.readthedocs.io/en/latest/

https://python-visualization.github.io/folium/quickstart.html

https://deparkes.co.uk/2016/06/10/folium-map-tiles/

https://nbviewer.jupyter.org/gist/talbertc-usgs/18f8901fc98f109f2b71156cf3ac81cd

https://www.nagarajbhat.com/post/folium-visualization

https://ocefpaf.github.io/python4oceanographers/blog/2015/03/23/wms_layers/