# Part 2 Data exploration by unsupervised learning

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) ## suppress annoying deprecation warnings

import pandas as pd
import seaborn.objects as so
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

from sklearn.preprocessing import StandardScaler

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Map the short column codes of the raw weather table to descriptive names
# (including units) so that plot axes and legends are self-explanatory.
# NOTE(review): 'Precipation_mm' is probably a misspelling of 'Precipitation_mm';
# it is kept unchanged here because other cells or notebooks may rely on this label.
col_rename = {
    # temperatures
    'tavg': 'Temp_Avg_°C',
    'tmax': 'Temp_Max_°C',
    'tmin': 'Temp_Min_°C',
    'dwpt': 'Dew_point_°C',
    # moisture
    'rhum': 'Rel_Humidity_%',
    'prcp': 'Precipation_mm',
    # wind
    'wspd': 'Wind_Speed_kmh',
    'wdir': 'Wind_Direction_°',
    # pressure and general condition
    'pres': 'Air_pressure_hPa',
    'coco': 'Condition',
}

In [None]:
## Reload data
# One pipeline: read the CSV (station ids kept as strings, 'time' parsed as dates),
# drop every row that has a missing value, apply the descriptive column names and
# derive the continent from the part of the timezone that precedes the first '/'.
weather_df = (
    pd.read_csv(
        'global_weather.csv',
        parse_dates=['time'],
        dtype={'wmo': str, 'station': str},
    )
    .dropna()
    .rename(columns=col_rename)
    .assign(Continent=lambda frame: frame["timezone"].str.split('/').str[0])  ## Get continent from timezone column
)

## Dimension reduction by PCA and t-SNE

In [None]:
# Preview the numerical attributes of the weather data; only these are used
# as input for the dimension reduction.
weather_df.select_dtypes("number")

In [None]:
# Features need to be scaled for dimension reduction; set_output keeps the scaled
# result a pandas DataFrame.
scaler = StandardScaler().set_output(transform="pandas")

# Only numerical features can be used. Embedding columns written by an earlier run
# of this cell are excluded, so re-running the cell never feeds old t-SNE/PCA
# coordinates back in as input features.
embedding_cols = ["TSNE1", "TSNE2", "PC1", "PC2"]
weather_features = (
    weather_df
    .select_dtypes(include='number')
    .drop(columns=embedding_cols, errors="ignore")
)

# Scale once and reuse the same scaled data for both methods.
weather_scaled = scaler.fit_transform(weather_features)

## Define and perform dimension reduction
## Template lines: fill in `?` and remove the `#!#` prefix to run them.
#!# weather_d2_tsne = TSNE(n_components=?, perplexity=100, random_state=42).fit_transform(weather_scaled)
#!# weather_d2_pca = PCA(n_components=?).fit_transform(weather_scaled)

## Add the first two coordinates of each embedding as additional columns for easy plotting
weather_df = weather_df.assign(
    TSNE1=weather_d2_tsne[:, 0],
    TSNE2=weather_d2_tsne[:, 1],
    PC1=weather_d2_pca[:, 0],
    PC2=weather_d2_pca[:, 1],
)

## Scatter plots
### (1) simple plot with no additional information

In [None]:
# Scatter plots of the t-SNE and PCA embeddings without any colour or marker mapping.
# NOTE(review): the `#!#` line appears to be an exercise template - fill in `??` and
# remove the `#!#` prefix. Until then no x/y variables are assigned to the plot.
(
    so.Plot(weather_df)
#!# .pair(x=["TSNE1","PC1"], y=["TSNE2","PC2"], cross=??) # pair as cousin of facet (sharing of axes)
    .layout(size=(12, 6))  # overall figure size (width, height)
    .add(so.Dot())  # draw every observation as a dot
)

### (2) After reducing to two dimensions, we can overlay numerical and categorical features to explore potential clusters

In [None]:
# Same paired scatter plots as before, now with a feature of the data mapped to colour.
# NOTE(review): the `#!#` line appears to be an exercise template - choose a column
# for `color=??` and remove the `#!#` prefix. The cell cannot run before that,
# because the template line is the start of the expression.
(
#!# so.Plot(weather_df, color=??)
    .pair(x=["TSNE1","PC1"], y=["TSNE2","PC2"], cross=False)
    .layout(size=(12, 6))
    .add(so.Dot())
)

### (3) How about atmospheric pressure?

Interestingly, the intuitive color assignment for low/high pressure seems to be reversed in the US (https://as1.ftcdn.net/v2/jpg/02/74/36/92/1000_F_274369230_Kvnl7UddxtOPCV0CDMwX6I0y6m1GezDo.jpg) compared to Germany (https://bluewhale.ch/wp-content/uploads/2020/02/Profi-Isobaren-Wetter-Karte.jpg).

In [None]:
# Both embeddings side by side, coloured by air pressure.
(
    so.Plot(weather_df, color="Air_pressure_hPa")
    # cross=False pairs the lists element-wise: TSNE1 vs TSNE2 and PC1 vs PC2
    .pair(x=["TSNE1", "PC1"], y=["TSNE2", "PC2"], cross=False)
#!# .scale(color=so.Continuous(??).tick(upto=10)) ## Again important to choose an intuitive colormap. 
    .add(so.Dot())
    ## If midpoint is white, a dark background is desirable
    .theme({**style.library["dark_background"]})
    .layout(size=(12, 6))
)

## Tackling the overplotting issue in scatter plots

Strategies:
1) smaller markers
2) transparency to show density
3) marker edges 

In [None]:
# t-SNE embedding coloured by average temperature, used to try out the
# overplotting strategies listed above.
# NOTE(review): the `#!#` line appears to be an exercise template - fill in the `??`
# values and remove the `#!#` prefix. Until then no mark is added to the plot.
(
    so.Plot(weather_df, x="TSNE1", y="TSNE2", color="Temp_Avg_°C")
#!#	.add(so.Dot(alpha=??, pointsize=??, edgecolor="black")) # adjusting pointsize, introduce transparency, edges increase visibility (dark, bright)
	.scale(color=so.Continuous("Spectral_r").tick(upto=7)) # "_r" selects the reversed Spectral colormap; at most 7 ticks on the colour legend
	.layout(size=(6, 6))  # square figure
)

## Avoid overplotting by faceting into subgroups

In [None]:
# t-SNE embedding coloured by relative humidity (reversed cividis colormap).
# Semi-transparent small dots with black edges keep dense regions readable.
(
    so.Plot(data=weather_df, x="TSNE1", y="TSNE2", color="Rel_Humidity_%")
    # .facet(??, wrap=4).layout(size=(18, 8))
    .scale(color="cividis_r")
    .add(so.Dot(pointsize=4, alpha=0.3, edgecolor="black"))
)

## Alternative: 2D density and histogram plots (unfortunately not available via the seaborn.objects API)

In [None]:
# 2 x 4 grid of panels: top row uses all observations, bottom row only Europe.
# Shared x/y axes make the two rows directly comparable.
f, axes = plt.subplots(2, 4, figsize=(18, 6), sharex=True, sharey=True)

sns.scatterplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0,0], alpha=0.05)	## Scatter plot with very low alpha
sns.histplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0,1])					## Tiles showing density (low resolution)
sns.kdeplot(data=weather_df, x="TSNE1", y="TSNE2", ax=axes[0,2])					## Isoclines of density (high resolution)
#!# sns.kdeplot(data=weather_df ,x="TSNE1", y="TSNE2", ax=axes[0,3], fill=True, thresh=0, levels=??, cmap="Blues",)	## Filled density (high resolution) 
## try out high number of levels -> what is maximum number you can discriminate before blurring out?


## For a single continent for comparison
# Boolean indexing keeps only the European rows. DataFrame.where would instead keep
# every row and merely blank the non-matching ones with NaN, so it is not a filter.
europe_df = weather_df.loc[weather_df["Continent"] == "Europe"]

sns.scatterplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1,0], alpha=0.05)
sns.histplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1,1])
sns.kdeplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1,2])
# Trailing semicolon suppresses the Axes repr below the figure.
sns.kdeplot(data=europe_df, x="TSNE1", y="TSNE2", ax=axes[1,3], fill=True, thresh=0, levels=10, cmap="Blues");

## Analysis of clusters

In [None]:
# K-means clustering on the t-SNE embedding (weather_d2_tsne), which is
# standardised again with a fresh StandardScaler before fitting.
# NOTE(review): the `#!#` line appears to be an exercise template - choose a value
# for `n_clusters=??` and remove the `#!#` prefix. The indented lines below are the
# continuation of that call, so the cell cannot run before the template is filled in.
#!# clustering = KMeans(n_clusters=??, random_state=42).fit(
	StandardScaler().fit_transform(
		weather_d2_tsne
			)
		)

# Display the cluster label assigned to each observation
clustering.labels_

In [None]:
## Add Cluster labels to dataframe for plotting; stored as strings, i.e. as category names rather than numbers
weather_df = weather_df.assign(Cluster=list(map(str, clustering.labels_)))

# Try out visualizing the cluster assignment by color and the continent by marker
(
    so.Plot(
        weather_df,
        x="TSNE1",
        y="TSNE2",
        color="Cluster",
        marker="Continent",
    )
    .scale(color="pastel")  # Important to use a colormap for categorical data
    .add(so.Dot(pointsize=6, alpha=0.8, edgecolor="black"))
)

### Let's have a look at cluster 0 to see whether it is enriched with European cities

In [None]:
# Scatter plot restricted to the observations whose Cluster label is "0",
# with the continent mapped to a visual property.
# NOTE(review): the `#!#` line appears to be an exercise template - replace `??` with
# the property to map (the hint suggests marker or color) and remove the `#!#` prefix.
# The cell cannot run before that, because the template line starts the expression.
(
#!# so.Plot(weather_df.loc[weather_df["Cluster"]== "0",:], x="TSNE1", y="TSNE2", ??="Continent")  # try marker and color
    .add(so.Dot(alpha=0.5, pointsize=10, edgecolor="black"))	
	.scale(color="Set2")
)

# Homework and discussion: 
### Looking at the clustering in the t-SNE plots: how can you check and plot the assignment of cities to clusters? Are cities always in the same cluster?
### What is driving the cluster assignment? Regions, day/night, weather ...