# <font color="#49699E" size=40>Visualization & Exploratory Data Analysis</font>

# LEARNING OUTCOMES
# LEARNING MATERIALS


# INTRODUCTION


# ITERATIVE RESEARCH WORKFLOWS: EDA AND BOX'S LOOP


# EFFECTIVE VISUALIZATION

### Guidelines for Effective Visualization


# UNIVARIATE EDA: DESCRIBING AND VISUALIZING DISTRIBUTIONS


## Imports

In [ ]:
import os
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from dcss.plotting import format_axes_commas, custom_seaborn
custom_seaborn()

# Read the filtered subset CSV into `fsdf`, the DataFrame every later cell
# works on, then report its (rows, columns) shape.
subset_path = '../data/vdem/filtered_subset.csv'
fsdf = pd.read_csv(subset_path)
fsdf.shape

In [ ]:
# Unrounded summary statistics for the v2x_egaldem column.
# `egal` is kept as a name because the next cell reuses it.
egal = fsdf['v2x_egaldem']
egal_median, egal_mean, egal_std = egal.median(), egal.mean(), egal.std()
print(f'Median Egalitarian Democracy Score: {egal_median}')
print(f'Mean Egalitarian Democracy Score: {egal_mean}')
print(f'Standard Deviation: {egal_std}')

In [ ]:
# Same three statistics as above, rounded to three decimal places.
# Relies on `egal` from the previous cell.
egal_median = round(egal.median(), 3)
egal_mean = round(egal.mean(), 3)
egal_std = round(egal.std(), 3)
print(f'Median Egalitarian Democracy Score: {egal_median}')
print(f'Mean Egalitarian Democracy Score: {egal_mean}')
print(f'Standard Deviation: {egal_std}')

In [ ]:
# First ten distinct country names, in order of first appearance.
list(fsdf['country_name'].unique()[:10])

In [ ]:
# Ten countries with the most rows in the data.
country_counts = fsdf['country_name'].value_counts()
country_counts.head(10)

In [ ]:
# Ten countries with the fewest rows in the data.
country_counts = fsdf['country_name'].value_counts()
country_counts.tail(10)

### Visualizing Marginal Distributions


#### Count Plots and Frequency Tables for Categorical Variables


In [ ]:
# Horizontal count plot of e_regiongeo with default bar order and labels.
ax = sns.countplot(
    data=fsdf,
    y='e_regiongeo',
    color='darkgray',
)
sns.despine()
plt.show()

In [ ]:
# Same count plot, with bars sorted from most to least frequent region
# and human-readable axis labels.
region_order = fsdf['e_regiongeo'].value_counts().index
ax = sns.countplot(
    data=fsdf,
    y='e_regiongeo',
    color='darkgray',
    order=region_order,
)
sns.despine()
ax.set_xlabel('Number of Observations')
ax.set_ylabel('Geographic Region')
plt.show()

In [ ]:
# Region names listed in code order: the first entry is code 1, the last is
# code 19. The mapping is later used to recode the e_regiongeo column.
_REGION_NAMES = (
    "Western Europe",
    "Northern Europe",
    "Southern Europe",
    "Eastern Europe",
    "Northern Africa",
    "Western Africa",
    "Middle Africa",
    "Eastern Africa",
    "Southern Africa",
    "Western Asia",
    "Central Asia",
    "East Asia",
    "South-East Asia",
    "South Asia",
    "Oceania",  # (including Australia and the Pacific)
    "North America",
    "Central America",
    "South America",
    "Caribbean",  # (including Belize, Cuba, Haiti, Dominican Republic and Guyana)
)

# Integer region code -> region name.
region_strings = dict(enumerate(_REGION_NAMES, start=1))

In [ ]:
# Swap the integer codes in e_regiongeo for their region names; values with
# no entry in `region_strings` are left as they are.
fsdf['e_regiongeo'] = fsdf['e_regiongeo'].replace(region_strings)

In [ ]:
# Count plot of the recoded regions: bars sorted by frequency, no y-axis
# label or left spine, and thousands separators on the x-axis ticks.
region_order = fsdf['e_regiongeo'].value_counts().index  # orders the bars
ax = sns.countplot(
    data=fsdf,
    y='e_regiongeo',
    color='darkgray',
    order=region_order,
)
sns.despine(left=True)
ax.set_xlabel('Number of Observations')
ax.set_ylabel('')
comma_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')  # comma formats x-axis
ax.xaxis.set_major_formatter(comma_formatter)
plt.show()

In [ ]:
# Frequency table behind the count plot: rows per region, most frequent first.
region_counts = fsdf['e_regiongeo'].value_counts()
region_counts

#### Univariate Histograms and Density Estimation


In [ ]:
# Histogram of v2x_egaldem using seaborn's default binning.
ax = sns.histplot(x='v2x_egaldem', data=fsdf)
sns.despine(left=True)  # top and right spines are removed by default
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Count')
count_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.yaxis.set_major_formatter(count_formatter)
plt.show()

In [ ]:
# Histogram of v2x_egaldem with only three bins.
ax = sns.histplot(x='v2x_egaldem', data=fsdf, bins=3)
sns.despine(left=True)  # top and right spines are removed by default
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Count')
count_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.yaxis.set_major_formatter(count_formatter)
plt.show()

In [ ]:
# Histogram of v2x_egaldem with a very narrow bin width of 0.001.
ax = sns.histplot(x='v2x_egaldem', data=fsdf, binwidth=0.001)
sns.despine(left=True)  # top and right spines are removed by default
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Count')
count_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.yaxis.set_major_formatter(count_formatter)
plt.show()

In [ ]:
# Default-binned histogram of v2x_egaldem with a kernel density curve overlaid.
ax = sns.histplot(x='v2x_egaldem', data=fsdf, kde=True)
sns.despine(left=True)  # top and right spines are removed by default
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Count')
count_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.yaxis.set_major_formatter(count_formatter)
plt.show()

In [ ]:
# Histogram of v2x_egaldem with a kernel density curve and a bin width of 4.
# NOTE(review): other cells clamp this variable's axis to 0-1 (plt.xlim(0, 1)),
# so a bin width of 4 is wider than that whole range — confirm this is the
# intended illustration rather than a typo.
egal_scores = fsdf['v2x_egaldem']
ax = sns.histplot(egal_scores, kde=True, binwidth=4)
sns.despine(left=True)  # top and right spines are removed by default
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Count')
count_formatter = mpl.ticker.StrMethodFormatter('{x:,.0f}')
ax.yaxis.set_major_formatter(count_formatter)
plt.show()

#### Marginal Empirical Cumulative Distributions


#### Plotting Empirical Cumulative Distributions


In [ ]:
# Empirical cumulative distribution of v2x_egaldem, x-axis clamped to 0-1.
# displot is figure-level, so the result is named `grid` rather than `ax`.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    kind="ecdf",
    color='darkgray',
)
sns.despine()
grid.set(xlabel='Egalitarian Democracy Index')
plt.xlim(0, 1)
plt.show()

# MULTIVARIATE EDA

### Visualizing Conditional Distributions


#### Conditional Histograms


In [ ]:
# One histogram of v2x_egaldem per value of e_boix_regime, side by side.
# NOTE(review): multiple="dodge" usually only matters when a hue variable is
# set — confirm whether it is needed here.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    col="e_boix_regime",
    multiple="dodge",
)
grid.set(xlabel='Egalitarian Democracy Index')
plt.show()

In [ ]:
# Grayscale colormap shared by the conditional plots and heatmaps below.
grayscale_cmap = sns.cubehelix_palette(
    50,
    hue=0.05,
    rot=0,
    light=0.9,
    dark=0,
    as_cmap=True,
)

# Overlaid histograms of v2x_egaldem, coloured by e_boix_regime.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    hue="e_boix_regime",
    palette=grayscale_cmap,
)
grid.set(xlabel='Egalitarian Democracy Index')
plt.show()

#### Conditional KDE


In [ ]:
# Histograms of v2x_egaldem coloured by e_boix_regime, each with a kernel
# density curve overlaid.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    hue="e_boix_regime",
    kde=True,
    palette=grayscale_cmap,
)
grid.set(xlabel='Egalitarian Democracy Index')
plt.show()

In [ ]:
# Kernel density estimates of v2x_egaldem, one curve per e_boix_regime value.
# The palette is `grayscale_cmap`, defined in an earlier cell; the name `cmap`
# used here before is never defined in this file and raised a NameError.
ax = sns.displot(fsdf, x="v2x_egaldem", hue="e_boix_regime", kind="kde",
                 palette=grayscale_cmap)
ax.set(xlabel='Egalitarian Democracy Index')
plt.show()

#### Conditional Empirical Cumulative Distributions


In [ ]:
# Empirical cumulative distributions of v2x_egaldem, one curve per
# e_boix_regime value, x-axis clamped to 0-1.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    kind="ecdf",
    hue="e_boix_regime",
    palette=grayscale_cmap,
)
sns.despine()
grid.set(xlabel='Egalitarian Democracy Index')
plt.xlim(0, 1)
plt.show()

### Visualizing Joint Distributions


#### Cross Tables 


In [ ]:
# Cross table of observation counts: regions (rows) by e_boix_regime (columns).
ct = pd.crosstab(index=fsdf['e_regiongeo'], columns=fsdf['e_boix_regime'])
# Make the table the cell's last expression so the notebook displays it;
# before, it was computed but never shown or used.
ct

#### Scatter Plots


In [ ]:
# Scatter plot of v2x_polyarchy against v2x_egaldem with fully opaque points;
# both axes clamped to 0-1.
ax = sns.scatterplot(
    data=fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
)
sns.despine()
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Polyarchy Index')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()

In [ ]:
# Same scatter plot with point opacity lowered to 0.1 so that overlapping
# points show up as darker regions.
ax = sns.scatterplot(
    data=fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
    alpha=0.1,
)
sns.despine()
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Polyarchy Index')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()

In [ ]:
# Same scatter plot again with point opacity lowered further, to 0.01.
ax = sns.scatterplot(
    data=fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
    alpha=0.01,
)
sns.despine()
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Polyarchy Index')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()

#### Bivariate Histograms


In [ ]:
# Bivariate histogram of v2x_egaldem and v2x_polyarchy with default bins;
# both axes clamped to 0-1.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
)
sns.despine()
grid.set(xlabel='Egalitarian Democracy Index', ylabel='Polyarchy Index')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()

In [ ]:
# Bivariate histogram with 0.01-wide bins plus rug marks along the axes;
# both axes clamped to 0-1.
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
    binwidth=0.01,
    rug=True,
)
sns.despine()
grid.set(xlabel='Egalitarian Democracy Index', ylabel='Polyarchy Index')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()

#### Bivariate Kernel Density Estimation


In [ ]:
# Bivariate kernel density estimate of v2x_egaldem and v2x_polyarchy, with
# nearly transparent rug marks (alpha 0.01) along the axes.
rug_style = {"alpha": 0.01}
grid = sns.displot(
    fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
    kind="kde",
    rug=True,
    rug_kws=rug_style,
)
sns.despine()
grid.set(xlabel='Egalitarian Democracy Index', ylabel='Polyarchy Index')
plt.show()

#### Line of Best Fit


In [ ]:
# Scatter plot with a fitted regression line: faint gray points (alpha 0.05)
# under a black line.
point_style = {"alpha": 0.05}
line_style = {"color": "black"}
ax = sns.regplot(
    data=fsdf,
    x="v2x_egaldem",
    y="v2x_polyarchy",
    color='darkgray',
    scatter_kws=point_style,
    line_kws=line_style,
)
sns.despine()
ax.set_xlabel('Egalitarian Democracy Index')
ax.set_ylabel('Polyarchy Index')
plt.show()

### Correlation 


In [ ]:
# Pairwise correlations of v2x_libdem with v2x_partipdem and with year,
# using pandas' default correlation method.
libdem = fsdf['v2x_libdem']
corr_libdem_partipdem = libdem.corr(fsdf['v2x_partipdem'])
corr_libdem_year = libdem.corr(fsdf['year'])

print(f'Correlation of v2x_libdem and v2x_partipdem: {corr_libdem_partipdem}')
print(f'Correlation of v2x_libdem and year: {corr_libdem_year}')

In [ ]:
# Recompute the correlations after multiplying year by 100, to compare with
# the values printed for the unscaled year.
# `assign` returns a new DataFrame, so `fsdf` is left untouched. The scaling
# is plain vectorised arithmetic rather than a per-element `apply`.
df_new = fsdf.assign(year_x100=fsdf['year'] * 100)

new_corr_libdem_partipdem = df_new.v2x_libdem.corr(df_new.v2x_partipdem)
new_corr_libdem_year = df_new.v2x_libdem.corr(df_new.year_x100)

print(f'Correlation of v2x_libdem and v2x_partipdem: {new_corr_libdem_partipdem}')
print(f'Correlation of v2x_libdem and year*100: {new_corr_libdem_year}')

#### Correlation Coefficients: Pearson and Spearman


### Correlation Matrices and Heatmaps


In [ ]:
# Pairwise correlation matrix of the five v2x_* index columns.
index_columns = [
    'v2x_polyarchy',
    'v2x_libdem',
    'v2x_partipdem',
    'v2x_delibdem',
    'v2x_egaldem',
]
fsdf_corr = fsdf[index_columns].corr()

In [ ]:
# Heatmap of the full correlation matrix; the colour scale is restricted to
# the 0.9-1 range.
ax = sns.heatmap(
    fsdf_corr,
    vmin=0.9,
    vmax=1,
    cmap=grayscale_cmap,
)
plt.show()

In [ ]:
# Hide the upper triangle (diagonal included) so each pair of indexes is
# drawn only once; cells where the mask is True are not drawn.
all_true = np.ones(fsdf_corr.shape, dtype=bool)
mask = np.triu(all_true)
ax = sns.heatmap(
    fsdf_corr,
    mask=mask,
    vmin=0.9,
    vmax=1,
    cmap=grayscale_cmap,
)
plt.show()

## Visualization with More Informational Density 


### Layering Marginal and Joint Distributions


In [ ]:
# Joint regression plot of v2x_egaldem against v2x_polyarchy, with marginal
# plots on the sides: faint gray points (alpha 0.03) under a black line.
ax = sns.jointplot(data=fsdf, x="v2x_polyarchy", y="v2x_egaldem", kind="reg",
                   color='darkgray',
                   joint_kws={'line_kws': {'color': 'black'},
                              'scatter_kws': {'alpha': 0.03}})
# Render explicitly, as every other plotting cell in this file does.
plt.show()

### Quick Comparisons with Pair Plots


In [ ]:
# Pair plot: every pairwise combination of the five v2x_* index columns.
high_level_indexes = [
    'v2x_polyarchy',
    'v2x_libdem',
    'v2x_partipdem',
    'v2x_delibdem',
    'v2x_egaldem',
]
index_subset = fsdf[high_level_indexes]
grid = sns.pairplot(index_subset)
plt.show()

# CONCLUSION
## Key Points 
