In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Left disabled: current Jupyter/IPython already renders matplotlib figures inline.
#%matplotlib inline
# Ask the inline backend for the high-resolution 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
# each variable is a column
# each observation is a row
# each type of observational unit is a table

### Bar Charts

In [None]:
# Load the Pokemon dataset (path is relative to the notebook's working directory),
# print its (rows, columns) shape and preview the first 10 rows.
pokemon = pd.read_csv('../data/pokemon.csv')
print(pokemon.shape)
pokemon.head(10)

In [None]:
# Bar chart of the number of rows per generation_id.
# A trailing semicolon (;) suppresses the text repr of the object returned by the plot call.
sns.countplot(data=pokemon, x='generation_id');
#plt.show()

In [None]:
# Use one colour for every bar: the first entry of seaborn's current colour palette.
base_color = sns.color_palette()[0]
sns.countplot(data=pokemon, x='generation_id', color=base_color);
#plt.show();


In [None]:
# Bar chart drawn with Matplotlib directly

In [None]:
# Count each generation once and take BOTH the bar positions and the bar heights
# from that single Series, so every count is paired with its own generation_id.
# Pairing `unique()` with a separate `value_counts(sort=False)` only works when
# the two calls happen to return the generations in the same order, which is not
# guaranteed across pandas versions.
gen_counts = pokemon['generation_id'].value_counts(sort=False)
x = gen_counts.index.to_numpy()  # bar positions: the generation ids
y = gen_counts                   # bar heights: rows per generation
plt.bar(x, y)
plt.xlabel('generation_id')
plt.ylabel('count');

In [None]:
# Static ordering: the bars are drawn in exactly the sequence passed via `order`.
sns.countplot(data=pokemon, x='generation_id', color=base_color, order=[5, 1, 3, 4, 2, 7, 6]);
#plt.show()

In [None]:
# Dynamic ordering of the bars: derive the order from the data instead of hard-coding it.
# value_counts() sorts by descending frequency, so its index lists the
# generations from most to least common.
freq = pokemon['generation_id'].value_counts()
gen_order = freq.index
sns.countplot(data=pokemon, x='generation_id', color=base_color, order=gen_order)
plt.show();

In [None]:
# type_1 has many category labels; tilt the x tick labels by 60 degrees.
# (Alternative kept for reference: a full 90-degree rotation.)
#plt.xticks(rotation=90)
sns.countplot(data=pokemon, x='type_1', color=base_color);
plt.xticks(rotation=60);

In [None]:
# Horizontal bar chart: passing the categorical variable as `y` (instead of `x`)
# puts the categories on the vertical axis and the counts on the horizontal axis.
# Bars are ordered from most to least frequent type.
type_order = pokemon['type_1'].value_counts().index
sns.countplot(data=pokemon, y='type_1', color=base_color, order=type_order);

### Absolute vs. Relative Frequency

In [None]:
# Re-read the same CSV used in the Bar Charts section so this section starts
# from a fresh copy of the data; print its shape and preview 10 rows.
pokemon = pd.read_csv('../data/pokemon.csv')
print(pokemon.shape)
pokemon.head(10)

In [None]:
# pandas.DataFrame.melt()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.melt.html
# Reshape from wide to long: 'id' and 'species' are kept as identifiers, and
# the 'type_1' / 'type_2' columns are stacked into a single 'type' column.
# 'type_level' records which of the two source columns each value came from.
pkmn_types = pokemon.melt(id_vars=['id', 'species'],
                          value_vars=['type_1', 'type_2'],
                          var_name='type_level',
                          value_name='type')
pkmn_types.head(10)

In [None]:
# Frequency of each type over both type columns, most common first.
# value_counts() leaves out missing values by default.
type_counts = pkmn_types['type'].value_counts()
type_counts

In [None]:
# Type labels in the same (descending-frequency) order as type_counts.
type_order = type_counts.index
type_order

In [None]:
# Absolute frequencies: horizontal bars of the type counts, most common at the top.
base_color = sns.color_palette()[0]
sns.countplot(data=pkmn_types, y='type', color=base_color, order=type_order);

In [None]:
# NOTE: despite its name, n_pokemon is the total number of non-missing entries
# in the melted 'type' column (type_1 and type_2 combined), not the number of
# rows in `pokemon`. All proportions below are relative to this total.
n_pokemon = pkmn_types['type'].value_counts().sum()
# type_counts is sorted by descending frequency, so position 0 is the largest count.
max_type_count = type_counts.iloc[0]
# Largest proportion taken by any single type; used as the upper end of the tick range.
max_prop = max_type_count / n_pokemon
max_prop

In [None]:
# Evenly spaced proportions from 0 up to (but not including) max_prop, in steps of 0.02.
tick_props = np.arange(0, max_prop, 0.02)
tick_props

In [None]:
# Text for the tick labels: every proportion rendered with two decimals.
tick_names = [format(v, '0.2f') for v in tick_props]
tick_names

In [None]:
# Relative frequencies: the bars are still drawn in count units, so the ticks are
# positioned at (proportion * total count) while being labelled with the proportion.
sns.countplot(data=pkmn_types, y='type', color=base_color, order=type_order);
plt.xticks(tick_props * n_pokemon, tick_names)  # positions in counts, labels in proportions
plt.xlabel('proportion');

In [None]:
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.text.html
# Horizontal count bars, each annotated with its share of all type entries.
base_color = sns.color_palette()[0]
sns.countplot(data=pkmn_types, y='type', color=base_color, order=type_order);

# The bars are drawn in `type_order`; this loop assumes type_counts is in that
# same order, so the i-th count belongs to the bar at y-position i.
for i in range(type_counts.shape[0]):
    count = type_counts.iloc[i]
    # The axis is in counts, so mark the label explicitly as a percentage
    # (same '%' suffix as the vertical version of this chart).
    pct_string = '{:0.1f}%'.format(100 * count / n_pokemon)
    # Place the text just right of the bar end, vertically centred on the bar.
    plt.text(count + 1, i, pct_string, va='center')

In [None]:
# Widen the figure BEFORE anything is drawn: 'figure.figsize' is only read when
# a figure is created, so setting it after the plot leaves this chart unchanged.
# The value is (width, height) in inches. `plt.rcParams` is matplotlib's global
# rcParams object, so no extra import is needed.
plt.rcParams['figure.figsize'] = 12, 4

base_color = sns.color_palette()[0]
type_counts = pkmn_types['type'].value_counts()
sns.countplot(data=pkmn_types, x='type', color=base_color, order=type_order);

# Rotate the tick labels and get the current tick locations and labels
locs, labels = plt.xticks(rotation=90)

# Look each bar's count up by its tick label and print the percentage
# slightly above the top of the bar, horizontally centred on it.
for loc, label in zip(locs, labels):
    count = type_counts[label.get_text()]
    pct_string = '{:0.1f}%'.format(100 * count / n_pokemon)
    plt.text(loc, count + 2, pct_string, ha='center', color='black')

In [None]:
# Reset matplotlib's rc settings to its built-in defaults, undoing the
# 'figure.figsize' override made in the previous cell.
plt.rcdefaults()

### Counting Missing Data

In [None]:
# Load the sales dataset (path relative to the notebook) and preview the first 10 rows.
sales_data = pd.read_csv('../data/sales_data.csv')
sales_data.head(10)

In [None]:
# (rows, columns) of the sales table.
sales_data.shape

In [None]:
# Boolean frame of the same shape: True wherever a cell is missing.
sales_data.isnull()

In [None]:
# Number of missing values in each column (sum of the True flags per column).
sales_data.isna().sum()

In [None]:
# Missing-value count per column, leaving out Date, Temperature and Fuel_Price.
# NOTE(review): presumably those three are dropped because they have no missing
# values and would only add empty bars to the chart -- confirm against the data.
na_counts = sales_data.drop(['Date', 'Temperature', 'Fuel_Price'], axis=1).isna().sum()
na_counts

In [None]:
# https://seaborn.pydata.org/generated/seaborn.barplot.html
# barplot (not countplot) because the data is already summarised:
# one bar per column name, bar height = that column's missing-value count.
sns.barplot(x=na_counts.index.values, y=na_counts)
plt.xticks(rotation=90)
# Annotate each bar with its count; bars sit at x-positions 0, 1, 2, ...
for i in range(na_counts.shape[0]):
    count = na_counts.iloc[i]
    # Text is anchored by its top edge (va='top') at 300 units above the bar height.
    plt.text(i, count + 300, count, ha='center', va='top')

### Pie Charts

In [None]:
# matplotlib.pyplot.pie(x_data, labels, colors, startangle, counterclock, wedgeprops)
# matplotlib.pyplot.axis(*args, emit=True, **kwargs)
# value_counts() orders the generations from most to least frequent, so the
# wedges are drawn largest-first. startangle=90 with counterclock=False starts
# at the top of the circle and proceeds clockwise.
sorted_counts = pokemon['generation_id'].value_counts()
plt.pie(sorted_counts, labels=sorted_counts.index, startangle=90, counterclock=False)
# Equal scaling on both axes so the pie is drawn as a circle rather than an ellipse.
plt.axis('square');

In [None]:
# Donut chart: same pie as before, but wedgeprops={'width': 0.4} makes each
# wedge a ring of that radial width, leaving the centre empty.
sorted_counts = pokemon['generation_id'].value_counts()
plt.pie(sorted_counts, labels=sorted_counts.index, startangle=90, counterclock=False, wedgeprops={'width': 0.4})
plt.axis('square');

### Histograms

In [None]:
# Re-read the Pokemon CSV for the histogram section; print its shape and preview 10 rows.
pokemon = pd.read_csv('../data/pokemon.csv')
print(pokemon.shape)
pokemon.head(10)

In [None]:
# Histogram of 'speed' with matplotlib's default binning.
plt.hist(data=pokemon, x='speed');

In [None]:
# Summary statistics of 'speed' (useful for choosing sensible bin edges).
pokemon['speed'].describe()

In [None]:
# Same histogram with the number of bins fixed at 20 (edges chosen by matplotlib).
plt.hist(data=pokemon, x='speed', bins=20)

In [None]:
# Explicit bin edges every 5 units starting at 0. np.arange excludes its stop
# value, so the stop is max + 5 to make sure the largest speed is covered.
bins = np.arange(0, pokemon['speed'].max() + 5, 5)
plt.hist(data=pokemon, x='speed', bins=bins)

In [None]:
# seaborn's figure-level distribution plot of 'speed'.
sns.displot(pokemon['speed']);

In [None]:
# seaborn's axes-level histogram of 'speed'.
sns.histplot(pokemon['speed']);

In [None]:
# Side-by-side comparison of two bin widths for 'speed' on one wide figure.
plt.figure(figsize=[20, 5])

# Left panel (subplot 1): 4-unit bins. Right panel (subplot 2): 1-unit bins.
for position, bin_width in enumerate([4, 1], start=1):
    plt.subplot(1, 2, position)  # 1 row, 2 cols, panel number `position`
    # Edges from 0 in steps of bin_width; stop is max + bin_width so the maximum is covered.
    bins = np.arange(0, pokemon['speed'].max() + bin_width, bin_width)
    plt.hist(data=pokemon, x='speed', bins=bins)

### Figures, Axes, and Subplots

In [None]:
# Figure -> Axes -> Plot: build the hierarchy explicitly instead of relying on pyplot's implicit state.
fig = plt.figure()
# The argument of add_axes represents the dimensions [left, bottom, width, height] of the new axes.
# All quantities are in fractions of figure width and height.
ax = fig.add_axes([.125, .125, .775, .755])
# Draw the histogram through the Axes object rather than through plt.hist.
ax.hist(data=pokemon, x='speed');

In [None]:
# Same explicit Figure/Axes setup, this time handing the Axes to seaborn via `ax=`
# so the bar chart is drawn on it.
fig = plt.figure()
ax = fig.add_axes([.125, .125, .775, .755])
base_color = sns.color_palette()[0]
sns.countplot(data=pokemon, x='generation_id', color=base_color, ax=ax);

In [None]:
# Build a 3x4 grid of subplots and label each with its conventional 1-based index.
fig, axes = plt.subplots(3, 4)
axes = axes.flatten()  # 3x4 array -> flat 12-element vector, row by row
for i, subplot_ax in enumerate(axes):
    plt.sca(subplot_ax)  # make this subplot the current Axes
    plt.text(0.5, 0.5, i + 1)  # write its index number at the middle of the Axes

### Choosing a Plot for Discrete Data

In [None]:
# Since data points fall on set values (bar-width), it can help to reduce ambiguity by putting bin edges between the actual values taken by the data.

In [None]:
# Load the die-roll dataset (path relative to the notebook) and preview the first 10 rows.
die_rolls = pd.read_csv('../data/die_rolls.csv')
die_rolls.head(10)

In [None]:
# Two histograms of the same discrete 'Sum' column, differing only in where the bin edges fall.
plt.figure(figsize=[20, 5])

# Both panels get a tick on every integer from 2 to 12.
sum_ticks = np.arange(2, 12 + 1, 1)

edge_sets = [
    # Left panel: edges ON the integers. The stop of 12 + 1.1 makes arange
    # include 13 as the final edge, so 12 gets a bin of its own.
    np.arange(2, 12 + 1.1, 1),
    # Right panel: edges BETWEEN the integers (1.5, 2.5, ..., 12.5).
    np.arange(1.5, 12.5 + 1, 1),
]
for position, bin_edges in enumerate(edge_sets, start=1):
    plt.subplot(1, 2, position)
    plt.hist(data=die_rolls, x='Sum', bins=bin_edges)
    plt.xticks(sum_ticks)

In [None]:
# Edges between the integers again; rwidth=0.7 draws each bar at 70% of its bin
# width, leaving gaps between the bars.
bin_edges = np.arange(1.5, 12.5 + 1, 1)
plt.hist(data=die_rolls, x='Sum', bins=bin_edges, rwidth=0.7)
plt.xticks(np.arange(2, 12 + 1, 1));

### Descriptive Statistics, Outliers and Axis Limits

In [None]:
# Bin edges from 0 up to the maximum height, at an interval of 0.5
# (stop is max + 0.5 because np.arange excludes its stop value).
bins = np.arange(0, pokemon['height'].max() + 0.5, 0.5)

# Histogram over the full range of heights.
plt.hist(data=pokemon, x='height', bins=bins);

In [None]:
# Finer bins (0.2 wide) over the full range of heights.
bins = np.arange(0, pokemon['height'].max() + 0.2, 0.2)
plt.hist(data=pokemon, x='height', bins=bins)

# Restrict the displayed x-range to [0, 6]; the data and bins are unchanged.
plt.xlim((0, 6));
# Equivalent call with two positional arguments:
#plt.xlim(0,6)

### Scales and Transformations

In [None]:
# Two views of 'weight': raw linear axis (left) vs. log-scaled axis with the
# bins still computed on the linear scale (right).
plt.figure(figsize=[20, 5])

# Left panel: default histogram on the original (linear) scale.
plt.subplot(1, 2, 1)
plt.hist(data=pokemon, x='weight');
plt.xlabel('Initial plot with original data')

# Right panel: equal-width linear bins, then the axis is switched to log scale.
plt.subplot(1, 2, 2)
## Bin edges from 0 up to the maximum weight, in steps of 40
bins = np.arange(0, pokemon['weight'].max() + 40, 40)
plt.hist(data=pokemon, x='weight', bins=bins);

## The argument in the xscale() represents the axis scale type to apply.
## The possible values are: {"linear", "log", "symlog", "logit", ...}
## Refer - https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.xscale.html
plt.xscale('log')
plt.xlabel('The x-axis limits NOT are changed. They are only scaled to log-type')

In [None]:
# Summary statistics of 'weight' on the original scale.
pokemon['weight'].describe()

In [None]:
## log10 of every statistic returned by describe(); the min and max rows give
## the exponent range the log-spaced bins need to cover.
## (The 'count' row is log-transformed too and is not meaningful here.)
np.log10(pokemon['weight'].describe())

In [None]:
## Axis transformation
## Bin size: edges at 10**-1, 10**-0.9, ..., 10**3, i.e. equally spaced in log10
## units (stop is 3 + 0.1 because np.arange excludes its stop value).
bins = 10 ** np.arange(-1, 3 + 0.1, 0.1)
plt.hist(data=pokemon, x='weight', bins=bins)

## The argument in the xscale() represents the axis scale type to apply.
## The possible values are: {"linear", "log", "symlog", "logit", ...}
plt.xscale('log')
plt.xlabel('x-axis limits are changed, and scaled to log-type');

In [None]:
## Bin edges equally spaced in log10 units, from 10**-1 up to 10**3
bins = 10 ** np.arange(-1, 3 + 0.1, 0.1)

## Tick positions in a 1-3-10 cycle, labelled with their plain (non-exponent) values
ticks = [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000]
labels = list(map(str, ticks))

plt.hist(data=pokemon, x='weight', bins=bins)
plt.xscale('log')
plt.xticks(ticks, labels);

# Setting ticks in cycles of 1-3-10 or 1-2-5-10 is very useful for base-10 log transforms.
# The xticks must be specified AFTER xscale, because xscale applies its own built-in tick settings.

In [None]:
def sqrt_trans(x, inverse = False):
    """Square-root transformation helper.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When falsy, apply the forward transform (square root).
        When truthy, apply the inverse transform (square).

    Returns
    -------
    ``np.sqrt(x)`` for the forward direction, ``x ** 2`` for the inverse.
    """
    if inverse:
        return x ** 2
    return np.sqrt(x)

## Largest weight expressed on the square-root scale; shared by bins and ticks
max_root_weight = sqrt_trans(pokemon['weight'].max())

## Bin edges of width 1 on the transformed (square-root) scale
bin_edges = np.arange(0, max_root_weight + 1, 1)

## Histogram of the transformed weights
plt.hist(pokemon['weight'].apply(sqrt_trans), bins = bin_edges)

## Tick positions every 10 units on the transformed scale
tick_locs = np.arange(0, max_root_weight + 10, 10)

## Label each tick with the original-scale weight it corresponds to (position squared)
plt.xticks(tick_locs, sqrt_trans(tick_locs, inverse = True).astype(int));

In [None]:
# Waffle Plots

# One alternative univariate plot type that you might see for categorical data is the waffle plot, also known as the square pie chart. 
# While the standard pie chart uses a circle to represent the whole, a waffle plot is plotted onto a square divided into a 10x10 grid. 
# Each small square in the grid represents one percent of the data, and a number of squares are colored by category to indicate total proportions. 

In [None]:
# There's no built-in function for waffle plots in Matplotlib or Seaborn, so we'll need to take some additional steps in order to build one with the tools available

def percentage_blocks(df, var):
    """
    Take as input a dataframe and variable, and return a Pandas series with
    approximate percentage values for filling out a waffle plot.

    Each category first receives the integer part of its percentage; the
    blocks still missing to reach 100 are then handed out one at a time to the
    categories with the largest fractional parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the categorical column.
    var : str
        Name of the categorical column in ``df``.

    Returns
    -------
    pandas.Series
        Integer block counts indexed by category, in ``value_counts()`` order.
        The counts sum to 100 whenever the column has at least one non-missing
        value; an empty or all-missing column yields an empty series.
    """
    # Percentages are taken relative to the number of NON-MISSING values.
    # value_counts() ignores NaN, so dividing by df.shape[0] would make the
    # percentages (and therefore the blocks) sum to less than 100 as soon as
    # the column contains missing data.
    category_counts = df[var].value_counts()
    percentages = 100 * category_counts / category_counts.sum()

    # compute base quotas: integer part = minimum quota
    counts = np.floor(percentages).astype(int)
    decimal = (percentages - counts).sort_values(ascending = False)

    # add in additional counts to reach 100, largest fractional parts first
    rem = 100 - counts.sum()
    for cat in decimal.index[:rem]:
        counts.loc[cat] += 1  # explicit label-based update

    return counts

# NOTE(review): `df` and 'cat_var' are placeholders that are not defined anywhere
# in the visible notebook -- substitute a real DataFrame / categorical column
# before running this cell.
waffle_counts = percentage_blocks(df, 'cat_var')

prev_count = 0
# for each category (in waffle_counts order),
for cat in range(waffle_counts.shape[0]):
    # look the block count up BY POSITION: waffle_counts is indexed by category
    # label, so plain [] with an integer is either a deprecated positional
    # fallback or, for integer labels, a lookup of the wrong entry
    n_blocks = waffle_counts.iloc[cat]
    # get the block indices
    blocks = np.arange(prev_count, prev_count + n_blocks)
    # and put a block at each index's location
    x = blocks % 10 # use mod operation to get ones digit
    y = blocks // 10 # use floor division to get tens digit
    plt.bar(x = x, height = 0.8, width = 0.8, bottom = y)
    prev_count += n_blocks

# aesthetic wrangling
plt.legend(waffle_counts.index, bbox_to_anchor = (1, 0.5), loc = 6)
plt.axis('off')
plt.axis('square');

In [None]:
# Other variants of the waffle plot exist to extend it beyond just displaying probabilities. 
# By associating each square with an amount rather than a percentage, we can use waffle plots to show absolute frequencies instead. 
# This might cause us to end up with something other than 100 squares.

In [None]:
# each box represents five full counts (the division result is truncated to an integer)
# NOTE(review): `df` and 'cat_var' are placeholders that are not defined anywhere
# in the visible notebook -- substitute a real DataFrame / categorical column
# before running this cell.
waffle_counts = (df['cat_var'].value_counts() / 5).astype(int)

prev_count = 0
# for each category (in waffle_counts order),
for cat in range(waffle_counts.shape[0]):
    # look the block count up BY POSITION: waffle_counts is indexed by category
    # label, so plain [] with an integer is either a deprecated positional
    # fallback or, for integer labels, a lookup of the wrong entry
    n_blocks = waffle_counts.iloc[cat]
    # get the block indices
    blocks = np.arange(prev_count, prev_count + n_blocks)
    # and put a block at each index's location
    x = blocks % 10
    y = blocks // 10
    # here the tens digit is the horizontal position and the ones digit the
    # vertical one, so the grid fills column by column
    plt.bar(x = y, height = 0.8, width = 0.8, bottom = x)
    prev_count += n_blocks

# box size legend: an empty outlined box plus a caption explaining its value
plt.bar(7.5, 0.8, 0.8, 2, color = 'white', edgecolor = 'black', lw = 2)
plt.text(8.1, 2.4,'= 5 data points', va = 'center')

# aesthetic wrangling
plt.legend(waffle_counts.index, bbox_to_anchor = (0.8, 0.5), loc = 6)
plt.axis('off')
plt.axis('square');

In [None]:
# https://github.com/gyli/PyWaffle