# Plotting and Visualization

In [1]:
%matplotlib notebook

## A Brief matplotlib API primer

In [2]:
import matplotlib.pyplot as plt

In [3]:
import numpy as np

In [4]:
data = np.arange(10)

In [5]:
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
plt.plot(data)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x190f276c898>]

While libraries like seaborn and pandas's built-in plotting functions deal with many of the mundane details of making plots, should you wish to customize them beyond the function options provided, you will need to learn a bit about the matplotlib API.

### Figures and Subplots

Plots in matplotlib reside within a __Figure__ object. You can create a new figure with plt.figure:

In [7]:
# Create an empty figure and add three subplots on a 2x2 grid
# (positions 1, 2 and 3; the fourth slot is left unused).
fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)

# Draw on ax3 explicitly. The original used plt.plot, which targets whichever
# subplot was created last (here ax3); naming the Axes keeps this line
# consistent with the ax1/ax2 calls below and independent of pyplot state.
ax3.plot(np.random.randn(50).cumsum(), 'k--')
# Semi-transparent black histogram of 100 normal draws in the top-left panel.
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
# Noisy linear trend as a scatter plot in the top-right panel.
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x190f2808940>

In [8]:
# Build a 2x3 grid of subplots in one call; `axes` is a 2-D NumPy array of Axes objects.
fig, axes = plt.subplots(2,3)
# Draw a 20-bin, semi-transparent black histogram of 100 normal draws in row 0, column 1.
axes[0,1].hist(np.random.randn(100), bins=20, color = 'k', alpha = 0.3)

<IPython.core.display.Javascript object>

(array([ 1.,  0.,  0.,  0.,  2.,  1.,  1.,  2.,  3.,  6.,  5.,  7.,  9.,
        15., 14.,  8., 12.,  4.,  7.,  3.]),
 array([-3.93574431, -3.63957768, -3.34341106, -3.04724443, -2.75107781,
        -2.45491118, -2.15874455, -1.86257793, -1.5664113 , -1.27024467,
        -0.97407805, -0.67791142, -0.38174479, -0.08557817,  0.21058846,
         0.50675509,  0.80292171,  1.09908834,  1.39525497,  1.69142159,
         1.98758822]),
 <a list of 20 Patch objects>)

In [9]:
axes

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000190F278AB00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000190F2876080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000190F289C5F8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000190F28C2B70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000190F28F6128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000190F291D6A0>]],
      dtype=object)

Here the axes can be easily indexed like a two-dimensional array; for example, axes[0,1].

#### Adjusting the spacing around subplots

In [10]:
# 2x2 grid of histograms whose panels share both the x and the y axis.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
# axes.flat walks the grid row by row, the same order as nested i/j loops.
for panel in axes.flat:
    panel.hist(np.random.randn(500), bins=50, color='k', alpha=0.5)
# Remove all horizontal and vertical padding between the panels.
plt.subplots_adjust(wspace=0, hspace=0)

<IPython.core.display.Javascript object>

### Colors, Markers, and Line Styles

In [11]:
# plt.plot? # call the help for the plot function

In [12]:
from numpy.random import randn
# NOTE: this rebinds the name ax2, which earlier referred to a subplot of the 2x2 figure.
fig2, ax2 = plt.subplots()
# Format string 'ko--': black ('k'), circle markers ('o'), dashed line ('--').
ax2.plot(randn(30).cumsum(), 'ko--')
# Equivalent explicit form: ax2.plot(randn(30).cumsum(), color = 'k', linestyle = 'dashed', marker = 'o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x190f29784e0>]

In [13]:
# Plot the same random walk twice on one Axes: once with the default
# point-to-point drawing and once with the 'steps-post' drawstyle.
data = np.random.randn(30).cumsum()
fig3, ax3 = plt.subplots()
for line_kwargs in (
    {'label': 'Default'},
    {'drawstyle': 'steps-post', 'label': 'steps-post'},
):
    ax3.plot(data, 'k--', **line_kwargs)
# ax3 is the current Axes, so this is the same legend plt.legend would build.
ax3.legend(loc='best')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x190f2c47470>

### Ticks, Labels, and Legends

#### Setting the title, axis labels, ticks, and ticklabels

In [14]:
# Single-panel figure holding a 1000-step random walk; the following cells
# customize its ticks, tick labels, title and x-axis label through `ax`.
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(np.random.randn(1000).cumsum())

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x190f2c54358>]

In [15]:
# Place x-axis ticks at these five data positions; assigning the result keeps the cell from echoing it.
ticks = ax.set_xticks([0,250,500,750,1000])

In [16]:
# Replace the numeric tick labels with text, one label per tick set above, rotated 30 degrees in a small font.
labels = ax.set_xticklabels(['one', 'two','three','four','five'], rotation=30, fontsize='small')

In [17]:
ax.set_title('My first matplotlib plot')

Text(0.5, 1.0, 'My first matplotlib plot')

In [18]:
ax.set_xlabel('Stages')

Text(0.5, 0, 'Stages')

#### Adding legends

In [19]:
from numpy.random import randn

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# One 1000-step random walk per (format string, legend label) pair.
for line_fmt, line_label in [('k', 'one'), ('k--', 'two'), ('k.', 'three')]:
    ax.plot(randn(1000).cumsum(), line_fmt, label=line_label)
# loc tells matplotlib where to place the legend; 'best' lets it choose.
ax.legend(loc='best')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x190f2cc1c50>

### Annotations and Drawing on a Subplot

In addition to the standard plot types, you may wish to draw your own plot annotations, which could consist of text, arrows, or other shapes. You can add annotations and text using the text, arrow, and annotate functions. text draws text at given coordinates (x, y) on the plot with optional custom styling.
Annotations can draw both text and arrows arranged appropriately. 

In [20]:
from datetime import datetime
import pandas as pd

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Read the CSV with its first column parsed as a date index, then take the
# 'SPX' column as the series to plot.
data = pd.read_csv('../examples/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

spx.plot(ax=ax, style='k-')

# (date, label) pairs for the events to annotate on the price curve.
crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy'),
]

for date, label in crisis_data:
    # asof gives the last available value at or before `date`, so dates that
    # are missing from the index still get a usable y position.
    price = spx.asof(date)
    # The arrow tip sits 75 above the curve and the text 225 above it.
    ax.annotate(label,
                xy=(date, price + 75),
                xytext=(date, price + 225),
                arrowprops=dict(facecolor='black', headwidth=4, width=2,
                                headlength=4),
                horizontalalignment='left', verticalalignment='top')

# Zoom in on 2007-2010. The limits are datetime objects, like the annotation
# coordinates above, rather than 'M/D/YYYY' strings that depend on how the
# axis converter happens to parse text.
ax.set_xlim(datetime(2007, 1, 1), datetime(2011, 1, 1))
ax.set_ylim([600, 1800])

ax.set_title('Important dates in the 2008-2009 financial crisis')

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Important dates in the 2008-2009 financial crisis')

Drawing shapes requires some more care. matplotlib has objects that represent many common shapes, referred to as _patches_. Some of these, like Rectangle and Circle, are found in matplotlib.pyplot, but the full set is located in matplotlib.patches.
To add a shape to a plot, you create the patch object shp and add it to a subplot by calling ax.add_patch(shp):

In [21]:
# Draw three patch shapes (rectangle, circle, triangle) on an empty subplot.
fig = plt.figure()
ax = fig.add_subplot(1,1,1)

# Rectangle: (x, y) anchor point, then width and height.
rect = plt.Rectangle((0.2,0.75),0.4,0.15,color='k',alpha=0.3)
# Circle: centre point, then radius.
circ = plt.Circle((0.7, 0.2), 0.15, color='b',alpha=0.3)
# Polygon: list of vertex coordinates (three vertices here, i.e. a triangle).
pgon = plt.Polygon([[0.15,0.15],[0.35,0.4],[0.2,0.6]],color='g',alpha=0.5)

# A patch only becomes visible once it is attached to an Axes.
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)

<IPython.core.display.Javascript object>

<matplotlib.patches.Polygon at 0x190f3a7d748>

### Saving Plots to Files

You can save the active figure to file using plt.savefig. This method is equivalent to the figure object's savefig instance method. 
To get a plot as PNG with minimal whitespace around the plot and at 400 DPI (dots-per-inch), you would do:

In [22]:
# Write the currently active figure to a PNG at 400 DPI; bbox_inches='tight' trims the whitespace around the plot.
plt.savefig('figpath.png', dpi=400, bbox_inches='tight')

### matplotlib Configuration

matplotlib comes configured with color schemes and defaults that are geared primarily toward preparing figures for publication. One way to modify the configuration programmatically from Python is to use the rc method; for example, to set the global default figure size to be 10x10, you could enter:

In [23]:
# plt.rc('figure', figsize=(10,10))

As we will see in the next section, the seaborn package has several built-in plot themes or styles that use matplotlib's configuration system internally.

## Plotting with pandas and seaborn

matplotlib can be a fairly low-level tool. You assemble a plot from its base components: the data display (i.e., the type of plot: line, bar, box, scatter, contour, etc.), legend, title, ticks, tick labels, and other annotations.
In __pandas__ we may have multiple columns of data, along with row and column labels. pandas itself has built-in methods that simplify creating visualizations from __DataFrame__ and __Series__ objects. Another library is __seaborn__, a statistical graphics library created by Michael Waskom. Seaborn simplifies creating many common visualization types.

Importing seaborn modifies the default matplotlib color schemes and plot styles to improve readability and aesthetics. Even if you do not use the seaborn API, you may prefer to import seaborn as a simple way to improve the visual aesthetics of general matplotlib plots.

### Line Plots

In [24]:
import numpy as np
import pandas as pd
fig, axes = plt.subplots(1,1) # not shown in the book, but appears to be needed so the Series is drawn on a new figure
# Ten-step random walk indexed 0, 10, ..., 90; the index values become the x axis.
s = pd.Series(np.random.randn(10).cumsum(), index=np.arange(0,100,10))
s.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f3ab2390>

DataFrame's plot method plots each of its columns as a different line on the same subplot, creating a legend automatically:

In [25]:
import numpy as np
import pandas as pd

# Four independent random walks (cumulative sums down each column),
# labelled A-D and indexed 0, 10, ..., 90.
df = pd.DataFrame(
    np.random.randn(10, 4).cumsum(axis=0),
    index=np.arange(0, 100, 10),
    columns=list('ABCD'),
)
# DataFrame.plot draws one line per column and adds the legend itself.
df.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f3afc3c8>

### Bar Plots

In [26]:
# Fixed: pandas was imported under the mistyped alias `pf`, so the `pd` used
# below only resolved because an earlier cell had already imported it.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Two stacked panels: vertical bars on top, horizontal bars below.
# Creating the Axes up front lets each Series plot be aimed at a panel via ax=.
fig, axes = plt.subplots(2, 1)

# Sixteen uniform random values labelled 'a' through 'p'.
data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))
data.plot.bar(ax=axes[0], color='k', alpha=0.7)
data.plot.barh(ax=axes[1], color='k', alpha=0.7)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f3b9f668>

With a DataFrame, bar plots group the values in each row together as a group of bars, side by side, with one bar for each value.

In [27]:
# Fixed: pandas was imported under the mistyped alias `pf`, so the `pd` used
# below only resolved because an earlier cell had already imported it.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 6x4 frame of uniform random values. Naming the column index 'Genus' makes
# that name appear as the header of the displayed table's column labels.
df = pd.DataFrame(np.random.rand(6, 4),
                  index=['one', 'two', 'three', 'four', 'five', 'six'],
                  columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df

Genus,A,B,C,D
one,0.536765,0.280983,0.424908,0.752377
two,0.868054,0.367954,0.420248,0.546151
three,0.729959,0.093607,0.343811,0.353122
four,0.400521,0.652988,0.254777,0.192376
five,0.7286,0.732351,0.06474,0.954074
six,0.05179,0.736037,0.095405,0.398889


In [28]:
df.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f2ca19b0>

In [29]:
df.plot.barh(stacked=True, alpha=0.5)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f3cf8c18>

New example: Tipping dataset - we want to make a stacked bar plot showing the percentage of data points for each party size on each day:

In [43]:
# Load the tipping data and cross-tabulate it: one row per day, one column
# per party size, each cell holding the number of matching records.
tips = pd.read_csv('../examples/tips.csv')
party_counts = pd.crosstab(tips['day'], tips['size'])
party_counts

size,1,2,3,4,5,6
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,1,16,1,1,0,0
Sat,2,53,18,13,1,0
Sun,0,39,15,18,3,1
Thur,1,48,4,5,1,3


In [44]:
# Not many 1- and 6-person parties, so keep only party sizes 2 through 5.
# .loc slices by column label here and includes both endpoints.
party_counts = party_counts.loc[:,2:5]

In [45]:
# Turn counts into per-day fractions: divide every row by its own row total
# so that the values for each day sum to 1.
party_pcts = party_counts.div(party_counts.sum(axis=1), axis='index')
party_pcts

size,2,3,4,5
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,0.888889,0.055556,0.055556,0.0
Sat,0.623529,0.211765,0.152941,0.011765
Sun,0.52,0.2,0.24,0.04
Thur,0.827586,0.068966,0.086207,0.017241


In [46]:
party_pcts.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f6303a58>

In [47]:
plt.close('all')

With data that requires aggregation and summarization before making a plot, using the __seaborn__ package can make things much simpler. Let's look now at the tipping percentage by day with seaborn:

In [48]:
import seaborn as sns
# Add a tip_pct column: the tip divided by (total_bill - tip), i.e. relative to the bill with the tip subtracted.
tips['tip_pct']=tips['tip']/(tips['total_bill']- tips['tip'])
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.063204
1,10.34,1.66,No,Sun,Dinner,3,0.191244
2,21.01,3.5,No,Sun,Dinner,3,0.199886
3,23.68,3.31,No,Sun,Dinner,2,0.162494
4,24.59,3.61,No,Sun,Dinner,4,0.172069


In [49]:
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f63827f0>

In [50]:
plt.close('all')

In [55]:
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190f64cc828>

In [59]:
plt.close('all') # Always close the plot - Otherwise the new plot will be plotted in the old figure

You can switch between different plot appearances using __seaborn.set__

In [60]:
sns.set(style="whitegrid")

### Histograms and Density Plots

In [61]:
tips['tip_pct'].plot.hist(bins=50)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190fb272d68>

In [62]:
plt.close('all') 

In [64]:
tips['tip_pct'].plot.density()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x190fca1aeb8>

### Scatter or Point Plots 

### Facet Grids and Categorical Data

## Other Python Visualization Tools