Notebook hyperlinks:

* [Week 2](#Basic-Plotting-with-matplotlib)
* [Week 3](#Subplots)
* [Week 4](#Pandas-Visualization)

Texts in different colours

<font color=blue>Text</font>
<font color=red>Text</font>
<font color=green>Text</font>
<font color=pink>Text</font>
<font color=yellow>Text</font>
<font color=orange>Text</font>
<font color=purple>Text</font>

# Basic Plotting with matplotlib

You can show matplotlib figures directly in the notebook by using the `%matplotlib notebook` and `%matplotlib inline` magic commands. 

`%matplotlib notebook` provides an interactive environment.
`%matplotlib inline` makes a new graph each time. 

In [61]:
%matplotlib notebook

In [2]:
import matplotlib as mpl
mpl.get_backend()

'nbAgg'

In [62]:
import matplotlib.pyplot as plt
plt.plot?

In [5]:
# because the default is the line style '-', 
# nothing will be shown if we only pass in one point (3,2)
plt.plot(3, 2)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1bd73df4588>]

In [6]:
# we can pass in '.' to plt.plot to indicate that we want
# the point (3,2) to be indicated with a marker '.'
plt.plot(3, 2, '.')

[<matplotlib.lines.Line2D at 0x1bd73eb9ac8>]

Let's see how to make a plot without using the scripting layer.

In [7]:
# First let's set the backend without using mpl.use() from the scripting layer
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure

# create a new figure
fig = Figure()

# associate fig with the backend
canvas = FigureCanvasAgg(fig)

# add a subplot to the fig
ax = fig.add_subplot(111)

# plot the point (3,2)
ax.plot(3, 2, '.')

# save the figure to test.png
# you can see this figure in your Jupyter workspace afterwards by going to
# https://hub.coursera-notebooks.org/
canvas.print_png('test.png')

We can use html cell magic to display the image.

In [8]:
%%html
<img src='test.png' />

In [9]:
# create a new figure
plt.figure()

# plot the point (3,2) using the circle marker
plt.plot(3, 2, 'o')

# get the current axes
ax = plt.gca()

# Set axis properties [xmin, xmax, ymin, ymax]
ax.axis([0,6,0,10])

<IPython.core.display.Javascript object>

[0, 6, 0, 10]

In [10]:
# create a new figure
plt.figure()

# plot the point (1.5, 1.5) using the circle marker
plt.plot(1.5, 1.5, 'o')
# plot the point (2, 2) using the circle marker
plt.plot(2, 2, 'o')
# plot the point (2.5, 2.5) using the circle marker
plt.plot(2.5, 2.5, 'o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1bd74311b70>]

In [11]:
# get current axes; axes contains all the plot elements
ax = plt.gca()
# get all the child objects the axes contains
ax.get_children()

[<matplotlib.lines.Line2D at 0x1bd743117f0>,
 <matplotlib.lines.Line2D at 0x1bd7435d518>,
 <matplotlib.lines.Line2D at 0x1bd74311b70>,
 <matplotlib.spines.Spine at 0x1bd7433f780>,
 <matplotlib.spines.Spine at 0x1bd7433f358>,
 <matplotlib.spines.Spine at 0x1bd7433f400>,
 <matplotlib.spines.Spine at 0x1bd7433f080>,
 <matplotlib.axis.XAxis at 0x1bd7433f320>,
 <matplotlib.axis.YAxis at 0x1bd7433ffd0>,
 Text(0.5, 1.0, ''),
 Text(0.0, 1.0, ''),
 Text(1.0, 1.0, ''),
 <matplotlib.patches.Rectangle at 0x1bd74341400>]

# Scatterplots

In [12]:
import numpy as np

x = np.array([1,2,3,4,5,6,7,8])
y = x

plt.figure()
plt.scatter(x, y) # similar to plt.plot(x, y, '.'), but the underlying child objects in the axes are not Line2D

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1bd73d58828>

In [26]:
# gets the current children; the axes elements don't contain Line2D but contain a collection of points called Collections
plt.gca().get_children()


[<matplotlib.collections.PathCollection at 0x1bd75770390>,
 <matplotlib.collections.PathCollection at 0x1bd75770710>,
 <matplotlib.spines.Spine at 0x1bd7574d9b0>,
 <matplotlib.spines.Spine at 0x1bd7574dac8>,
 <matplotlib.spines.Spine at 0x1bd7574dbe0>,
 <matplotlib.spines.Spine at 0x1bd7574dcf8>,
 <matplotlib.axis.XAxis at 0x1bd7574d940>,
 <matplotlib.axis.YAxis at 0x1bd75755128>,
 Text(0.5, 1.0, 'Relationship between ball kicking and grades'),
 Text(0.0, 1.0, ''),
 Text(1.0, 1.0, ''),
 <matplotlib.legend.Legend at 0x1bd757a81d0>,
 <matplotlib.patches.Rectangle at 0x1bd7575ad68>]

In [18]:
import numpy as np

x = np.array([1,2,3,4,5,6,7,8])
y = x

# create a list of colors for each point to have
# ['green', 'green', 'green', 'green', 'green', 'green', 'green', 'red']
colors = ['green']*(len(x)-1) # each of the points except for the last one is green
colors.append('red')

plt.figure()

# plot the point with size 100 and chosen colors
plt.scatter(x, y, s=100, c=colors)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1bd74276278>

In [16]:
# convert the two lists into a list of pairwise tuples
zip_generator = zip([1,2,3,4,5], [6,7,8,9,10])

print(list(zip_generator))
# the above prints:
# [(1, 6), (2, 7), (3, 8), (4, 9), (5, 10)]

zip_generator = zip([1,2,3,4,5], [6,7,8,9,10])
# The single star * unpacks a collection into positional arguments, i.e. it removes the elements from the list and shows 
# them as tuples with each tuple being a positional argument
print(*zip_generator)
# the above prints:
# (1, 6) (2, 7) (3, 8) (4, 9) (5, 10)

[(1, 6), (2, 7), (3, 8), (4, 9), (5, 10)]
(1, 6) (2, 7) (3, 8) (4, 9) (5, 10)


In [17]:
# use zip to convert 5 tuples with 2 elements each to 2 tuples with 5 elements each
print(list(zip((1, 6), (2, 7), (3, 8), (4, 9), (5, 10))))
# the above prints:
# [(1, 2, 3, 4, 5), (6, 7, 8, 9, 10)]


zip_generator = zip([1,2,3,4,5], [6,7,8,9,10])
# let's turn the data back into 2 lists
x, y = zip(*zip_generator) # This is like calling zip((1, 6), (2, 7), (3, 8), (4, 9), (5, 10))
print(x)
print(y)
# the above prints:
# (1, 2, 3, 4, 5)
# (6, 7, 8, 9, 10)

[(1, 2, 3, 4, 5), (6, 7, 8, 9, 10)]
(1, 2, 3, 4, 5)
(6, 7, 8, 9, 10)


In [21]:
plt.figure()
# plot a data series 'Tall students' in red using the first two elements of x and y
plt.scatter(x[:2], y[:2], s=100, c='red', label='Tall students')
# plot a second data series 'Short students' in blue using the last three elements of x and y 
plt.scatter(x[2:], y[2:], s=100, c='blue', label='Short students')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1bd75770710>

In [22]:
# add a label to the x axis
plt.xlabel('The number of times the child kicked a ball')
# add a label to the y axis
plt.ylabel('The grade of the student')
# add a title
plt.title('Relationship between ball kicking and grades')

Text(0.5, 1.0, 'Relationship between ball kicking and grades')

In [23]:
# add a legend (uses the labels from plt.scatter)
plt.legend()

<matplotlib.legend.Legend at 0x1bd757a8f28>

In [24]:
# add the legend to loc=4 (the lower right hand corner), also gets rid of the frame and adds a title
plt.legend(loc=4, frameon=False, title='Legend')

<matplotlib.legend.Legend at 0x1bd757a81d0>

In [25]:
# get children from current axes (the legend is the second to last item in this list)
plt.gca().get_children()

[<matplotlib.collections.PathCollection at 0x1bd75770390>,
 <matplotlib.collections.PathCollection at 0x1bd75770710>,
 <matplotlib.spines.Spine at 0x1bd7574d9b0>,
 <matplotlib.spines.Spine at 0x1bd7574dac8>,
 <matplotlib.spines.Spine at 0x1bd7574dbe0>,
 <matplotlib.spines.Spine at 0x1bd7574dcf8>,
 <matplotlib.axis.XAxis at 0x1bd7574d940>,
 <matplotlib.axis.YAxis at 0x1bd75755128>,
 Text(0.5, 1.0, 'Relationship between ball kicking and grades'),
 Text(0.0, 1.0, ''),
 Text(1.0, 1.0, ''),
 <matplotlib.legend.Legend at 0x1bd757a81d0>,
 <matplotlib.patches.Rectangle at 0x1bd7575ad68>]

In [27]:
# get the legend from the current axes
legend = plt.gca().get_children()[-2]

In [38]:
# you can use get_children to navigate through the child artists
legend.get_children()[0].get_children()[1].get_children()[0].get_children()

[<matplotlib.offsetbox.HPacker at 0x1bd7579fb00>,
 <matplotlib.offsetbox.HPacker at 0x1bd7579fac8>]

In [39]:
# import the artist class from matplotlib
from matplotlib.artist import Artist

def rec_gc(art, depth=0):
    if isinstance(art, Artist):
        # increase the depth for pretty printing
        print("  " * depth + str(art))
        for child in art.get_children():
            rec_gc(child, depth+2)

# Call this function on the legend artist to see what the legend is made up of
rec_gc(plt.legend())

Legend
    <matplotlib.offsetbox.VPacker object at 0x000001BD75C04CC0>
        <matplotlib.offsetbox.TextArea object at 0x000001BD75C04B70>
            Text(0, 0, '')
        <matplotlib.offsetbox.HPacker object at 0x000001BD75C04B38>
            <matplotlib.offsetbox.VPacker object at 0x000001BD75C04550>
                <matplotlib.offsetbox.HPacker object at 0x000001BD75C04AC8>
                    <matplotlib.offsetbox.DrawingArea object at 0x000001BD75C04748>
                        <matplotlib.collections.PathCollection object at 0x000001BD75C047F0>
                    <matplotlib.offsetbox.TextArea object at 0x000001BD75C04588>
                        Text(0, 0, 'Tall students')
                <matplotlib.offsetbox.HPacker object at 0x000001BD75C04B00>
                    <matplotlib.offsetbox.DrawingArea object at 0x000001BD75C049B0>
                        <matplotlib.collections.PathCollection object at 0x000001BD75C04A58>
                    <matplotlib.offsetbox.TextArea obj

# Line Plots

In [4]:
import numpy as np

linear_data = np.array([1,2,3,4,5,6,7,8])
exponential_data = linear_data**2

plt.figure()
# plot the linear data and the exponential data
plt.plot(linear_data, '-o', exponential_data, '-o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1433e1a5c88>,
 <matplotlib.lines.Line2D at 0x1433e1a5e10>]

In [41]:
# plot another series with a dashed red line
plt.plot([22,44,55], '--r')

[<matplotlib.lines.Line2D at 0x1bd75ce0710>]

In [42]:
plt.xlabel('Some data')
plt.ylabel('Some other data')
plt.title('A title')
# add a legend with legend entries (because we didn't have labels when we plotted the data series)
plt.legend(['Baseline', 'Competition', 'Us'])

<matplotlib.legend.Legend at 0x1bd75cf90f0>

In [43]:
# fill the area between the linear data and exponential data
plt.gca().fill_between(range(len(linear_data)), 
                       linear_data, exponential_data, 
                       facecolor='blue', 
                       alpha=0.25)

<matplotlib.collections.PolyCollection at 0x1bd75d12438>

Let's try working with dates!

In [50]:
plt.figure()

observation_dates = np.arange('2017-01-01', '2017-01-09', dtype='datetime64[D]')

plt.plot(observation_dates, linear_data, '-o',  observation_dates, exponential_data, '-o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1bd75e04ac8>,
 <matplotlib.lines.Line2D at 0x1bd75e0c390>]

Let's try using pandas

In [51]:
import pandas as pd

plt.figure()
observation_dates = np.arange('2017-01-01', '2017-01-09', dtype='datetime64[D]')
observation_dates = map(pd.to_datetime, observation_dates) # trying to plot a date will result in an error; using the map 
# function iterates over each of the observation dates and converts them to datetime objects so that matplotlib can 
# display them as dates; **Note** in the earlier version of matplotlib if you just provided a set of dates the matplotlib
# wouldn't automatically recognise these dates however in the latest version it does. 
plt.plot(observation_dates, linear_data, '-o',  observation_dates, exponential_data, '-o')

<IPython.core.display.Javascript object>

RuntimeError: matplotlib does not support generators as input

In [56]:
plt.figure()
observation_dates = np.arange('2017-01-01', '2017-01-09', dtype='datetime64[D]')
observation_dates = list(map(pd.to_datetime, observation_dates)) # convert the map to a list to get rid of the error
plt.plot(observation_dates, linear_data, '-o',  observation_dates, exponential_data, '-o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1bd7880d4e0>,
 <matplotlib.lines.Line2D at 0x1bd7684a630>]

In [61]:
rec_gc(plt.gca().xaxis)

XAxis(80.000000,120.000000)
    Text(0.5, 38.16579644303842, 'Date')
    Text(1, 39.55468533192731, '')
    <matplotlib.axis.XTick object at 0x000001BD788FEB00>
        Line2D((736330,0))
        Line2D()
        Line2D((0,0),(0,1))
        Text(736330.0, 0, '2017-01-01')
        Text(0, 1, '2017-01-01')
    <matplotlib.axis.XTick object at 0x000001BD7880C240>
        Line2D((736331,0))
        Line2D()
        Line2D((0,0),(0,1))
        Text(736331.0, 0, '2017-01-02')
        Text(0, 1, '2017-01-02')
    <matplotlib.axis.XTick object at 0x000001BD75CCFC18>
        Line2D((736332,0))
        Line2D()
        Line2D((0,0),(0,1))
        Text(736332.0, 0, '2017-01-03')
        Text(0, 1, '2017-01-03')
    <matplotlib.axis.XTick object at 0x000001BD7684AC18>
        Line2D((736333,0))
        Line2D()
        Line2D((0,0),(0,1))
        Text(736333.0, 0, '2017-01-04')
        Text(0, 1, '2017-01-04')
    <matplotlib.axis.XTick object at 0x000001BD76858160>
        Line2D((736334,0))
    

In [57]:
x = plt.gca().xaxis

# rotate the tick labels for the x axis
for item in x.get_ticklabels():
    item.set_rotation(45)

In [58]:
# adjust the subplot so the text doesn't run off the image
plt.subplots_adjust(bottom=0.25)

In [59]:
ax = plt.gca()
ax.set_xlabel('Date')
ax.set_ylabel('Units')
ax.set_title('Exponential vs. Linear performance')

Text(0.5, 1.0, 'Exponential vs. Linear performance')

In [60]:
# you can add mathematical expressions in any text element
ax.set_title("Exponential ($x^2$) vs. Linear ($x$) performance")

Text(0.5, 1.0, 'Exponential ($x^2$) vs. Linear ($x$) performance')

# Bar Charts

In [5]:
plt.figure()
xvals = range(len(linear_data))
plt.bar(xvals, linear_data, width = 0.3)

<IPython.core.display.Javascript object>

<BarContainer object of 8 artists>

In [6]:
new_xvals = []

# plot another set of bars, adjusting the new xvals to make up for the first set of bars plotted
for item in xvals:
    new_xvals.append(item+0.3)

plt.bar(new_xvals, exponential_data, width = 0.3 ,color='red')

<BarContainer object of 8 artists>

In [7]:
from random import randint
linear_err = [randint(0,15) for x in range(len(linear_data))] 

# This will plot a new set of bars with errorbars using the list of random error values
plt.bar(xvals, linear_data, width = 0.3, yerr=linear_err)

<BarContainer object of 8 artists>

In [8]:
# stacked bar charts are also possible
plt.figure()
xvals = range(len(linear_data))
plt.bar(xvals, linear_data, width = 0.3, color='b')
plt.bar(xvals, exponential_data, width = 0.3, bottom=linear_data, color='r')

<IPython.core.display.Javascript object>

<BarContainer object of 8 artists>

In [9]:
# or use barh for horizontal bar charts
plt.figure()
xvals = range(len(linear_data))
plt.barh(xvals, linear_data, height = 0.3, color='b')
plt.barh(xvals, exponential_data, height = 0.3, left=linear_data, color='r')

<IPython.core.display.Javascript object>

<BarContainer object of 8 artists>

# Assignment 2

Before working on this assignment please read these instructions fully. In the submission area, you will notice that you can click the link to **Preview the Grading** for each step of the assignment. This is the criteria that will be used for peer grading. Please familiarize yourself with the criteria before beginning the assignment.

An NOAA dataset has been stored in the file `data/C2A2_data/BinnedCsvs_d400/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv`. The data for this assignment comes from a subset of The National Centers for Environmental Information (NCEI) [Daily Global Historical Climatology Network](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt) (GHCN-Daily). The GHCN-Daily is comprised of daily climate records from thousands of land surface stations across the globe.

Each row in the assignment datafile corresponds to a single observation.

The following variables are provided to you:

* **id** : station identification code
* **date** : date in YYYY-MM-DD format (e.g. 2012-01-24 = January 24, 2012)
* **element** : indicator of element type
    * TMAX : Maximum temperature (tenths of degrees C)
    * TMIN : Minimum temperature (tenths of degrees C)
* **value** : data value for element (tenths of degrees C)

For this assignment, you must:

1. Read the documentation and familiarize yourself with the dataset, then write some python code which returns a line graph of the record high and record low temperatures by day of the year over the period 2005-2014. The area between the record high and record low temperatures for each day should be shaded.
2. Overlay a scatter of the 2015 data for any points (highs and lows) for which the ten year record (2005-2014) record high or record low was broken in 2015.
3. Watch out for leap days (i.e. February 29th), it is reasonable to remove these points from the dataset for the purpose of this visualization.
4. Make the visual nice! Leverage principles from the first module in this course when developing your solution. Consider issues such as legends, labels, and chart junk.

The data you have been given is near **Ann Arbor, Michigan, United States**, and the stations the data comes from are shown on the map below.

In [33]:
import matplotlib.pyplot as plt, pandas as pd, numpy as np, mplleaflet


def leaflet_plot_stations(binsize, hashid):
    
    
    df = pd.read_csv(r'D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation/BinSize_d{}.csv'.format(binsize))

    #df = pd.read_csv('data/C2A2_data/BinSize_d{}.csv'.format(binsize)) # the {} is used to input the file name 

    station_locations_by_hash = df[df['hash'] == hashid]

    lons = station_locations_by_hash['LONGITUDE'].tolist()
    lats = station_locations_by_hash['LATITUDE'].tolist()

    plt.figure(figsize=(8,8))

    plt.scatter(lons, lats, c='r', alpha=0.7, s=200)

    return mplleaflet.display()

leaflet_plot_stations(400,'fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89')



In [35]:
%matplotlib notebook
df = pd.read_csv(r'D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv')
df_copy = df.copy()
df_copy = df_copy[~(df_copy['Date'].str.endswith(r'02-29'))] # ~ returns the values that don't end with '02-29'
df_copy['Date'] = list(map(pd.to_datetime, df_copy['Date']))
#df_copy = df_copy.loc[df_copy['Date'].dt.date!=('2008/02/29' or '2012/02/29')]
df_copy = df_copy.loc[df_copy['Date'].dt.year!=2015]
#df_copy.head(10)

max_data = df_copy[df_copy['Element']=='TMAX']
min_data = df_copy[df_copy['Element']=='TMIN']

min_values = min_data.groupby([df_copy['Date'].dt.month,df_copy['Date'].dt.day])['Data_Value'].agg(min)
max_values = max_data.groupby([df_copy['Date'].dt.month,df_copy['Date'].dt.day])['Data_Value'].agg(max)

df['Date'] = list(map(pd.to_datetime, df['Date']))
df_2015 = df.loc[df['Date'].dt.year == 2015]
df_2015['Date'] = list(map(pd.to_datetime, df_2015['Date']))
df_2015_max = df_2015.loc[df_2015['Element']=='TMAX']
df_2015_min = df_2015.loc[df_2015['Element']=='TMIN']
df_2015_max_values = df_2015_max.groupby([df_2015['Date'].dt.month,df_2015['Date'].dt.day])['Data_Value'].agg(max)
df_2015_min_values = df_2015_min.groupby([df_2015['Date'].dt.month,df_2015['Date'].dt.day])['Data_Value'].agg(min)

record_min = [df_2015_min_values.values[k] 
              for k in range(len(min_values.values)) if ((min_values.values[k]-df_2015_min_values.values[k])>0)]
record_min_axis = [k
              for k in range(len(min_values.values)) if ((min_values.values[k]-df_2015_min_values.values[k])>0)]

record_max = [df_2015_max_values.values[k] 
              for k in range(len(min_values.values)) if ((max_values.values[k]-df_2015_max_values.values[k])<0)]
record_max_axis = [k 
              for k in range(len(min_values.values)) if ((max_values.values[k]-df_2015_max_values.values[k])<0)]



#fig = plt.figure()
plt.plot(min_values.values,c='green',alpha = 1, label = 'Minimum Temperature (2005-14)')
plt.plot(max_values.values,c='red',alpha = 1, label = 'Maximum Temperature (2005-14)')
plt.scatter(record_min_axis,record_min, s = 12, marker = 'x',c = 'black'
            , label = 'Record Breaking Minimum (2015)')
plt.scatter(record_max_axis,record_max, s = 12, marker = 'x',c = 'black'
            , label = 'Record Breaking Maximum (2015)')
plt.gca().fill_between(range(len(max_values.values)), 
                       min_values.values, max_values.values, 
                       facecolor='grey', 
                       alpha=0.3)
x = plt.gca().xaxis
for item in x.get_ticklabels():
    item.set_rotation(75)
    
plt.subplots_adjust(bottom=0.25)
ax = plt.gca()
ax.set_xlabel('Month')
ax.set_ylabel('Temperature (*C)')
ax.set_title('Minimum & Maximum Temperature for Ann Arbor')
plt.legend(loc = 8, frameon=False, fontsize=8)
plt.xticks( np.linspace(15,15 + 30*11 , num = 12), 
           (r'January', r'February', r'March', r'April', r'May', r'June', r'July', r'August', 
            r'September', r'October', r'November', r'December') )
plt.ylim(-320, 420)
plt.show()
#fig.savefig('Min_Max_Temp.png') # saving figure




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<IPython.core.display.Javascript object>

In [20]:
import calendar
print(calendar.isleap(2016))

True


# Subplots

[Back to top](#Basic-Plotting-with-matplotlib)

In [6]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np

plt.subplot?

In [37]:
plt.figure()
# subplot with 1 row, 2 columns, and current axis is 1st subplot axes
plt.subplot(1, 2, 1)

linear_data = np.array([1,2,3,4,5,6,7,8])

plt.plot(linear_data, '-o')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x270d32bcb38>]

In [38]:
exponential_data = linear_data**2 

# subplot with 1 row, 2 columns, and current axis is 2nd subplot axes
plt.subplot(1, 2, 2)
plt.plot(exponential_data, '-o')

[<matplotlib.lines.Line2D at 0x270d3b25198>]

In [39]:
# plot exponential data on 1st subplot axes
plt.subplot(1, 2, 1)
plt.plot(exponential_data, '-x')

  "Adding an axes using the same arguments as a previous axes "


[<matplotlib.lines.Line2D at 0x270d3b241d0>]

In [40]:
plt.figure()
ax1 = plt.subplot(1, 2, 1)
plt.plot(linear_data, '-o')
# pass sharey=ax1 to ensure the two subplots share the same y axis
ax2 = plt.subplot(1, 2, 2, sharey=ax1)
plt.plot(exponential_data, '-x')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x270d325e240>]

In [41]:
plt.figure()
# the right hand side is equivalent shorthand syntax
plt.subplot(1,2,1) == plt.subplot(121)

<IPython.core.display.Javascript object>

True

In [63]:
# create a 3x3 grid of subplots
fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True)
# plot the linear_data on the 5th subplot axes 
ax5.plot(linear_data, '-')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x270d9d0cc50>]

In [64]:
# set inside tick labels to visible
for ax in plt.gcf().get_axes():
    for label in ax.get_xticklabels() + ax.get_yticklabels():
        label.set_visible(True)

In [65]:
# necessary on some systems to update the plot
plt.gcf().canvas.draw()

# Histograms

In [48]:
# create 2x2 grid of axis subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)
axs = [ax1,ax2,ax3,ax4]

# draw n = 10, 100, 1000, and 10000 samples from the normal distribution and plot corresponding histograms
for n in range(0,len(axs)):
    sample_size = 10**(n+1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size)
    axs[n].hist(sample)
    axs[n].set_title('n={}'.format(sample_size))

<IPython.core.display.Javascript object>

In [49]:
# repeat with number of bins set to 100
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)
axs = [ax1,ax2,ax3,ax4]

for n in range(0,len(axs)):
    sample_size = 10**(n+1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size)
    axs[n].hist(sample, bins=100)
    axs[n].set_title('n={}'.format(sample_size))

<IPython.core.display.Javascript object>

In [50]:
plt.figure()
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
plt.scatter(X,Y)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x270d5e25d30>

In [51]:
# use gridspec to partition the figure into subplots
import matplotlib.gridspec as gridspec

plt.figure()
gspec = gridspec.GridSpec(3, 3)

top_histogram = plt.subplot(gspec[0, 1:])
side_histogram = plt.subplot(gspec[1:, 0])
lower_right = plt.subplot(gspec[1:, 1:])

<IPython.core.display.Javascript object>

In [52]:
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
lower_right.scatter(X, Y)
top_histogram.hist(X, bins=100)
s = side_histogram.hist(Y, bins=100, orientation='horizontal')

In [74]:
# clear the histograms and plot normed histograms
top_histogram.clear()
top_histogram.hist(X, bins=100, density=True)
side_histogram.clear()
side_histogram.hist(Y, bins=100, orientation='horizontal', density=True)
# flip the side histogram's x axis
side_histogram.invert_xaxis()

In [54]:
# change axes limits
for ax in [top_histogram, lower_right]:
    ax.set_xlim(0, 1)
for ax in [side_histogram, lower_right]:
    ax.set_ylim(-5, 5)

In [76]:
%%HTML
<img src='https://static.projects.iq.harvard.edu/files/styles/os_files_large/public/vpl/files/edxbigquery4.png?m=1477691467&itok=E-mDs3Us' />

In [56]:
%%HTML
<img src='https://proxy.duckduckgo.com/iu/?u=http%3A%2F%2F3.bp.blogspot.com%2F-bm0XgU1za0Y%2FValVBzq5lEI%2FAAAAAAAAKz8%2FE9OMO9BY6_Y%2Fs1600%2FTesla-Model-S.jpg&f=1' />

# Box and Whisker Plots

In [3]:
import pandas as pd, numpy as np
normal_sample = np.random.normal(loc=0.0, scale=1.0, size=10000)
random_sample = np.random.random(size=10000)
gamma_sample = np.random.gamma(2, size=10000)

df = pd.DataFrame({'normal': normal_sample, 
                   'random': random_sample, 
                   'gamma': gamma_sample})

In [4]:
df.describe() # prints summary statistics

Unnamed: 0,normal,random,gamma
count,10000.0,10000.0,10000.0
mean,-0.001296,0.494894,1.981772
std,1.002416,0.289191,1.373762
min,-3.632149,0.000393,0.009318
25%,-0.676168,0.242658,0.974848
50%,-0.002112,0.492459,1.668064
75%,0.669229,0.747326,2.677425
max,4.049698,0.999968,12.345287


In [7]:
plt.figure()
# create a boxplot of the normal data, assign the output to a variable to supress output
_ = plt.boxplot(df['normal'], whis='range')

<IPython.core.display.Javascript object>

In [8]:
# clear the current figure
plt.clf()
# plot boxplots for all three of df's columns
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ], whis='range')

In [9]:
plt.figure()
_ = plt.hist(df['gamma'], bins=100)

<IPython.core.display.Javascript object>

In [10]:
import mpl_toolkits.axes_grid1.inset_locator as mpl_il # for embedding a graph within another graph

plt.figure()
plt.boxplot([ df['normal'], df['random'], df['gamma'] ], whis='range')
# overlay axis on top of another 
ax2 = mpl_il.inset_axes(plt.gca(), width='60%', height='40%', loc=2)
ax2.hist(df['gamma'], bins=100)
ax2.margins(x=0.5)

<IPython.core.display.Javascript object>

In [11]:
# switch the y axis ticks for ax2 to the right side
ax2.yaxis.tick_right()

In [12]:
# if `whis` argument isn't passed, boxplot defaults to showing 1.5*interquartile (IQR) whiskers with outliers
plt.figure()
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ] )

<IPython.core.display.Javascript object>

# Heatmaps

In [13]:
plt.figure()

Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
_ = plt.hist2d(X, Y, bins=25)

<IPython.core.display.Javascript object>

In [14]:
plt.figure()
_ = plt.hist2d(X, Y, bins=100)

<IPython.core.display.Javascript object>

In [15]:
# add a colorbar legend
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0x165dc47ffd0>

# Animations

In [16]:
import matplotlib.animation as animation

n = 100
x = np.random.randn(n)# returns an array of elements from a normal distribution

In [26]:
np.arange(-4, 4, 0.5)

array([-4. , -3.5, -3. , -2.5, -2. , -1.5, -1. , -0.5,  0. ,  0.5,  1. ,
        1.5,  2. ,  2.5,  3. ,  3.5])

In [17]:
# create the function that will do the plotting, where curr is the current frame
def update(curr):
    # check if animation is at the last frame, and if so, stop the animation a
    if curr == n: 
        a.event_source.stop()
    plt.cla() # clear the current axis
    bins = np.arange(-4, 4, 0.5)
    plt.hist(x[:curr], bins=bins)# returns a histogram with 15 bins of the generated normal distribution
    plt.axis([-4,4,0,30])
    plt.gca().set_title('Sampling the Normal Distribution')
    plt.gca().set_ylabel('Frequency')
    plt.gca().set_xlabel('Value')
    plt.annotate('n = {}'.format(curr), [3,27]) #the second argument is the position for the annotation

In [57]:
fig = plt.figure()
a = animation.FuncAnimation(fig, update, interval=100) # interval refers to the delay in milliseconds between frames

<IPython.core.display.Javascript object>

# Interactivity

In [59]:
plt.figure()
data = np.random.rand(10)
plt.plot(data)

def onclick(event):
    plt.cla()
    plt.plot(data)
    plt.gca().set_title('Event at pixels {},{} \nand data {},{}'.format(event.x, event.y, event.xdata, event.ydata))

# tell mpl_connect we want to pass a 'button_press_event' into onclick when the event is detected
plt.gcf().canvas.mpl_connect('button_press_event', onclick)

<IPython.core.display.Javascript object>

6

In [62]:
from random import shuffle
origins = ['China', 'Brazil', 'India', 'USA', 'Canada', 'UK', 'Germany', 'Iraq', 'Chile', 'Mexico']

shuffle(origins) # randomly orders the names of countries

df = pd.DataFrame({'height': np.random.rand(10),
                   'weight': np.random.rand(10),
                   'origin': origins})
df

Unnamed: 0,height,weight,origin
0,0.519845,0.085668,UK
1,0.608095,0.715933,Mexico
2,0.878307,0.379528,Iraq
3,0.224643,0.394411,Germany
4,0.785336,0.316018,Chile
5,0.469151,0.233942,Canada
6,0.112548,0.349928,Brazil
7,0.468361,0.75877,India
8,0.887668,0.035342,China
9,0.988446,0.8982,USA


In [68]:
plt.figure()
# picker=5 means the mouse doesn't have to click directly on an event, but can be up to 5 pixels away
plt.scatter(df['height'], df['weight'], picker=15)
plt.gca().set_ylabel('Weight')
plt.gca().set_xlabel('Height')

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Height')

In [69]:
def onpick(event):
    origin = df.iloc[event.ind[0]]['origin']
    plt.gca().set_title('Selected item came from {}'.format(origin))

# tell mpl_connect we want to pass a 'pick_event' into onpick when the event is detected
plt.gcf().canvas.mpl_connect('pick_event', onpick)

6

# Assignment 3 - Building a Custom Visualization

---

In this assignment you must choose one of the options presented below and submit a visual as well as your source code for peer grading. The details of how you solve the assignment are up to you, although your assignment must use matplotlib so that your peers can evaluate your work. The options differ in challenge level, but there are no grades associated with the challenge level you chose. However, your peers will be asked to ensure you at least met a minimum quality for a given technique in order to pass. Implement the technique fully (or exceed it!) and you should be able to earn full grades for the assignment.


&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Ferreira, N., Fisher, D., & Konig, A. C. (2014, April). [Sample-oriented task-driven visualizations: allowing users to make better, more confident decisions.](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/Ferreira_Fisher_Sample_Oriented_Tasks.pdf) 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;In Proceedings of the SIGCHI Conference on Human Factors in Computing Systems (pp. 571-580). ACM. ([video](https://www.youtube.com/watch?v=BI7GAs-va-Q))


In this [paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/Ferreira_Fisher_Sample_Oriented_Tasks.pdf) the authors describe the challenges users face when trying to make judgements about probabilistic data generated through samples. As an example, they look at a bar chart of four years of data (replicated below in Figure 1). Each year has a y-axis value, which is derived from a sample of a larger dataset. For instance, the first value might be the number votes in a given district or riding for 1992, with the average being around 33,000. On top of this is plotted the 95% confidence interval for the mean (see the boxplot lectures for more information, and the yerr parameter of barcharts).

<br>
![Figure 1](D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation\Assignment3Fig2c.png)
<img src="D:/Coursera/Applied Data Science with Python/Applied Plotting, Charting & Data Representation/Assignment3Fig1.png" alt="Figure 1" style="width: 400px;"/>
<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 1 from (Ferreira et al, 2014).</h4>

<br>

A challenge that users face is that, for a given y-axis value (e.g. 42,000), it is difficult to know which x-axis values are most likely to be representative, because the confidence levels overlap and their distributions are different (the lengths of the confidence interval bars are unequal). One of the solutions the authors propose for this problem (Figure 2c) is to allow users to indicate the y-axis value of interest (e.g. 42,000) and then draw a horizontal line and color bars based on this value. So bars might be colored red if they are definitely above this value (given the confidence interval), blue if they are definitely below this value, or white if they contain this value.


<br>
<img src="D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation\Assignment3Fig2c.png" alt="Figure 2" style="width: 400px;"/>
<h4 style="text-align: center;" markdown="1">  Figure 2c from (Ferreira et al. 2014). Note that the colorbar legend at the bottom as well as the arrows are not required in the assignment descriptions below.</h4>

<br>
<br>

**Easiest option:** Implement the bar coloring as described above - a color scale with only three colors, (e.g. blue, white, and red). Assume the user provides the y axis value of interest as a parameter or variable.


**Harder option:** Implement the bar coloring as described in the paper, where the color of the bar is actually based on the amount of data covered (e.g. a gradient ranging from dark blue for the distribution being certainly below this y-axis, to white if the value is certainly contained, to dark red if the value is certainly not contained as the distribution is above the axis).

**Even Harder option:** Add interactivity to the above, which allows the user to click on the y axis to set the value of interest. The bar colors should change with respect to what value the user has selected.

**Hardest option:** Allow the user to interactively set a range of y values they are interested in, and recolor based on this (e.g. a y-axis band, see the paper for more details).

---

*Note: The data given for this assignment is not the same as the data used in the article and as a result the visualizations may look a little different.*

In [76]:
# Use the following data for this assignment:

import pandas as pd,matplotlib.pyplot as plt,numpy as np
#importing colour map(cm) from matplotlib 
from matplotlib import cm

np.random.seed(12345)

df = pd.DataFrame([np.random.normal(32000,200000,3650), 
                   np.random.normal(43000,100000,3650), 
                   np.random.normal(43500,140000,3650), 
                   np.random.normal(48000,70000,3650)], 
                  index=[1992,1993,1994,1995])
df_copy = df.copy()
df_copy = df_copy.T

In [78]:
# dataframe to have all the descriptive statistics
a =pd.DataFrame(df_copy.describe()) 
# estimating the confidence interval
conf_min = []
conf_max = []
error = []
for h in range(4):
    stdev = (a.loc['std',h+1992])
    aver = (a.loc['mean',h+1992])
    counts = (a.loc['count',h+1992])
    conf_min.append((aver-((stdev/np.sqrt(counts))*1.96))) # dividing the population standard error by sqrt of sample 
    #to derive at the sample standard deviation
    conf_max.append((aver+((stdev/np.sqrt(counts))*1.96)))
    error.append((stdev/np.sqrt(counts)*1.96))
    
# calculating the shades and colours to use for the bar charts. the y axis at which the user wants to check is fixed here
nearest =100
y= 39500
# creates a new dataframe returning the shades of colour to use
df_p = pd.DataFrame(index = df.index.values)
for i in range(4):
    df_p.loc[i+1992,'Diff'] = nearest*((y-a.loc['mean',i+1992])//nearest)
old_range = min(abs(df_p['Diff'])),abs(max(df_p['Diff']))
new_range = .5,1
for j in range(4):
    df_p.loc[j+1992,'Sign'] = abs(df_p.loc[j+1992,'Diff'])/(df_p.loc[j+1992,'Diff'])
    df_p.loc[j+1992,'Shade']=df_p.loc[j+1992,'Sign']*np.interp(abs(df_p.loc[j+1992,'Diff']),old_range,new_range)
df_p['Select Colour'] = df_p['Shade']
df_p['Sign'] = df_p['Sign'].apply(lambda x: 0 if pd.isnull(x) == True else x)
df_p['Shade'] = df_p['Shade'].apply(lambda x: 0 if pd.isnull(x) == True else x)
df_p['Select Colour'] = df_p['Shade'].apply( lambda x: 'white' if x == 0 
                                            else 'use reds cmap to get colour' if x<0 
                                            else 'use blues cmap to get colour')



#values = np.array([a.loc['mean',1992],a.loc['mean',1993],a.loc['mean',1994],a.loc['mean',1995]])
shade = df_p['Shade']
#importing different gradients to red and blue from matplotlib colour map(cm)
reds = cm.Reds
blues = cm.Blues
colour = ['white' if x==0 else reds(abs(x)) if x<0 else blues(abs(x)) for x in shade]
'''plt.figure()
plt.bar(range(0,len(values)),values,edgecolor = 'black',color=colour)
plt.axhline(y=y,color='grey',ls='--')
plt.show()'''

plt.figure()
plt.bar(df.index.values,np.array([np.mean(df_copy[1992]),np.mean(df_copy[1993]),
                                  np.mean(df_copy[1994]),np.mean(df_copy[1995])]),edgecolor = 'black',
        color=colour,yerr=error,
       tick_label = (r'1992',r'1993',r'1994',r'1995'),capsize=7)
plt.axhline(y=y,color='grey',ls='--')
plt.xlabel('Years')
plt.ylabel('# of Cigarettes')
plt.title('Average Cigarette Sales per Month in Bangalore (1992-1995)')
plt.show()



<IPython.core.display.Javascript object>

In [79]:
values = np.array([1.7,1.3,.84,.95,1])
y=1.01
shade = [-.99,-.5,.8,.5,0]
from matplotlib import cm
reds = cm.Reds
blues = cm.Blues
colour = ['white' if x==0 else reds(abs(x)) if x<0 else blues(abs(x)) for x in shade]
plt.figure()
plt.bar(range(1,len(values)+1),values,edgecolor = 'black',color=colour)
plt.axhline(y=y,color='grey',ls='--')
plt.show()

<IPython.core.display.Javascript object>

In [80]:
nearest =100
y= 39500
df_p = pd.DataFrame(index = df.index.values)
for i in range(4):
    df_p.loc[i+1992,'Diff'] = nearest*((y-a.loc['mean',i+1992])//nearest)
old_range = min(abs(df_p['Diff'])),abs(max(df_p['Diff']))
new_range = .5,1
for j in range(4):
    df_p.loc[j+1992,'Sign'] = abs(df_p.loc[j+1992,'Diff'])/(df_p.loc[j+1992,'Diff'])
    df_p.loc[j+1992,'Shade']=df_p.loc[j+1992,'Sign']*np.interp(abs(df_p.loc[j+1992,'Diff']),old_range,new_range)
df_p['Select Colour'] = df_p['Shade']
df_p['Sign'] = df_p['Sign'].apply(lambda x: 0 if pd.isnull(x) == True else x)
df_p['Shade'] = df_p['Shade'].apply(lambda x: 0 if pd.isnull(x) == True else x)
df_p['Select Colour'] = df_p['Shade'].apply( lambda x: 'white' if x == 0 
                                                                          else 'use reds cmap to get colour' if x<0 else 
                                                                         'use blues cmap to get colour')


df_p

  if __name__ == '__main__':


Unnamed: 0,Diff,Sign,Shade,Select Colour
1992,6100.0,1.0,1.0,use blues cmap to get colour
1993,-2400.0,-1.0,-0.696721,use reds cmap to get colour
1994,0.0,0.0,0.0,white
1995,-8300.0,-1.0,-1.0,use reds cmap to get colour


In [81]:
values = np.array([a.loc['mean',1992],a.loc['mean',1993],a.loc['mean',1994],a.loc['mean',1995]])
y=39500
shade = df_p['Shade']
from matplotlib import cm
reds = cm.Reds
blues = cm.Blues
colour = ['white' if x==0 else reds(abs(x)) if x<0 else blues(abs(x)) for x in shade]
plt.figure()
plt.bar(range(0,len(values)),values,edgecolor = 'black',color=colour)
plt.axhline(y=y,color='grey',ls='--')
plt.show()

<IPython.core.display.Javascript object>

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib notebook')

np.random.seed(12345)

df = pd.DataFrame([np.random.normal(32000,200000,3650), 
                   np.random.normal(43000,100000,3650), 
                   np.random.normal(43500,140000,3650), 
                   np.random.normal(48000,70000,3650)], 
                  index=[1992,1993,1994,1995])

df = df.transpose()

df.describe()

# import the t-student distribution statistics for y_error calculation
from scipy.stats import t
conf = 0.95

year_avg = df.mean()
year_std = df.std()
y_error = year_std / np.sqrt(df.shape[0]) * t.ppf(1-(1-conf)/2, df.shape[0]-1)
color_blue = pd.Series([26/255, 35/255, 126/255, 0])
color_red = pd.Series([183/255, 28/255, 28/255, 0])

# the default barplot configuration
y = year_avg.mean()
fig = plt.figure()
barplot = plt.bar(df.columns, year_avg, yerr=y_error)
horizontal_line = plt.axhline(y, color='gray')
line_label = plt.text(df.columns[0] + (df.columns[1]- df.columns[0])*.1, y,
                      "{0:.0f}".format(y),
                      bbox={"facecolor": "white", "boxstyle": "round"})
ticks = plt.xticks(df.columns, list(df))

# we need the bars colored adequately on start
bars = barplot.get_children()
for bar, avg, y_err in zip(bars, year_avg, y_error):
        
        # sets the 'red' and 'blue' depending on whether respective condition is True and applies faded alpha
        bar_color = pd.Series([0, 0, 0, min(1, abs(y - avg)/y_err)])        
        if (y < avg)>0 :
            bar_color += color_red
        else:
            bar_color += color_blue
        bar.set_color(bar_color)
        bar.set_edgecolor("black")


def onclick(event):
    y = event.ydata # gets the "y" coordinate of the mouseclick
    horizontal_line.set_ydata(y)
    line_label.set_y(y)
    line_label.set_text("{:.0f}".format(y))    
    # iterates through the bars and applies new coloring
    for bar, avg, y_err in zip(bars, year_avg, y_error):
        # sets the 'red' and 'blue' depending on whether respective condition is True and applies faded alpha
        bar_color = pd.Series([0, 0, 0, min(1, abs(y - avg)/y_err)])        
        if (y < avg)>0 :
            bar_color += color_red
        else:
            bar_color += color_blue
        bar.set_color(bar_color)
        bar.set_edgecolor("black")

# go interactive!
plt.gcf().canvas.mpl_connect('button_press_event', onclick)


<IPython.core.display.Javascript object>

6

# Pandas Visualization

[Back to top](#Basic-Plotting-with-matplotlib)

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib notebook

In [84]:
# see the pre-defined styles provided.
plt.style.available

['bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark-palette',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'seaborn',
 'Solarize_Light2',
 'tableau-colorblind10',
 '_classic_test']

In [164]:
# use the 'seaborn-colorblind' style for visually impaired people
plt.style.use('seaborn-colorblind')

### DataFrame.plot

In [165]:
np.random.seed(123)

df = pd.DataFrame({'A': np.random.randn(365).cumsum(0), 
                   'B': np.random.randn(365).cumsum(0) + 20,
                   'C': np.random.randn(365).cumsum(0) - 20}, 
                  index=pd.date_range('1/1/2017', periods=365))
df.head()

Unnamed: 0,A,B,C
2017-01-01,-1.085631,20.059291,-20.230904
2017-01-02,-0.088285,21.803332,-16.659325
2017-01-03,0.194693,20.835588,-17.055481
2017-01-04,-1.311601,21.255156,-17.093802
2017-01-05,-1.890202,21.462083,-19.518638


In [176]:
df.plot(); # add a semi-colon to the end of the plotting call to suppress unwanted output

<IPython.core.display.Javascript object>

We can select which plot we want to use by passing it into the 'kind' parameter.

In [177]:
df.plot('A','B', kind = 'scatter');

<IPython.core.display.Javascript object>

You can also choose the plot kind by using the `DataFrame.plot.kind` methods instead of providing the `kind` keyword argument.

`kind` :
- `'line'` : line plot (default)
- `'bar'` : vertical bar plot
- `'barh'` : horizontal bar plot
- `'hist'` : histogram
- `'box'` : boxplot
- `'kde'` : Kernel Density Estimation plot
- `'density'` : same as 'kde'
- `'area'` : area plot
- `'pie'` : pie plot
- `'scatter'` : scatter plot
- `'hexbin'` : hexbin plot

In [178]:
# create a scatter plot of columns 'A' and 'C', with changing color (c) and size (s) based on column 'B'
df.plot.scatter('A', 'C', c='B', s=df['B'], colormap='viridis')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x17d628d88d0>

In [179]:
ax = df.plot.scatter('A', 'C', c='B', s=df['B'], colormap='viridis')
ax.set_aspect('equal') # gives you the range of column A

<IPython.core.display.Javascript object>

In [180]:
df.plot.box();

<IPython.core.display.Javascript object>

In [181]:
df.plot.hist(alpha=0.7);

<IPython.core.display.Javascript object>

[Kernel density estimation plots](https://en.wikipedia.org/wiki/Kernel_density_estimation) are useful for deriving a smooth continuous function from a given sample.

In [182]:
df.plot.kde();

<IPython.core.display.Javascript object>

### pandas.plotting

Other forms of plotting in pandas

* [Scatter Matrix](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-scatter-matrix)
* [Andrews Curves](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-andrews-curves)
* [Parallel Coordinates](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-parallel-coordinates)
* [Lag Plot](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-lag)
* [Autocorrelation Plot](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-autocorrelation)
* [Bootstrap Plot](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-bootstrap)
* [RadViz](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-radviz)

[Iris flower data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)

In [183]:
iris = pd.read_csv(r'D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation'+\
                   '\iris.csv')
iris.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [186]:
pd.plotting.scatter_matrix(iris);

<IPython.core.display.Javascript object>

In [187]:
plt.figure()
pd.plotting.parallel_coordinates(iris, 'Name');
# Using parallel coordinates points are represented as connected line segments. 
# Each vertical line represents one attribute. One set of connected line segments represents one data point. 

<IPython.core.display.Javascript object>

# Seaborn

In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib notebook

In [189]:
np.random.seed(1234)

v1 = pd.Series(np.random.normal(0,10,1000), name='v1')
v2 = pd.Series(2*v1 + np.random.normal(60,15,1000), name='v2')

In [190]:
plt.figure()
plt.hist(v1, alpha=0.7, bins=np.arange(-50,150,5), label='v1');
plt.hist(v2, alpha=0.7, bins=np.arange(-50,150,5), label='v2');
plt.legend();

<IPython.core.display.Javascript object>

In [192]:
# plot a kernel density estimation over a stacked barchart
plt.figure()
plt.hist([v1, v2], histtype='barstacked', density=True);
v3 = np.concatenate((v1,v2))
sns.kdeplot(v3);

<IPython.core.display.Javascript object>

In [193]:
plt.figure()
# we can pass keyword arguments for each individual component of the plot
sns.distplot(v3, hist_kws={'color': 'Teal'}, kde_kws={'color': 'Navy'});

<IPython.core.display.Javascript object>

In [194]:
sns.jointplot(v1, v2, alpha=0.4);

<IPython.core.display.Javascript object>

In [195]:
grid = sns.jointplot(v1, v2, alpha=0.4);
grid.ax_joint.set_aspect('equal')

<IPython.core.display.Javascript object>

In [196]:
sns.jointplot(v1, v2, kind='hex');

<IPython.core.display.Javascript object>

In [197]:
# set the seaborn style for all the following plots
sns.set_style('white')

sns.jointplot(v1, v2, kind='kde', space=0);

<IPython.core.display.Javascript object>

In [198]:
iris = pd.read_csv(r'D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation'+\
                   '\iris.csv')
iris.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [200]:
sns.pairplot(iris, hue='Name', diag_kind='kde', height=2);

<IPython.core.display.Javascript object>

In [201]:
plt.figure(figsize=(8,6))
plt.subplot(121)
sns.swarmplot('Name', 'PetalLength', data=iris);
plt.subplot(122)
sns.violinplot('Name', 'PetalLength', data=iris);

<IPython.core.display.Javascript object>

#### Scrapping Data from Basketball reference

In [108]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url_template = "http://www.basketball-reference.com/draft/NBA_{year}.html"

# create an empty DataFrame
draft_df = pd.DataFrame()

column_headers = [th.getText() for th in 
                  soup.findAll('tr', limit=2)[1].findAll('th')]
column_headers.remove(column_headers[0])

for year in range(1966, 2015):  # for each year
    url = url_template.format(year=year)  # get the url
    
    html = urlopen(url)  # get the html
    soup = BeautifulSoup(html, 'html5lib') # create our BS object
    

    # get our player data
    data_rows = soup.findAll('tr')[2:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                for i in range(len(data_rows))]
    
    # Turn yearly data into a DatFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # create and insert the Draft_Yr column
    year_df.insert(0, 'Draft_Yr', year)
    
    # Append to the big dataframe
    draft_df = draft_df.append(year_df, ignore_index=True)

In [112]:
# Convert data to proper data types
draft_df = draft_df.convert_objects(convert_numeric=True)

# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]

# Replace NaNs with 0s
draft_df = draft_df.fillna(0)

# Rename Columns
draft_df.rename(columns={'WS/48':'WS_per_48'}, inplace=True)
# Change % symbol
draft_df.columns = draft_df.columns.str.replace('%', '_Perc')
# Add per_G to per game stats
draft_df.columns.values[15:19] = [draft_df.columns.values[15:19][col] + 
                                  "_per_G" for col in range(4)]

# Changing the Data Types to int
draft_df.loc[:,'Yrs':'AST'] = draft_df.loc[:,'Yrs':'AST'].astype(int)

# Delete the 'Rk' column
#draft_df.drop('Rk', axis='columns', inplace=True)

draft_df['Pk'] = draft_df['Pk'].astype(int) # change Pk to int

draft_df.isnull().sum() # No missing values in our DataFrame



For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


Draft_Yr           0
Pk                 0
Tm                 0
Player             0
College            0
Yrs                0
G                  0
MP                 0
PTS                0
TRB                0
AST                0
FG_Perc            0
3P_Perc            0
FT_Perc            0
MP                 0
PTS_per_G_per_G    0
TRB_per_G_per_G    0
AST_per_G_per_G    0
WS_per_G_per_G     0
WS_per_48          0
BPM                0
VORP               0
dtype: int64

#### Scrapping Basketball data from ESPN

In [1]:
import urllib.request, urllib.parse, urllib.error, pandas as pd, re, ssl, numpy as np,import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib notebook 

# Ignore Secure Socket Layer (SSL) certificate errors; this part is required for many web pages otherwise it won't pick up
# bad certificate tags
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# dataframe columns
url = "http://www.espn.com/nba/standings/_/season/2017"
html = urllib.request.urlopen(url, context=ctx).read() # here context is calling the ssl certificate fix
webcrawler = BeautifulSoup(html,'html.parser')
column_names = [i.get_text() for i in webcrawler.find_all('tr', class_='Table2__header-row Table2__tr Table2__even')[1].
                find_all('span')]
column_names.remove('')

# loop for all the years

url_template = "http://www.espn.com/nba/standings/_/season/{year}"
espn_bb_data = pd.DataFrame()


for year in range(2003, 2018):
    url = url_template.format(year=year)
    html = urllib.request.urlopen(url, context=ctx).read() # here context is calling the ssl certificate fix
    webcrawler = BeautifulSoup(html,'html.parser')
    team_data = []
    team_name = []
    
    
    for j in range(15):
        team_data.append([i.get_text() for i in webcrawler.find_all('tbody', class_='Table2__tbody')[1].
                          find_all('tr')[j].find_all('td')])
    for j in range(15):
        team_name.append([i.get_text() for i in webcrawler.find_all('tbody', class_='Table2__tbody')[0].
                          find_all('tr')[j].find_all('span', class_='hide-mobile')])
        team_name[j]= ''.join(str(letters) for letters in team_name[j]) # converting from list to string
    espn_bb_data_temp = pd.DataFrame(data = team_data, columns=column_names)
    espn_bb_data_temp.index +=1
    espn_bb_data_temp['Year'] = year
    espn_bb_data_temp['Rank'] = espn_bb_data_temp.index
    espn_bb_data_temp['Team'] = (team_name)
    espn_bb_data=espn_bb_data.append(espn_bb_data_temp) 


espn_bb_data = espn_bb_data.iloc[:,[15]+[i for i in np.arange(1,15)]] # rearranging the columns

espn_bb_data




Unnamed: 0,Team,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,Year,Rank
1,Detroit Pistons,32,.610,-,30-11,20-21,19-9,0-0,91.4,87.7,+3.7,L1,4-6,2003,1
2,New Jersey Nets,33,.598,1,33-8,16-25,16-8,0-0,95.4,90.1,+5.2,L2,5-5,2003,2
3,Indiana Pacers,34,.585,2,32-9,16-25,19-9,0-0,96.8,93.3,+3.4,W2,6-4,2003,3
4,Philadelphia 76ers,34,.585,2,25-16,23-18,17-7,0-0,96.8,94.5,+2.3,W1,5-5,2003,4
5,New Orleans Hornets,35,.573,3,29-12,18-23,17-11,0-0,93.9,91.8,+2.1,W5,7-3,2003,5
6,Boston Celtics,38,.537,6,25-16,19-22,13-12,0-0,92.7,93.0,-0.4,W2,6-4,2003,6
7,Milwaukee Bucks,40,.512,8,25-16,17-24,16-12,0-0,99.5,99.3,+0.2,W4,8-2,2003,7
8,Orlando Magic,40,.512,8,26-15,16-25,14-11,0-0,98.5,98.4,+0.1,L2,4-6,2003,8
9,Washington Wizards,45,.451,13,23-18,14-27,11-13,0-0,91.5,92.5,-1.0,L3,3-7,2003,9
10,New York Knicks,45,.451,13,24-17,13-28,9-15,0-0,95.9,97.2,-1.4,L1,5-5,2003,10


Storing data so that we don't have to scrape data multiple times

In [31]:
espn_bb_data_copy.to_csv('D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation'+\
'\espnbbdata.txt' ,index=False, sep=',')

Reading the stored data to check whether the data has been stored correctly

In [33]:
trial_read = pd.read_csv(r'D:\Coursera\Applied Data Science with Python\Applied Plotting, Charting & Data Representation'+\
                         '\espnbbdata.txt',sep=',')
trial_read

Unnamed: 0,Team,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,Year,Rank
0,Detroit Pistons,32,0.610,-,30-11,20-21,19-9,0-0,91.4,87.7,3.7,L1,4-6,2003,1
1,New Jersey Nets,33,0.598,1,33-8,16-25,16-8,0-0,95.4,90.1,5.2,L2,5-5,2003,2
2,Indiana Pacers,34,0.585,2,32-9,16-25,19-9,0-0,96.8,93.3,3.4,W2,6-4,2003,3
3,Philadelphia 76ers,34,0.585,2,25-16,23-18,17-7,0-0,96.8,94.5,2.3,W1,5-5,2003,4
4,New Orleans Hornets,35,0.573,3,29-12,18-23,17-11,0-0,93.9,91.8,2.1,W5,7-3,2003,5
5,Boston Celtics,38,0.537,6,25-16,19-22,13-12,0-0,92.7,93.0,-0.4,W2,6-4,2003,6
6,Milwaukee Bucks,40,0.512,8,25-16,17-24,16-12,0-0,99.5,99.3,0.2,W4,8-2,2003,7
7,Orlando Magic,40,0.512,8,26-15,16-25,14-11,0-0,98.5,98.4,0.1,L2,4-6,2003,8
8,Washington Wizards,45,0.451,13,23-18,14-27,11-13,0-0,91.5,92.5,-1.0,L3,3-7,2003,9
9,New York Knicks,45,0.451,13,24-17,13-28,9-15,0-0,95.9,97.2,-1.4,L1,5-5,2003,10


In [11]:
espn_bb_data_copy = espn_bb_data.copy()

In [16]:
espn_bb_data_copy.dtypes

Team        object
L            int64
PCT        float64
GB          object
HOME        object
AWAY        object
DIV         object
CONF        object
PPG        float64
OPP PPG    float64
DIFF       float64
STRK        object
L10         object
Year         int64
Rank         int64
dtype: object

In [13]:
espn_bb_data_copy = espn_bb_data_copy.apply(pd.to_numeric,errors='ignore')

In [37]:
espn_bb_data_copy['Team'].unique()

array(['Detroit Pistons', 'New Jersey Nets', 'Indiana Pacers',
       'Philadelphia 76ers', 'New Orleans Hornets', 'Boston Celtics',
       'Milwaukee Bucks', 'Orlando Magic', 'Washington Wizards',
       'New York Knicks', 'Atlanta Hawks', 'Chicago Bulls', 'Miami Heat',
       'Toronto Raptors', 'Cleveland Cavaliers', 'Charlotte Bobcats',
       'Brooklyn Nets', 'Charlotte Hornets'], dtype=object)

In [159]:
teams_selected=np.array(['Detroit Pistons', 'Boston Celtics','Chicago Bulls'] )

In [160]:
plot_data = pd.DataFrame(espn_bb_data_copy.set_index('Team').loc[teams_selected].sort_values(by=['Year'])) 

In [222]:
plt.figure()
for i in range(len(teams_selected)):
    a= plot_data.loc[teams_selected[i]]['Year']
    b= plot_data.loc[teams_selected[i]]['Rank']
    plt.plot(a,b,marker = '*',ls='--',ms=4,lw=0.2,label=teams_selected[i])
plt.legend(loc =1, frameon=True, fontsize=5.7)

for item in plt.gca().xaxis.get_ticklabels():
    item.set_rotation(75)
    
plt.subplots_adjust(bottom=0.25)
plt.xticks( np.arange(2003,2018), 
           (r'2002-03', r'2003-04', r'2004-05', r'2005-06', r'2006-07', r'2007-08', r'2008-09', r'2009-10', 
            r'2010-11', r'2011-12', r'2012-13', r'2013-14', r'2014-15', r'2015-16', r'2016-17') )
plt.yticks(np.arange(1,16))
plt.axhline(y=8, ls='--',color='black',lw=0.4)
plt.text(2011,8.2,'NBA Playoffs',size=7,color='red')
'''line_label = plt.text(df.columns[0] + (df.columns[1]- df.columns[0])*.1, y,
                      "{0:.0f}".format(y),
                      bbox={"facecolor": "white", "boxstyle": "round"})
line_label.set_y(y)
line_label.set_text("{:.0f}".format(y)) '''
plt.xlabel('Season')
plt.ylabel('Rank')
plt.title('Eastern Conference NBA Standings')

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Eastern Conference NBA Standings')

In [207]:
season_dates= np.arange(2002,2018)
for j in (enumerate(season_dates)):
    print('r'+'\''+str(j[1])+'-'+str(j[1]+1)[-2:]+'\'')

r'2002-03'
r'2003-04'
r'2004-05'
r'2005-06'
r'2006-07'
r'2007-08'
r'2008-09'
r'2009-10'
r'2010-11'
r'2011-12'
r'2012-13'
r'2013-14'
r'2014-15'
r'2015-16'
r'2016-17'
r'2017-18'


In [313]:
column_names = [i.get_text() for i in webcrawler.find_all('tr', class_='Table2__header-row Table2__tr Table2__even')[1].
                find_all('span')]
column_names.remove('')


In [385]:
team_data = []
for j in range(15):
    team_data.append([i.get_text() for i in webcrawler.find_all('tbody', class_='Table2__tbody')[1].
 find_all('tr')[j].find_all('td')])


In [386]:
team_name = []
for j in range(15):
    team_name.append([i.get_text() for i in webcrawler.find_all('tbody', class_='Table2__tbody')[0].
 find_all('tr')[j].find_all('span', class_='hide-mobile')])
    team_name[j]= ''.join(str(letters) for letters in team_name[j]) # converting from list to string


In [387]:
(team_name)

['Indiana Pacers',
 'New Jersey Nets',
 'Detroit Pistons',
 'Miami Heat',
 'New Orleans Hornets',
 'Milwaukee Bucks',
 'New York Knicks',
 'Boston Celtics',
 'Cleveland Cavaliers',
 'Toronto Raptors',
 'Philadelphia 76ers',
 'Atlanta Hawks',
 'Washington Wizards',
 'Chicago Bulls',
 'Orlando Magic']

In [375]:
espn_bb_data = pd.DataFrame(data=team_data,columns=column_names)
espn_bb_data['Team'] = (team_name)
espn_bb_data.index +=1
espn_bb_data = espn_bb_data.iloc[:,[13]+[i for i in np.arange(0,13)]] # rearranging the columns
espn_bb_data['Rank'] = espn_bb_data.index
espn_bb_data

Unnamed: 0,Team,W,L,PCT,GB,HOME,AWAY,DIV,CONF,PPG,OPP PPG,DIFF,STRK,L10,Rank
1,Boston Celtics,53,29,0.646,-,30-11,23-18,11-5,36-16,108.0,105.4,2.7,W3,7-3,1
2,Cleveland Cavaliers,51,31,0.622,2,31-10,20-21,8-8,35-17,110.3,107.2,3.2,L4,4-6,2
3,Toronto Raptors,51,31,0.622,2,28-13,23-18,14-2,34-18,106.9,102.6,4.2,W4,8-2,3
4,Washington Wizards,49,33,0.598,4,30-11,19-22,8-8,32-20,109.2,107.4,1.8,L1,5-5,4
5,Atlanta Hawks,43,39,0.524,10,23-18,20-21,6-10,30-22,103.2,104.0,-0.9,L1,6-4,5
6,Milwaukee Bucks,42,40,0.512,11,23-18,19-22,10-6,27-25,103.6,103.8,-0.2,L1,5-5,6
7,Indiana Pacers,42,40,0.512,11,29-12,13-28,8-8,26-26,105.1,105.3,-0.2,W5,6-4,7
8,Chicago Bulls,41,41,0.5,12,25-16,16-25,9-7,28-24,102.9,102.4,0.4,W2,7-3,8
9,Miami Heat,41,41,0.5,12,23-18,18-23,9-7,27-25,103.2,102.1,1.1,W3,6-4,9
10,Detroit Pistons,37,45,0.451,16,24-17,13-28,5-11,21-31,101.3,102.5,-1.1,L2,3-7,10


In [295]:
[ i. get_text() for i in 
 webcrawler.find_all('tbody', class_='Table2__tbody')[0].find_all('tr')[0].find_all('span', class_='hide-mobile')]

['Boston Celtics']

In [281]:
print(webcrawler.prettify())

<!DOCTYPE doctype html>
<html lang="en">
 <head>
  <!-- FITT|1911/90b3aeb9987509f2ae76c2f7b702db74b3b0cebb|ad4425c2681a|www.espn.com|Sun, 10 Mar 2019 10:07:15 GMT -->
  <title data-react-helmet="true">
   2016-17 NBA Standings | ESPN
  </title>
  <meta content="Visit ESPN to view the 2016-17 NBA Standings" data-react-helmet="true" name="description"/>
  <meta content="NBA, Basketball, Standings, ESPN" data-react-helmet="true" name="keywords"/>
  <meta content="116656161708917" data-react-helmet="true" property="fb:app_id"/>
  <meta content="ESPN" data-react-helmet="true" property="og:site_name"/>
  <meta content="http://www.espn.com/nba/standings/_/season/2017" data-react-helmet="true" property="og:url"/>
  <meta content="2016-17 NBA Standings | ESPN" data-react-helmet="true" property="og:title"/>
  <meta content="Visit ESPN to view the 2016-17 NBA Standings" data-react-helmet="true" property="og:description"/>
  <meta content="https://a.espncdn.com/combiner/i?img=/i/espn/misc_logos/50