In [83]:
#Histogram - A histogram is a graphical display of data using bars of different heights. 
#In a histogram, each bar groups numbers into ranges. 
#Taller bars show that more data falls in that range.
#A histogram displays the shape and spread of continuous sample data.

#Histograms are column-charts, which each column represents a range of the values, 
#and the height of a column corresponds to how many values are in that range.

%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np

plt.figure()
data= np.random.normal(0,1, 100000) #mean, std deviation, sample size
plt.hist(data)

<IPython.core.display.Javascript object>

(array([1.2000e+01, 2.1600e+02, 2.1520e+03, 1.1386e+04, 2.8051e+04,
        3.3645e+04, 1.8774e+04, 5.0710e+03, 6.5500e+02, 3.8000e+01]),
 array([-4.64608053, -3.75806141, -2.8700423 , -1.98202318, -1.09400407,
        -0.20598495,  0.68203416,  1.57005328,  2.45807239,  3.34609151,
         4.23411063]),
 <a list of 10 Patch objects>)

In [74]:
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2,sharex=True) #rows,columns, 

# draw n = 10, 100, 1000, and 10000 samples from the normal distribution and plot corresponding histograms
ax=[ax1,ax2,ax3,ax4]
for n in range(0,len(ax)):
    sample_size = 10**(n+1)
    sample = np.random.normal(0,1,sample_size)
    ax[n].hist(sample)
    ax[n].set_title('n={}'.format(sample_size))
    
    
    

<IPython.core.display.Javascript object>

In [78]:
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2,sharex=True)
# repeat with number of bins set to 100

#split the data into intervals, called bins

#If you have a small amount of data, use wider bins to eliminate noise. 
#If you have a lot of data, use narrower bins because the histogram will not be that noisy.


ax=[ax1,ax2,ax3,ax4]
for n in range(0,len(ax)):
    sample_size = 10**(n+1)
    sample = np.random.normal(0,1,sample_size)
    ax[n].hist(sample, bins=100)
    ax[n].set_title('n={}'.format(sample_size))

<IPython.core.display.Javascript object>

In [29]:
plt.figure()
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
plt.scatter(X,Y)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x9895350>

In [84]:
# use gridspec to partition the figure into subplots

import matplotlib.gridspec as gridspec

plt.figure()
gspec = gridspec.GridSpec(3, 3)
#  0 1 2
#0(0,0,0)
#1(0,0,0)
#2(0,0,0)

top_histogram = plt.subplot(gspec[0, 1:]) #row, column
side_histogram = plt.subplot(gspec[1:, 0])
lower_right = plt.subplot(gspec[1:, 1:])

<IPython.core.display.Javascript object>

In [87]:
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
lower_right.scatter(X, Y)
top_histogram.hist(X, bins=100)
s = side_histogram.hist(Y, bins=100, orientation='horizontal')

In [88]:
# change axes limits
for ax in [top_histogram, lower_right]:
    ax.set_xlim(0, 1)
for ax in [side_histogram, lower_right]:
    ax.set_ylim(-5, 5)

The box extends from the lower to
upper quartile values of the data, with a line at the median.
The whiskers extend from the box to show the range of the
data.  Flier points are those past the end of the whiskers.

In [92]:
#Box plots -In descriptive statistics,
#a box plot or boxplot is a method for graphically depicting groups of numerical data through their quartiles.
#A quartile is a type of quantile which divides the number of data points into four more or less equal parts, or quarters.
#In statistics and probability "quantiles" are cut points dividing the range of a probability distribution
#into continuous intervals with equal probabilities,
#or dividing the observations in a sample in the same way.

#In probability theory and statistics,
#the gamma distribution is a two-parameter family of continuous probability distributions.

import pandas as pd
normal_sample = np.random.normal(loc=0.0, scale=1.0, size=10000)
random_sample = np.random.random(size=10000)
gamma_sample = np.random.gamma(2, size=10000)

df = pd.DataFrame({'normal': normal_sample, 
                   'random': random_sample, 
                   'gamma': gamma_sample})

In [89]:
df.describe()

Unnamed: 0,normal,random,gamma
count,10000.0,10000.0,10000.0
mean,-0.0029,0.493818,2.011968
std,1.017233,0.288873,1.424202
min,-3.996832,2.9e-05,0.007739
25%,-0.671568,0.242502,0.966017
50%,-0.004001,0.487094,1.683806
75%,0.671922,0.744171,2.685796
max,4.46037,0.99987,12.638036


In [90]:
plt.figure()
# create a boxplot of the normal data, assign the output to a variable to supress output
_ = plt.boxplot(df['normal'], whis='range')

<IPython.core.display.Javascript object>

In [94]:
# clear the current figure
plt.clf()
# plot boxplots for all three of df's columns
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ], whis='range')

In [52]:
plt.figure()
_ = plt.hist(df['gamma'], bins=100)

<IPython.core.display.Javascript object>

Multiply the interquartile range (IQR) by the number 1.5. Add 1.5 x (IQR) to the third quartile. Any number greater than this is a suspected outlier. Subtract 1.5 x (IQR) from the first quartile.

In [53]:
# if `whis` argument isn't passed, boxplot defaults to showing 1.5*interquartile (IQR) whiskers with outliers
plt.figure()
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ] )

<IPython.core.display.Javascript object>

In [64]:
plt.boxplot?

In [95]:
#Heatmaps
plt.figure()

Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
_ = plt.hist2d(X, Y, bins=25)

<IPython.core.display.Javascript object>

In [55]:
plt.figure()
_ = plt.hist2d(X, Y, bins=100)

<IPython.core.display.Javascript object>

In [56]:
# add a colorbar legend
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0xd5dc670>