# Intro to Plotting with Matplotlib

Plotting data is an essential skill to have when embarking on a Data Science project. It allows us to visually represent our data so that we can make informed decisions about how to perform our analysis, and it gives us the ability to clearly communicate our findings.

While there are a number of Python libraries for plotting (and we will use many of them), Matplotlib is one of the oldest and most commonly used - so let's dive in.

In [None]:
# We'll use numpy to generate data for demonstration purposes
# And matplotlib's  pyplot module to create graphs
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Line Graphs

In [None]:
# Let's generate an array of 20 evenly spaced points covering -pi to pi.
# np.pi is used rather than a hand-typed constant so the range really is
# one full period of sine and cosine.
x = np.linspace(-np.pi, np.pi, 20)

# And two more arrays with the sine and cosine of each point
y = np.sin(x)
z = np.cos(x)

print(x)
print(y)
print(z)

In [None]:
# Drawing a line only takes one call to plt.plot: pass the x-axis values
# first, then the y-axis values. Each curve below is announced by name,
# plotted, and shown before the next one is drawn.
for caption, values in (("sin(x)", y), ("cos(x)", z)):
    print(caption)
    plt.plot(x, values)
    plt.show()

In [None]:
# Alternatively, draw both curves on one plot and tell them apart by label
curves = {"sin(x)": y, "cos(x)": z}
for name, values in curves.items():
    plt.plot(x, values, label=name)

# The legend picks up the label given to each plot call
plt.legend(loc="upper left")

plt.show()

### Scatter Plots

In [None]:
# Draw 100 random x values, followed by 100 random y values
n_points = 100
x = np.random.random(n_points)
y = np.random.random(n_points)

# Peek at the first ten values of each array, one array per line
print(x[:10], y[:10], sep="\n")

In [None]:
# Same idea as the line plots, except the points go through plt.scatter
plt.scatter(x, y)

# Give the chart a title...
plt.title("Scatter Plot")

# ...and name both axes
plt.xlabel("x")
plt.ylabel("y")

plt.show()

### Basic Linear Regression Example

In [None]:
# A comma-separated text file holds brain weight v. body weight for a number of mammals
data_fp = 'data/brain2body.txt'

# np.loadtxt reads the file as rows; transposing turns it into one array
# per column, which we unpack into x and y
columns = np.loadtxt(data_fp, delimiter=',').T
x, y = columns

# Show the first ten values of each column, one column per line
print(x[:10], y[:10], sep="\n")

In [None]:
# Let's use polyfit to calculate a "best-fit" line of regression.
# Fitting a degree-1 polynomial returns two coefficients, highest power
# first: the slope, then the intercept.
slope, intercept = np.polyfit(x, y, 1)

# Evaluate the line at every x value in a single vectorized expression
# (NumPy applies the arithmetic element-wise, so no Python loop is needed)
reg_line = slope * x + intercept

# Scatter our weight values and add some labels
plt.scatter(x, y)
plt.xlabel('Brain Weight')
plt.ylabel('Body Weight')

# Add our regression line in green
plt.plot(x, reg_line, 'g', label='Line of Reg.')

plt.title("Mammals' Brain to Body Weight")
plt.legend(loc='lower right')
plt.show()

### Time Series

In [None]:
# We have a CSV file showing median listing prices for homes in Richmond County
# Let's import a couple new modules to help with time series data
from matplotlib.ticker import Formatter

try:
    # csv2rec only ships with Matplotlib releases older than 3.1
    from matplotlib.mlab import csv2rec
except ImportError:
    def csv2rec(fname):
        """Minimal stand-in for the removed ``matplotlib.mlab.csv2rec``.

        Reads a comma-separated file with a header row into a NumPy record
        array whose columns are reachable as attributes. Header names are
        lower-cased. Text columns whose every value parses as an ISO-8601
        date are converted to ``datetime.date`` objects; other text columns
        are left as strings.

        NOTE(review): this approximates csv2rec rather than replicating it;
        confirm the date column of the CSV is ISO formatted (YYYY-MM-DD).
        """
        table = np.atleast_1d(np.genfromtxt(
            fname, delimiter=',', names=True, dtype=None,
            encoding='utf-8', case_sensitive='lower'))
        columns = []
        for name in table.dtype.names:
            column = table[name]
            if column.dtype.kind == 'U':
                try:
                    # datetime64[D] -> object yields datetime.date values,
                    # which provide the strftime method used for tick labels
                    column = column.astype('datetime64[D]').astype(object)
                except ValueError:
                    pass  # not a date column; keep the raw strings
            columns.append(column)
        return np.rec.fromarrays(columns, names=list(table.dtype.names))

# Load the listing prices into a record array and preview the first rows
records = csv2rec('data/richmond_median_list_prices.csv')
records[:3]

In [None]:
# Since we're dealing with timestamps, we need to specify a formatter object which
# will inform plt how to display the ticks on our chart.
class OurFormatter(Formatter):
    """Tick formatter that labels integer tick positions with dates.

    ``dates`` is an indexable sequence whose items provide ``strftime``;
    ``fmt`` is the strftime pattern applied to build each label.
    """

    def __init__(self, dates, fmt='%Y-%m-%d'):
        self.fmt = fmt
        self.dates = dates

    def __call__(self, x, pos=0):
        """Return the date label for tick value ``x``.

        ``x`` is rounded to the nearest integer index into ``dates``; an
        index outside the sequence yields an empty label.
        """
        index = int(np.round(x))
        if 0 <= index < len(self.dates):
            return self.dates[index].strftime(self.fmt)
        return ''

In [None]:
# Build the tick formatter from the date column of our records
formatter = OurFormatter(records.date)

# Rather than drawing through plt directly, ask the subplots method for a
# Figure object and an Axes object; figsize is passed to get a larger graph
fig, ax = plt.subplots(figsize=(10, 10))

# Have the x-axis label its major ticks through our formatter
ax.xaxis.set_major_formatter(formatter)

# Every series is plotted against the same row index, so compute it once
row_index = np.arange(len(records))
series = (
    (records.value2b, '2 Bedrooms'),
    (records.value3b, '3 Bedrooms'),
    (records.value4b, '4 Bedrooms'),
    (records.value5b, '5 Bedrooms'),
)
for prices, label in series:
    ax.plot(row_index, prices, label=label)

# Show which line is which
ax.legend()

# Finally, call the figure's autofmt_xdate() method and show the plot
fig.autofmt_xdate()

plt.show()