# ***DATA 3500*** *: Python 4*
- Graphing and Visualizing Data.
    - Plotting simple graphs
    - Plotting multiple graphs on common axes
    - Creating axes within a figure
    - Creating subplots within a figure

In [None]:
# We will need matplotlib, with plots displayed inline in the notebook.

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# We can now create a graph and view plots in our Python notebook.
# `data` holds the integers 0 through 9; only this one sequence is passed to plt.plot.

data = np.arange(10)
plt.plot(data)

In [None]:
# We can visualize changes in world population with a simple line chart.
# Within plt.plot(x, y) the first argument goes on the x-axis and the second on the y-axis.
# Here `year` is the x data and `pop` is the y data.
# NOTE(review): the pop values look like billions of people -- confirm the units.

year = [1950, 1970, 1990, 2010]
pop = [2.489, 3.122, 5.723, 7.124]

plt.plot(year, pop)

In [None]:
# We can also create a scatter plot of the same year and population data.

plt.scatter(year, pop)

In [None]:
# Histograms are another method for visualizing the data.
# They are a great way to visualize the distribution of data.
# Remember you can see the possible options with help(plt.hist).
# Here the 12 values are grouped into 5 bins.

values = [0,0.6,1.4,1.6,2.2,2.5,2.6,3.2,3.5,3.9,4.2,6]
plt.hist(values, bins=5)

In [None]:
# Plots can be customized to better convey our message.
# Begin with a plain plot of the squared observation numbers; customization comes next.

observation = np.arange(50)
growth = np.square(observation)
plt.plot(observation, growth)

In [None]:
# We can start by labeling our axes.
# plt.xlabel and plt.ylabel set the text shown along the x-axis and y-axis.

plt.xlabel('observation')
plt.ylabel('growth')
plt.plot(observation, growth)

In [None]:
# We can include a title as well.
# growth was built as the observation number squared, so the curve is quadratic
# (not exponential) and the title says so.

plt.xlabel('observation')
plt.ylabel('growth')
plt.title('This shows quadratic growth!')
plt.plot(observation, growth)

In [None]:
# We can now choose which tick values appear on each axis.
# Ticks do not have to be evenly spaced: the x-axis ticks are evenly spaced, the y-axis ticks are not.
# growth was built as the observation number squared, so the title calls the curve quadratic.

plt.xlabel('observation')
plt.xticks([0,5,10,15,20,25,30,35,40,45,50])
plt.ylabel('growth')
plt.yticks([0,100,500,1000,1500,2500])
plt.title('This shows quadratic growth!')
plt.plot(observation, growth)

In [None]:
# Maybe we want to change the labels shown at our axis tick values.
# We have a very wealthy person and the y-axis represents the growth of their savings.
# The second list passed to plt.yticks supplies the text shown at each tick position.
# growth was built as the observation number squared, so the title calls the curve quadratic.

plt.xlabel('observation')
plt.xticks([0,5,10,15,20,25,30,35,40,45,50])
plt.ylabel('growth')
plt.yticks([0,100,500,1000,1500,2500],
          ['0 dollars', '100 dollars', '500 dollars', '1,000 dollars', '1,500 dollars', '2,500 dollars'])
plt.title('This shows quadratic growth!')
plt.plot(observation, growth)

In [None]:
# We will focus on a dataframe and two of its columns; first DATE, which represents months.
# The Austin weather CSV is read into the `weather` dataframe.

weather = pd.read_csv('data/austin_weather.csv')
weather['DATE']

In [None]:
# We will also look at the average temperature column, MLY-TAVG-NORMAL.

weather['MLY-TAVG-NORMAL']

In [None]:
# We can very easily specify which columns within a larger dataframe we want to plot.
# DATE is passed as the x data and the average temperature as the y data.

plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'])

In [None]:
# The weather is only measured once per month, but that is not clear from the graph above.
# We add a marker (marker='o') at the location of each actual data reading.

plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], marker='o')

In [None]:
# We can further change the appearance of the connecting lines, here with linestyle='--'.

plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], marker='o', linestyle='--')

In [None]:
# Or even eliminate the lines entirely with linestyle='None', keeping only the markers.

plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], marker='o', linestyle='None')

In [None]:
# We can even choose the color of the line and markers with the color argument ('r' here).

plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], marker='o', linestyle='--', color='r')

In [None]:
# Customization extends beyond the data itself.
# We can also set the title and the axis labels for the weather plot.

plt.title('Average weather in Austin')
plt.xlabel('Time (months)')
plt.ylabel('Temp (avg.)')
plt.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], marker='o', linestyle='--', color='r')

In [None]:
# Load the Seattle weather data and keep only the rows for station USC00456295.
seattle = pd.read_csv('data/seattle_weather.csv')
seattle = seattle[seattle['STATION'].eq('USC00456295')]
seattle

In [None]:
# Several sets of data can be drawn on one plot.
# fig is the figure and ax is the axes object that each data set is plotted onto.
# We use this figure/axes approach for more complicated plots with multiple data inputs.

seattle = pd.read_csv('data/seattle_weather.csv')
seattle = seattle[seattle['STATION'].eq('USC00456295')]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Title and axis labels are set directly on the axes we just created.
ax.set_title('Austin vs. Seattle')
ax.set_xlabel('Time (months)')
ax.set_ylabel('Temp (Avg.)')

ax.plot(weather['DATE'], weather['MLY-TAVG-NORMAL'], color='r', label='Austin')
ax.plot(seattle['DATE'], seattle['MLY-TAVG-NORMAL'], linestyle='--', color='b', label='Seattle')

# The legend uses the label given to each line.
ax.legend(loc='best')

In [None]:
# Let's make a more complicated graph of the Austin precipitation and see if it still makes sense.
# We plot the average plus the 25th and 75th percentile of precipitation in inches.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.set(
    title='Austin Precipitation',
    xlabel='Time (months)',
    ylabel='Precipitation (inches)',
)

# Solid red line for the average, dashed blue lines for the two percentiles.
ax.plot(weather['DATE'], weather['MLY-PRCP-NORMAL'], color='r', label='Austin')
ax.plot(weather['DATE'], weather['MLY-PRCP-25PCTL'], linestyle='--', color='b', label='Austin 25pct')
ax.plot(weather['DATE'], weather['MLY-PRCP-75PCTL'], linestyle='--', color='b', label='Austin 75pct')

ax.legend(loc='best')

In [None]:
# Now let us compare this to Seattle by drawing both cities on the same axes.
# Can you still read the graph?

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_title('Austin v. Seattle Precipitation')
ax.set_xlabel('Time (months)')
ax.set_ylabel('Precipitation (inches)')

# Austin in blue: solid average, dashed percentiles.
ax.plot(weather['DATE'], weather['MLY-PRCP-NORMAL'], color='b', label='Austin')
ax.plot(weather['DATE'], weather['MLY-PRCP-25PCTL'], linestyle='--', color='b', label='Austin 25pct')
ax.plot(weather['DATE'], weather['MLY-PRCP-75PCTL'], linestyle='--', color='b', label='Austin 75pct')

# Seattle in red: solid average, dashed percentiles.
ax.plot(seattle['DATE'], seattle['MLY-PRCP-NORMAL'], color='r', label='Seattle')
ax.plot(seattle['DATE'], seattle['MLY-PRCP-25PCTL'], linestyle='--', color='r', label='Seattle 25pct')
ax.plot(seattle['DATE'], seattle['MLY-PRCP-75PCTL'], linestyle='--', color='r', label='Seattle 75pct')

ax.legend(loc='best')

In [None]:
# Instead of just one plot, we can use multiple small plots to show the data.
# This can sometimes make the data easier to read.
# We can create subplots with fig, ax = plt.subplots(rows, columns)

fig, ax = plt.subplots(nrows=2, ncols=1, sharey=True)  # sharey=True ensures all plots have the same y-axis scale.

# Top panel: Austin.
ax[0].plot(weather['DATE'], weather['MLY-PRCP-NORMAL'], color='b', label='Austin')
ax[0].plot(weather['DATE'], weather['MLY-PRCP-25PCTL'], linestyle='--', color='b', label='Austin 25pct')
ax[0].plot(weather['DATE'], weather['MLY-PRCP-75PCTL'], linestyle='--', color='b', label='Austin 75pct')

# Bottom panel: Seattle.
ax[1].plot(seattle['DATE'], seattle['MLY-PRCP-NORMAL'], color='r', label='Seattle')
ax[1].plot(seattle['DATE'], seattle['MLY-PRCP-25PCTL'], linestyle='--', color='r', label='Seattle 25pct')
ax[1].plot(seattle['DATE'], seattle['MLY-PRCP-75PCTL'], linestyle='--', color='r', label='Seattle 75pct')

# Both panels get the same y-label and the same month ticks and tick labels.
for panel in ax:
    panel.set_ylabel('Precipitation (inches)')
    panel.set_xticks(list(range(1, 13)))
    panel.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

# Only the bottom panel carries the x-axis label.
ax[1].set_xlabel('Time (month)')

ax[0].set_title('Austin Rainfall')
ax[1].set_title('Seattle Rainfall')

# Extra vertical space between the two panels.
plt.subplots_adjust(hspace=.5)

### **Try problem #1**

In [None]:
# We will now work with a pollution dataset.
# sample(10) shows 10 randomly chosen rows of it.

pollution = pd.read_csv('data/pollution_wide.csv')
pollution.sample(10)

In [None]:
# This is a large dataset that we cannot possibly understand from the numerical values alone.
# We also cannot just plot all 9000 points and understand the data.
# info() summarizes the columns of the dataframe.

pollution.info()

In [None]:
# Let us highlight specific data to draw attention to it.
# We use seaborn for this set of plots. It is similar to matplotlib, only with more functionality.

import seaborn as sns

cinci_pollution = pollution.loc[pollution['city'] == 'Cincinnati']

# One color per row: day 38 is drawn in orangered, every other day in steelblue.
cinci_colors = ['steelblue' if day != 38 else 'orangered' for day in cinci_pollution['day']]

# Scatter NO2 against SO2 with the per-row colors and no regression line.
p = sns.regplot(
    data=cinci_pollution,
    x='NO2',
    y='SO2',
    fit_reg=False,
    scatter_kws=dict(facecolors=cinci_colors, alpha=0.3),
)

In [None]:
# How can we compare two classes in more detail than what we have done above?
# Let us examine the beeswarm plot for pollution during the month of November.
# November is month 11 (another cell in this notebook filters December as month == 12).

pollution_nov = pollution[pollution['month'] == 11]
sns.swarmplot(y='city', x='O3', data=pollution_nov)
plt.xlabel('Ozone')

In [None]:
# Text can be placed on a plot to draw attention to a particular point.

houston_pollution = pollution.loc[pollution['city'] == 'Houston']

sns.scatterplot(data=houston_pollution, x='NO2', y='SO2')

# The label is placed at data coordinates (13, 33).
plt.text(
    13,
    33,
    'Look at this outlier',
    fontdict=dict(ha='left', size='x-large'),
)

In [None]:
# An arrow annotation can point at a data point in the middle of the cloud.

sns.scatterplot(data=houston_pollution, x='NO2', y='SO2')

# xy is the point the arrow points to; xytext is where the label text sits.
plt.annotate(
    'A buried point to look at',
    xy=(45.5, 11.8),
    xytext=(60, 22),
    backgroundcolor='white',
    arrowprops=dict(facecolor='grey', width=3),
)

In [None]:
# Color can make a visualization easier to understand.
# NOTE: despite its name, indy_oct holds every 2015 Indianapolis row; no month filter is applied.

indy_oct = pollution.query("year == 2015 & city == 'Indianapolis'")
blue_scale = sns.light_palette(color='steelblue', as_cmap=True)
sns.heatmap(data=indy_oct[['O3']], cmap=blue_scale)

In [None]:
# Color can improve or hurt readability... be careful!
# The same O3 column is drawn again, this time with the 'Paired' palette.

indy_oct = pollution.query("year == 2015 & city == 'Indianapolis'")
jet_scale = sns.color_palette(palette='Paired')
sns.heatmap(data=indy_oct[['O3']], cmap=jet_scale)

In [None]:
# Plotting categorical values is different from plotting continuous data.
# Be creative when you have many categorical values. Too many can become hard to view.
# unique() lists the distinct values in the city column.

pollution['city'].unique()

In [None]:
# Keep Denver and Houston as their own categories and group every other city under 'other',
# then color the December 2014 scatter plot by that new column.
pollution['interesting_city'] = pollution['city'].where(
    pollution['city'].isin(['Denver', 'Houston']),
    other='other',
)
sns.scatterplot(
    data=pollution.query('year == 2014 & month == 12'),
    x='NO2',
    y='SO2',
    hue='interesting_city',
    palette='Set2',
)


In [None]:
# Seaborn can assign the correct number of colors to our data for us.
# NO2 is split into three quantile bins, stored as integer codes because labels=False.

pollution['NO2 Tercile'] = pd.qcut(pollution['NO2'], q=3, labels=False)
sns.scatterplot(
    data=pollution.query("city == 'Long Beach' & year == 2014"),
    x='CO',
    y='SO2',
    hue='NO2 Tercile',
    palette='Paired',
)

### **Try problem #2**

### **Try problem #3**

In [None]:
# Additional categorical visualization techniques are available.
# Stripplots show all data points, but that can be difficult to read with many observations.
# The pollution CSV is read again here into its own dataframe, `df`.

df = pd.read_csv('data/pollution_wide.csv')
sns.stripplot(data=df, y='city', x = 'CO', jitter=True)

In [None]:
# A swarmplot shows all data points again, but in a more sophisticated view.
# You now have no overlapping observations. This does not scale well to large datasets.

sns.swarmplot(data=df, y='city', x='CO')

In [None]:
# A boxplot represents the data in an abstract (summarized) view.
# It displays the median, upper quartile, lower quartile, and possible outliers.

sns.boxplot(data=df, y='city', x='CO')

In [None]:
# Violin plots convey similar information to the boxplot, but present it in another format.

sns.violinplot(data=df, y='city', x='CO')

In [None]:
# Finally, the boxen plot is a hybrid of the box plot and the violin plot.

sns.boxenplot(data=df, y='city', x='CO')

In [None]:
# We will now work through the pollution dataset again, this time loaded as `pol`.
# sample(5) shows 5 randomly chosen rows.

pol = pd.read_csv('data/pollution_wide.csv')
pol.sample(5)

In [None]:
# Start with the numeric pollutant columns and look at how they relate to one another.
# scatter_matrix draws a grid of pairwise plots for the selected columns.

numeric_columns = ['CO', 'NO2', 'O3', 'SO2']
pd.plotting.scatter_matrix(
    pol.loc[:, numeric_columns],
    alpha=0.5,
    figsize=(15, 10),
);

In [None]:
# We will now dig deeper into these relationships.
# We can use a linear regression line to observe the relationship between CO and NO2.
# x and y are passed by keyword: seaborn 0.12 and later no longer accept them positionally.

sns.regplot(x='CO', y='NO2', data=pol, scatter_kws={'alpha':0.2, 'color':'grey'})

In [None]:
# We can be more efficient by combining plots into one output.
# Each seaborn call is told which axes to draw on through its ax argument.
# x, y and hue are passed by keyword: seaborn 0.12 and later no longer accept them positionally.

fig, (ax1, ax2) = plt.subplots(1,2) # one row, two columns
sns.lineplot(x='month', y='NO2', hue='year', ax=ax1, data=pol, palette='RdBu')
sns.barplot(x='month', y='CO', hue='year', ax=ax2, data=pol, palette='RdBu')

In [None]:
# Load the housing data from train.csv and look at a random sample of 15 rows.
house = pd.read_csv('data/train.csv')
house.sample(15)

In [None]:
# Keep only the numeric columns of the housing data and list their names.
numeric_features = house.select_dtypes(include='number')
numeric_features.columns

In [None]:
# Keep only the object-dtype (text/categorical) columns of the housing data and list their names.
# The dtype is given as the string 'object' because the np.object alias was removed in NumPy 1.24.
categorical_features = house.select_dtypes(include=['object'])
categorical_features.columns

In [None]:
# Correlation Matrix
# Heatmap of the pairwise correlations between the numeric housing features.

correlation = numeric_features.corr()
f, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(data=correlation, vmax=0.8, square=True, ax=ax)

In [None]:
# Look at the first few values of the OverallQual column.
house['OverallQual'].head()

In [None]:
# Which variables are related to sale price?
# Box plot of SalePrice for each OverallQual value, with the y-axis fixed to 0-800000.

var = 'OverallQual'
data = house.loc[:, ['SalePrice', var]]
f, ax = plt.subplots(figsize=(12, 8))
# seaborn returns the axes it drew on; the notebook stores it under the name `fig`.
fig = sns.boxplot(data=data, x=var, y='SalePrice')
fig.axis(ymin=0, ymax=800000);

In [None]:
# Box plot of SalePrice for each Neighborhood, with the y-axis fixed to 0-800000.
var = 'Neighborhood'
data = house.loc[:, ['SalePrice', var]]
f, ax = plt.subplots(figsize=(16, 10))
fig = sns.boxplot(data=data, x=var, y="SalePrice")
fig.axis(ymin=0, ymax=800000);
# Rotate the x tick labels by 45 degrees.
xt = plt.xticks(rotation=45)

In [None]:
# Preview the first rows of the housing dataframe.
house.head()

In [None]:
# Plotting with subgroups (i.e. groupby)
# We will plot features with missing values vs. Sales Price for greater insights.
# We first create a list of features that contain missing values.
# A feature qualifies as soon as it has at least one missing value.

features_with_na = [feature for feature in house.columns if house[feature].isnull().any()]
features_with_na

In [None]:
# Build a 1/0 flag marking rows where LotFrontage is missing, for use in the visualization.
# Then plot the median SalePrice for each value of that flag as a bar chart.

house['LotFrontage_zero'] = house['LotFrontage'].isnull().astype(int)
(
    house.groupby('LotFrontage_zero')['SalePrice']
    .median()
    .plot.bar()
)

In [None]:
# The same comparison can be drawn for every feature with missing values using a for loop.

for feature in features_with_na:
    # Work on a copy so the original column is not overwritten by the 1/0 missing flag.
    dataset = house.copy()
    dataset[feature] = dataset[feature].isnull().astype(int)
    (
        dataset.groupby(feature)['SalePrice']
        .median()
        .plot.bar()
    )
    plt.title(feature)
    plt.show()

In [None]:
# We can examine the relationship between categorical variables and Sales Price too.
# We will again use a groupby statement: one bar chart of median SalePrice per categorical column.
# house is only read here, so it is grouped directly instead of being copied on every iteration.

for feature in categorical_features.columns:
    house.groupby(feature)['SalePrice'].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [None]:
# Build a smaller dataframe with a handful of housing columns and preview it.
houseMini = house.loc[:, [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'YearBuilt', 'KitchenQual', 'Alley', 'Neighborhood', 'SalePrice',
]]
houseMini.head()

### **Try problem #4**