# Exploratory Data Analysis
Demo from lecture 2 of [ORIE 4741](https://people.orie.cornell.edu/mru8/orie4741)

In [None]:
using DataFrames, Gadfly

# Load data

The data is available on the course website, and is about 200MB.

If you point your browser to the URL below, the file will be downloaded to your default download folder. On a Mac, you can then find it at `/Users/yourusername/Downloads/acs_2013.csv`.
Or, if you've synced or cloned the whole demos GitHub repo, you can use the truncated version `acs_2013_trunc.csv` locally without downloading anything. (Try the `readtable` command below to test it out.)

In [None]:
# Location of the full dataset on the course website. No cell shown in this
# notebook reads `url`; it is here so it can be pasted into a browser.
url = "https://people.orie.cornell.edu/mru8/orie4741/data/acs_2013.csv"

In [None]:
# Read the truncated local copy into a table, limited to 10000 rows by `nrows`.
# The trailing `;` keeps the notebook from displaying the whole table.
acs = readtable("acs_2013_trunc.csv", nrows=10000);

# Summary statistics

In [None]:
# Dimensions of the loaded table: (number of rows, number of columns).
size(acs)

In [None]:
# Pull out the HHINCOME column on its own; the following cells summarize it.
income = acs[:HHINCOME];

In [None]:
# Median of the HHINCOME column exactly as loaded (no cleaning applied).
median(income)

In [None]:
# Standard deviation of the HHINCOME column exactly as loaded.
std(income)

In [None]:
# Largest value in the HHINCOME column as loaded.
# NOTE(review): presumably a placeholder code rather than a real income, given
# the `< 4e6` filter applied further down — confirm against the codebook.
maximum(income)

In [None]:
# Plotting

In [None]:
"""
    summary_stats(arr::DataArray)

Print summary statistics for the observed (non-NA) values of `arr`.

The first line reports the share of entries of `arr` that are NA, expressed as
a percentage of all entries. The remaining lines report the maximum, minimum,
median, mean, and standard deviation of `arr` after NA entries are dropped.

Everything is written to standard output; the function returns `nothing`.
"""
function summary_stats(arr::DataArray)
    clean_arr = dropna(arr)
    # The mean of the NA indicator is a fraction in [0, 1]; scale it by 100 so
    # the number agrees with the "%" sign printed right after it.
    pct_missing = 100 * mean(isna(arr))
    print("missing: $(pct_missing)%\n")
    print("maximum: $(maximum(clean_arr))\n")
    print("minimum: $(minimum(clean_arr))\n")
    print("median:  $(median(clean_arr))\n")
    print("mean:    $(mean(clean_arr))\n")
    print("std:     $(std(clean_arr))\n")
    return nothing
end

In [None]:
?summary_stats

In [None]:
# Print the missing share and basic statistics for the HHINCOME column as loaded.
summary_stats(income)

In [None]:
# Histogram of HHINCOME over every row of `acs`.
plot(acs, x=:HHINCOME, Geom.histogram)

In [None]:
# Boolean mask over rows: true where HHINCOME is below 4e6.
# NOTE(review): the name suggests this keeps roughly 99% of the rows, and the
# cutoff presumably excludes placeholder codes — confirm against the data.
rows_with_99percent_income = acs[:HHINCOME].<4e6
# Keep only the rows selected by the mask, with all columns.
acs99 = acs[rows_with_99percent_income,:];

In [None]:
# Histogram of HHINCOME for the filtered table `acs99`.
plot(acs99, x=:HHINCOME, Geom.histogram)

In [None]:
# Histogram of HHINCOME in `acs99`, with bars colored by FOODSTMP and stacked.
plot(acs99, x=:HHINCOME, color=:FOODSTMP, Geom.histogram(position=:stack))

# Now with recoded data

In [None]:
# Rebind `acs` to the recoded file (again limited to 10000 rows by `nrows`).
# This replaces the table loaded earlier; `acs99` still refers to the old data.
acs = readtable("acs_2013_cleaned.csv", nrows=10000);

In [None]:
# Print the missing share and basic statistics for HHINCOME in the recoded table.
summary_stats(acs[:HHINCOME])

In [None]:
# Histogram of HHINCOME over every row of the recoded table.
plot(acs, x=:HHINCOME, Geom.histogram)

In [None]:
import DataArrays.dropna
"""
    dropna(df::DataFrame, s::Symbol)

Return the rows of `df` whose entry in the column named `s` is not NA.
All columns are kept.
"""
function dropna(df::DataFrame, s::Symbol)
    # Build the mask from the argument `df` itself (not from a global table),
    # so the mask always has one entry per row of the frame being filtered.
    return df[!isna(df[:,s]),:]
end
"""
    dropna(df::DataFrame, ss::Symbol...)

Return the rows of `df` that have no NA in any of the columns named in `ss`.
All columns are kept; with no column names given, every row is kept.
"""
function dropna(df::DataFrame, ss::Symbol...)
    nrows = size(df, 1)
    # Start by keeping every row, then knock out rows column by column.
    mask = fill(true, nrows)
    for col in ss
        not_na = !isna(df[:, col])
        mask = mask & not_na
    end
    return df[mask, :]
end

Look at income distribution by :FOODSTMP, :LABFORCE, :OWNERSHP, ...

In [None]:
# Histogram of HHINCOME colored by LABFORCE, using only rows where LABFORCE is not NA.
plot(dropna(acs, :LABFORCE), x=:HHINCOME, color=:LABFORCE, Geom.histogram)

In [None]:
# Box plot of HHINCOME grouped by FOODSTMP, drawn from the first 300 rows that
# have both columns present.
plot(dropna(acs, :FOODSTMP, :HHINCOME)[1:300,:], y=:HHINCOME, x=:FOODSTMP, Geom.boxplot)

In [None]:
# Scatter plot of HHINCOME against EDUC, drawn from the first 800 rows that
# have both columns present.
plot(dropna(acs, :EDUC, :HHINCOME)[1:800,:], y=:HHINCOME, x=:EDUC, Geom.point)

In [None]:
# Two-dimensional histogram of the same 800 rows (EDUC on x, HHINCOME on y),
# with the y bin count set to 20 via `ybincount`.
plot(dropna(acs, :EDUC, :HHINCOME)[1:800,:], y=:HHINCOME, x=:EDUC, Geom.histogram2d(ybincount=20))

# More interesting plotting tools:

* [Gadfly documentation](http://dcjones.github.io/Gadfly.jl)
* [ggplot2](http://ggplot2.org/) in R
* [Matplotlib](http://matplotlib.org/) in Python or Julia
* [D3](https://d3js.org/) in JavaScript