In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from patsy import dmatrices
import matplotlib.pyplot as plt

In [None]:
!pip install xlrd

In [None]:
# Folder that holds the EPA time-series workbooks, located under the user's
# home directory.
home_dir = os.path.expanduser("~")
DATADIR = os.path.join(home_dir, "DATA", "TimeSeries", "EPA")

# Displays True when that folder is present on this machine.
os.path.exists(DATADIR)

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and every machine.
files = sorted(os.listdir(DATADIR))
files

## Read the air quality data

In [None]:
# Load the Salt Lake PM25 workbook, list its column labels, and preview the
# first rows.
slc_path = os.path.join(DATADIR, 'Salt_Lake_2016_PM25.xlsx')
slc = pd.read_excel(slc_path)
print(slc.columns)
slc.head()

In [None]:
# Take the first row's local date and local time to see what types the two
# separate columns were loaded as.
d1 = slc["Date Local"][0]
t1 = slc["Time Local"][0]
print("Date Local type",type(d1),"Time Local type",type(t1))
# Timestamp.to_datetime() has been removed from pandas; to_pydatetime() is the
# supported way to obtain a plain datetime.datetime to hand to combine().
datetime.datetime.combine(d1.to_pydatetime(), t1)


### Comments:

* Dates and times are split into separate columns
* We have both local time and GMT (UTC) time

## Merging Dates and Times
* After we have read in the data we could combine the dates and times
    * Convert pandas Timestamp to datetime
    * Use ``datetime.combine`` to merge with ``datetime.time``
* We can also merge these when reading the data by specifying the ``parse_dates`` keyword

#### Using ``datetime.combine``

In [None]:
def _combine_local(row):
    """Merge one row's "Date Local" and "Time Local" values into a single datetime."""
    return datetime.datetime.combine(row["Date Local"], row["Time Local"])


# Apply the merge row by row and preview the first results.
slc.apply(_combine_local, axis=1).head()

#### Using ``parse_dates``

In [None]:
# Re-read the Salt Lake workbook, asking read_excel to merge each listed pair
# of columns into one parsed column, and preview the result.
# NOTE(review): the first pair lists the time column before the date column
# while the second lists date before time — confirm this ordering is intended.
# NOTE(review): the nested-list form of parse_dates is deprecated in recent
# pandas releases — confirm against the installed version.
date_time_pairs = [["Time Local", "Date Local"],
                   ['Date GMT', 'Time GMT']]
pd.read_excel(
    os.path.join(DATADIR, 'Salt_Lake_2016_PM25.xlsx'),
    parse_dates=date_time_pairs,
).head()


In [None]:
# Load the San Diego PM25 workbook, merging each listed pair of date/time
# columns while reading.
# NOTE(review): the first pair lists the time column before the date column
# while the second lists date before time — confirm this ordering is intended.
# NOTE(review): the nested-list form of parse_dates is deprecated in recent
# pandas releases — confirm against the installed version.
sd_path = os.path.join(DATADIR, "San_Diego_2016_PM25.xlsx")
sd_date_time_pairs = [["Time Local", "Date Local"],
                      ['Date GMT', 'Time GMT']]
sd = pd.read_excel(sd_path, parse_dates=sd_date_time_pairs)
sd.head()

## Read in asthma data


In [None]:
# Load the Salt Lake asthma workbook, report the type the first "Day" value
# was loaded as, and preview the first rows.
asthma_path = os.path.join(DATADIR, 'Asthma_SLC.xlsx')
slc_asthma = pd.read_excel(asthma_path)
first_day = slc_asthma["Day"][0]
print(type(first_day))
slc_asthma.head()

## Read in weather data

In [None]:
# First pass: read the weather workbook with default settings to see what the
# sheet looks like as loaded.
weather_path = os.path.join(DATADIR, 'SLC_Weather_2016.xlsx')
slc_weather = pd.read_excel(weather_path)
slc_weather.head()

In [None]:
# Second pass: read the same weather workbook again, this time skipping sheet
# row 1 (0-indexed), and replace the earlier frame with the result.
slc_weather = pd.read_excel(
    os.path.join(DATADIR, 'SLC_Weather_2016.xlsx'),
    skiprows=[1],
)
slc_weather.head()

In [None]:
# Show the column labels of the weather table.
slc_weather.columns

In [None]:
# Show the "Day" value stored in the row labelled 0 of the weather table.
slc_weather.loc[0, 'Day']

## Our weather and asthma data have a resolution of days
## Our pollutant data has a resolution of hours
## Group by day and take the mean?

In [None]:
# Daily mean of every numeric column. Calling .mean(numeric_only=True) directly
# avoids handing np.mean to aggregate() as a callable (deprecated in recent
# pandas) and skips the non-numeric columns, which cannot be averaged and would
# otherwise make the aggregation fail.
slc.groupby("Date Local", as_index=False).mean(numeric_only=True)

### Group and take sum?

In [None]:
# Daily sum of every numeric column. Calling .sum(numeric_only=True) directly
# avoids handing np.sum to aggregate() as a callable (deprecated in recent
# pandas) and keeps non-numeric columns out of the summation.
slc.groupby("Date Local", as_index=False).sum(numeric_only=True)

### Applying different functions to different columns

In [None]:
# Collapse the measurements to one row per (day, site): mean of the sample
# measurement and median of the MDL column. The aggregations are given by name
# because passing numpy callables such as np.mean to aggregate() is deprecated
# in recent pandas; the string names select the same groupby reductions.
slc_day = slc.groupby(["Date Local", "Site Num"], as_index=False).aggregate(
    {'Sample Measurement': 'mean', 'MDL': 'median'}
)
slc_day.head()

In [None]:
# Attach the asthma rows, then the weather rows, to the daily pollutant table,
# matching "Date Local" on the left with "Day" on the right each time.
slc_with_asthma = slc_day.merge(slc_asthma, left_on="Date Local", right_on="Day")
slc_day_all = slc_with_asthma.merge(slc_weather, left_on="Date Local", right_on="Day")
slc_day_all.head()

In [None]:
# Site 3006 only: asthma diagnoses on the primary y-axis and the sample
# measurement on a secondary y-axis, both against the local date.
site_3006 = slc_day_all[slc_day_all["Site Num"]==3006]
f, ax1 = plt.subplots(1)
site_3006.plot(x="Date Local", y="Number of Asthma Diagnosis", ax=ax1)
site_3006.plot(x="Date Local", y="Sample Measurement", secondary_y=True, ax=ax1)

In [None]:
# Every site except 3006: asthma diagnoses on the primary y-axis and the sample
# measurement on a secondary y-axis, both against the local date.
other_sites = slc_day_all[slc_day_all["Site Num"]!=3006]
f, ax1 = plt.subplots(1)
other_sites.plot(x="Date Local", y="Number of Asthma Diagnosis", ax=ax1)
other_sites.plot(x="Date Local", y="Sample Measurement", secondary_y=True, ax=ax1)

In [None]:
# Scatter of asthma diagnoses against the sample measurement for site 3006.
is_site_3006 = slc_day_all["Site Num"] == 3006
slc_day_all.loc[is_site_3006].plot.scatter(
    x="Sample Measurement",
    y="Number of Asthma Diagnosis",
)

### Rename columns to eliminate spaces

In [None]:
# Preview the old-label -> new-label mapping, with every space turned into an
# underscore.
dict((label, label.replace(" ", "_")) for label in slc_day_all.columns)

In [None]:
# Copy of the merged table whose column labels have underscores in place of
# spaces.
slc_final = slc_day_all.rename(columns=lambda label: label.replace(" ", "_"))

## Creating a Regression Model

In [None]:
# Seaborn regression plot of asthma diagnoses against the sample measurement.
sns.regplot(
    x="Sample_Measurement",
    y="Number_of_Asthma_Diagnosis",
    data=slc_final,
)

In [None]:
# Build the response (y) and design (X) frames from a patsy formula: asthma
# diagnoses explained by the sample measurement and the "High" column.
model_formula = 'Number_of_Asthma_Diagnosis ~ Sample_Measurement + High'
y, X = dmatrices(model_formula, data=slc_final, return_type='dataframe')

In [None]:
# Fit an ordinary least squares model on the matrices built above and print
# the fit summary.
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())

In [None]:
# Run statsmodels' linear_rainbow diagnostic on the fitted OLS results; the
# cell displays whatever the call returns.
sm.stats.linear_rainbow(res)

In [None]:
# Partial regression plot: asthma diagnoses against the sample measurement,
# with the "High" and "Low" columns passed as the other regressors.
response_col = 'Number_of_Asthma_Diagnosis'
focus_col = 'Sample_Measurement'
other_cols = ['High', "Low"]
sm.graphics.plot_partregress(response_col, focus_col, other_cols,
                             data=slc_final, obs_labels=False)