In [1]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default plot styling, then select the 'talk' plotting
# context for element sizing.
sns.set()
sns.set_context('talk')
# Keep printed arrays and frames short: summarise long arrays and show two
# decimal places without scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name. The bare 'precision' pattern is
# ambiguous in pandas versions that also define 'styler.format.precision',
# and set_option raises OptionError when a pattern matches several options.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    df: the DataFrame to page through
    nrows: number of rows shown at once (also the row slider's step)
    ncols: number of columns shown at once (also the column slider's step)

    A slider is only created for an axis that has more entries than fit in
    one window; otherwise that offset is fixed at 0. Prints the total shape
    of df after setting up the widgets.
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    # Slider bounds are inclusive, so stop at the last valid position.
    # Using len(df) itself allowed a final stop past the data, which
    # rendered an empty frame.
    row_arg = (0, len(df) - 1, nrows) if len(df) > nrows else fixed(0)
    col_arg = ((0, len(df.columns) - 1, ncols)
               if len(df.columns) > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary limits on the number of rows and columns

    The defaults are the pandas display limits in effect when this function
    is defined. The limits are only applied for the duration of the call.
    '''
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

# Section 4

## Merging

In [3]:
# Load the two source tables; both paths are relative to the notebook's
# working directory.
df_steps = pd.read_csv('data/age_steps.csv')
df_income = pd.read_json('data/incomes.json')

In [4]:
df_steps.shape

(11850, 3)

In [5]:
df_income.shape

(13332, 4)

In [6]:
# pd.merge defaults to an inner join, so only ids present in both tables
# are kept.
df = pd.merge(df_income, df_steps, on='id')
df

Unnamed: 0,first_name,id,income,last_name,age,steps
0,Brian,84764,99807.16,Wolfe,41,8622
1,George,49337,0.00,Keith,31,9870
2,Catherine,41693,0.00,Glass,37,-1
...,...,...,...,...,...,...
10661,Ian,68473,7617.27,Meyer,40,7895
10662,Carl,60486,34479.99,Russell,49,6004
10663,Curtis,13915,12133.79,Johnson,39,7180


In [7]:
# Keep the rows whose id is strictly between 100 and 140, ordered by id.
small_steps = df_steps.query('100 < id < 140').sort_values('id')
small_steps

Unnamed: 0,id,age,steps
9412,101,34,7255
8121,102,47,6239
9993,103,37,8342
5575,115,35,7202
46,124,45,7145
5908,127,34,9194
8025,134,47,8145


In [8]:
# Keep the rows whose id is strictly between 100 and 130, ordered by id.
# NOTE(review): the upper bound here is 130 while small_steps uses 140 --
# presumably deliberate so the two samples cover different ids; confirm.
small_incomes = df_income.query('100 < id < 130').sort_values('id')
small_incomes

Unnamed: 0,first_name,id,income,last_name
11132,Wendy,101,0.0,Barnes
8953,Ryan,102,11761.22,Compton
2107,John,106,7734.47,Harrison
9425,Julie,115,24894.4,Smith
2671,Gregory,124,11292.59,Erickson
10311,Karen,127,93539.72,Henson


In [10]:
# how='outer' keeps ids found in either table; columns coming from the table
# that lacks a given id are filled with NaN.
pd.merge(small_steps, small_incomes, on='id', how='outer')

Unnamed: 0,id,age,steps,first_name,income,last_name
0,101,34.0,7255.0,Wendy,0.00,Barnes
1,102,47.0,6239.0,Ryan,11761.22,Compton
2,103,37.0,8342.0,,,
...,...,...,...,...,...,...
5,127,34.0,9194.0,Karen,93539.72,Henson
6,134,47.0,8145.0,,,
7,106,,,John,7734.47,Harrison


## Mean with missing values

In [11]:
small_steps

Unnamed: 0,id,age,steps
9412,101,34,7255
8121,102,47,6239
9993,103,37,8342
5575,115,35,7202
46,124,45,7145
5908,127,34,9194
8025,134,47,8145


In [12]:
small_steps['steps'].mean()

7646.0

In [13]:
# Series.mean() skips NaN by default, so only 10, 20, 10 and 20 are averaged.
(small_steps
 .assign(steps=[10, np.nan, 20, np.nan, 10, np.nan, 20])
 ['steps']
 .mean()
)

15.0

## Not missing at random

In [24]:
small_incomes

Unnamed: 0,first_name,id,income,last_name
11132,Wendy,101,0.0,Barnes
8953,Ryan,102,11761.22,Compton
2107,John,106,7734.47,Harrison
9425,Julie,115,24894.4,Smith
2671,Gregory,124,11292.59,Erickson
10311,Karen,127,93539.72,Henson


In [25]:
small_incomes['income'].mean()

24870.399999999998

In [62]:
# Fixed seed so the same entries are blanked out on every run of the cell.
NA_SEED = 42
incomes = small_incomes['income'].reset_index(drop=True)
# One 0/1 draw per income; entries that drew a 1 are replaced with NaN.
to_na = np.random.RandomState(NA_SEED).choice(2, size=len(incomes))
incomes[to_na == 1] = np.nan
incomes

0         NaN
1    11761.22
2     7734.47
3    24894.40
4         NaN
5    93539.72
Name: income, dtype: float64

## Predictions

In [14]:
# Keep only complete rows for people aged 18 or older whose step count is
# not -1. (-1 shows up as a steps value in the merged table; presumably it
# marks a missing reading -- confirm against the data source.)
df = df.dropna().query('steps != -1 and age >= 18')
df

Unnamed: 0,first_name,id,income,last_name,age,steps
0,Brian,84764,99807.16,Wolfe,41,8622
1,George,49337,0.00,Keith,31,9870
3,Bob,98170,18077.78,Perez,34,6987
...,...,...,...,...,...,...
10661,Ian,68473,7617.27,Meyer,40,7895
10662,Carl,60486,34479.99,Russell,49,6004
10663,Curtis,13915,12133.79,Johnson,39,7180


In [15]:
# Degree-1 least-squares fit of income on age. np.polyfit returns the
# coefficients highest power first, so a1 is the slope and b1 the intercept.
a1, b1 = np.polyfit(df['age'], df['income'], 1)
a1, b1

(1074.1595347586629, -18039.20184282822)

In [16]:
# Evaluate the fitted line at a single age.
age = 75

pred_75 = a1 * age + b1
pred_75

62522.7632640715

In [19]:
# Evaluate the fitted line for every integer age from 18 through 80
# (np.arange excludes the stop value, hence 81).
age = np.arange(18, 81)
a1 * age + b1

array([ 1295.67,  2369.83,  3443.99, ..., 65745.24, 66819.4 , 67893.56])

In [17]:
# Loop version of the prediction for every integer age from 18 through 80.
# The stop value is 81 so that age 80 is included, matching the np.arange
# based cells; the loop variable is not called `age` so it does not
# overwrite the `age` value defined in other cells.
pred_age = []
for year in range(18, 81):
    pred_age.append(a1 * year + b1)
pred_age[:5]

[1295.6697828277102,
 2369.829317586373,
 3443.9888523450354,
 4518.148387103698,
 5592.307921862361]

In [160]:
# Vectorised prediction: one array expression covers ages 18 through 80.
age = np.arange(18, 81)

pred_age = a1 * age + b1
pred_age

array([ 1295.67,  2369.83,  3443.99, ..., 65745.24, 66819.4 , 67893.56])

In [158]:
# np.append returns a new array; `age` itself is left unchanged.
np.append(age, 2)

array([18, 19, 20, ..., 79, 80,  2])

In [159]:
age

array([18, 19, 20, ..., 78, 79, 80])

In [None]:

# Scatter the cleaned data and draw the fitted line over the same age range
# used for the predictions (18 through 80). The original placeholder calls
# (plt.scatter(...), plt.title()) raise TypeError when executed.
ages = np.arange(18, 81)
fig, ax = plt.subplots()
ax.scatter(df['age'], df['income'], s=5, alpha=0.3)
ax.plot(ages, a1 * ages + b1, color='red')
ax.set_title('Income vs. age with fitted line')
ax.set_xlabel('age')
ax.set_ylabel('income')
# NOTE(review): limits follow the prediction range; widen them if the data
# contains ages above 80.
ax.set_xlim(ages.min(), ages.max());






In [None]:
# Called without arguments, plt.ylim() only reports the current axes'
# y-limits; pass (bottom, top) to change them.
plt.ylim()

In [153]:
%%timeit

# Here's a slow solution:
# The whole cell body is timed, including the final slice expression.

pred_age = []
for age in range(18, 81):
    pred_age.append(a1 * age + b1)
pred_age[:5]

31.9 µs ± 2.75 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [154]:
%%timeit

# Here's a nicer solution:
# The line is evaluated for all ages in one array expression instead of a
# Python-level loop.

pred_age = a1 * np.arange(18, 81) + b1
pred_age

4.83 µs ± 567 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## License Plates

In [26]:
def set_cols(df):
    '''
    Returns df with its three columns renamed to 'plate', 'time' and
    'location'

    The rename is done on a shallow copy, so the caller's frame keeps its
    original column labels (the data itself is not duplicated). Raises
    ValueError if df does not have exactly three columns.
    '''
    renamed = df.copy(deep=False)
    renamed.columns = ['plate', 'time', 'location']
    return renamed

def clean_location(df):
    '''
    Returns a copy of df in which the 'location' column is replaced by
    float 'lat' and 'lon' columns

    Each location string has its surrounding parentheses stripped and is
    split on ', '; the first piece becomes lat and the second lon.
    '''
    pieces = df['location'].str.strip('()').str.split(', ', expand=True)
    coords = pieces.rename(columns={0: 'lat', 1: 'lon'})
    lat = coords['lat'].astype(float)
    lon = coords['lon'].astype(float)
    return df.drop(columns=['location']).assign(lat=lat, lon=lon)

def convert_times(df):
    '''
    Returns a copy of df whose 'time' column is parsed into datetimes

    Timestamps are read as month/day/year with a 12-hour clock and an
    AM/PM marker.
    '''
    stamp_format = '%m/%d/%Y %I:%M:%S %p'
    parsed = pd.to_datetime(df['time'], format=stamp_format)
    return df.assign(time=parsed)

def colorize(df):
    '''
    Returns a copy of df with a 'color' column derived from df['time']

    'green' for Saturday and Sunday, 'blue' for weekday times from 06:00
    through 17:59, and 'red' for every other weekday time.
    '''
    def color(t):
        # weekday() numbers Monday as 0, so Saturday is 5 and Sunday is 6.
        # Comparing against 6 treated Saturday as a weekday.
        if t.weekday() >= 5:
            return 'green' # Weekend
        if 6 <= t.hour <= 17:
            return 'blue' # Weekday daytime
        return 'red' # Weekday evening or early morning
    return df.assign(color=df['time'].apply(color))

In [27]:
# Each .pipe step receives the frame returned by the previous one: rename
# the columns, split location into lat/lon, parse the timestamps, then add
# the marker color.
plates = (pd.read_csv('data/all-lprs.csv.gz')
          .pipe(set_cols)
          .pipe(clean_location)
          .pipe(convert_times)
          .pipe(colorize))
plates

Unnamed: 0,plate,time,lat,lon,color
0,1275226,2011-01-19 02:06:00,37.8,-122.28,red
1,27529C,2011-01-19 02:06:00,37.8,-122.28,red
2,1158423,2011-01-19 02:06:00,37.8,-122.28,red
...,...,...,...,...,...
2742098,5X10319,2013-12-19 20:28:00,37.8,-122.28,red
2742099,7D56240,2013-12-19 20:28:00,37.8,-122.28,red
2742100,6JNM127,2013-12-19 20:28:00,37.8,-122.28,red


In [28]:
import folium
def map_for_license(license):
    '''
    Returns a folium map with one marker for every row of plates whose
    'plate' equals license

    Each marker sits at the row's lat/lon, shows the row's time in its
    popup and uses the row's color for its icon. A plate with no matching
    rows yields a map without markers.
    '''
    license_map = folium.Map(location=[37.798, -122.276], zoom_start=15)
    sightings = plates[plates['plate'] == license]
    for sighting in sightings.itertuples():
        position = [sighting.lat, sighting.lon]
        pin = folium.Icon(color=sighting.color)
        marker = folium.Marker(location=position,
                               popup=f'{sighting.time}',
                               icon=pin)
        marker.add_to(license_map)
    return license_map

### Jean Quan

In [29]:
map_for_license('6FCH845')

### Fire Chief

In [30]:
map_for_license('1328354')

### Random People

In [31]:
map_for_license('5AJG153')

In [32]:
map_for_license('6UZA652')