In [None]:
# This is Luke's scratchpad notebook.

In [32]:
# Importing the libraries:
import pandas as pd
import numpy as np
import math
from scipy import stats

# visualizing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# plt.rc('figure', figsize=(13, 10))
# plt.rc('font', size=14)

# preparing
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# modeling and evaluating
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix

# turn off warnings
import warnings
warnings.filterwarnings("ignore")

# acquiring
from pydataset import data

In [2]:
# SVI doesn't have zip codes; it has FIPS codes. So I need to translate FIPS into zip codes, and *then* I can join the Covid cases onto the data frame.

#### Key datasource:
https://www.huduser.gov/portal/datasets/usps_crosswalk.html
Possible note for presentation comments:
- We downloaded the 4th-quarter 2018 Tract-to-ZIP file, which is the HUD crosswalk file that translates census tracts (FIPS) to zip codes.

### Data Prep

In [3]:
import acquire
import prepare

In [4]:
import wrangle

In [5]:
# Load the data frame compiled by the project-local `acquire` module
# (it reports compiling the raw data files while it runs).
df = acquire.run()

Acquire: compiling raw data files...
Acquire: Completed!


In [6]:
# Preview the first rows of the frame returned by acquire.run().
df.head()

Unnamed: 0,st,state,st_abbr,stcnty,county,tract,location,area_sqmi,e_totpop,m_totpop,...,e_uninsur,m_uninsur,ep_uninsur,mp_uninsur,e_daypop,zip,address_ratio,population,positive,casesp100000
0,48,TEXAS,TX,48029,Bexar,48029110100,"Census Tract 1101, Bexar County, Texas",1.364296,3102,365,...,272,130,11.3,5.0,38328,78205,0.52,1633,63,3857.93019
1,48,TEXAS,TX,48029,Bexar,48029110300,"Census Tract 1103, Bexar County, Texas",0.598444,3023,309,...,606,206,20.0,5.7,9202,78210,0.79,39242,1583,4033.943224
2,48,TEXAS,TX,48029,Bexar,48029110500,"Census Tract 1105, Bexar County, Texas",0.44881,2388,243,...,465,125,19.5,4.4,2044,78207,1.0,58019,3205,5524.052466
3,48,TEXAS,TX,48029,Bexar,48029110600,"Census Tract 1106, Bexar County, Texas",0.758136,5301,486,...,690,265,24.5,7.4,6776,78207,1.0,58019,3205,5524.052466
4,48,TEXAS,TX,48029,Bexar,48029110700,"Census Tract 1107, Bexar County, Texas",0.379005,1114,208,...,251,128,22.5,8.8,3716,78212,0.92,29916,966,3229.041316


In [7]:
# Run the project-local prepare step on the acquired frame.
# NOTE(review): this rebinds `df`, so re-running only this cell would feed the
# already-prepared frame back into prepare_data -- re-run acquire.run() first.
df = prepare.prepare_data(df)

In [8]:
# Preview the first rows of the frame returned by prepare.prepare_data().
df.head()

Unnamed: 0,tract,raw_svi,f_pov_soci,f_unemp_soci,f_pci_soci,f_nohsdp_soci,f_soci_total,f_age65_comp,f_age17_comp,f_disabl_comp,...,f_mobile_trans,f_crowd_trans,f_noveh_trans,f_groupq_trans,f_trans_total,all_flags_total,zip,tract_cases_per_100k,bin_svi,rank_svi
0,48029110100,0.6503,0,0,0,0,0,0,0,0,...,0,0,1,1,3,3,78205,2006.123699,mod_high,2
1,48029110300,0.6978,1,1,0,0,2,0,0,0,...,0,0,1,0,2,4,78210,3186.815147,mod_high,2
2,48029110500,0.977,1,1,1,1,4,0,1,1,...,0,0,1,0,1,9,78207,5524.052466,high,1
3,48029110600,0.9841,1,1,1,1,4,0,0,1,...,0,0,1,1,3,8,78207,5524.052466,high,1
4,48029110700,0.9378,0,0,1,0,1,1,0,0,...,0,0,1,0,1,3,78212,2970.718011,high,1


# Moving into Explore

- Where are the clusters of cases visually?
- Where are the most dense clusters?
- How to define dense vs not dense?
- What are some other visually interesting things?
    - Scatterplot of all cases?
    - mapplot of svi by census tract?

In [9]:
# Run the whole wrangle pipeline in one call. Its printed log shows it runs the
# acquire and prepare steps again, so the `df` built in the cells above is
# replaced here.
# NOTE(review): the two shapes it prints are presumably the train/test splits --
# confirm in wrangle.py.
df, train_exp, X_train_scaled, y_train, X_test_scaled, y_test = wrangle.wrangle_data()

Acquire: compiling raw data files...
Acquire: Completed!
Prepare: preparing data files...
Prepare: Completed!
(289, 25) (73, 25)


In [10]:
# Preview the first rows of train_exp, one of the frames returned by wrangle.wrangle_data().
train_exp.head()

Unnamed: 0,tract,raw_svi,f_pov_soci,f_unemp_soci,f_pci_soci,f_nohsdp_soci,f_soci_total,f_age65_comp,f_age17_comp,f_disabl_comp,...,f_mobile_trans,f_crowd_trans,f_noveh_trans,f_groupq_trans,f_trans_total,all_flags_total,zip,tract_cases_per_100k,bin_svi,rank_svi
289,48029181821,0.4879,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,78240,2771.002224,low_mod,3
200,48029171902,0.6394,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,78245,2924.011251,mod_high,2
69,48029130200,0.6523,0,0,0,0,0,0,0,0,...,0,0,1,0,1,2,78203,4177.649457,mod_high,2
29,48029121120,0.0872,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,78247,2222.089837,low,4
12,48029120502,0.9531,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,78218,3235.404493,high,1


In [11]:
# TODO: bin the tracts by how many flags == 1 each tract has

In [12]:
# TODO: get ideas from the Zillow project and the linear regression explore lesson

### Cross Validation

In [14]:
# loocv to manually evaluate the performance of a random forest classifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [15]:

# Manual leave-one-out cross-validation (LOOCV) of a random forest on a toy
# make_blobs dataset: fit on all samples but one, predict the held-out sample,
# and repeat once per sample.
# create dataset
X, y = make_blobs(n_samples=100, random_state=1)
# create loocv procedure
cv = LeaveOneOut()
# enumerate splits, collecting the held-out label and its prediction per fold
y_true, y_pred = [], []
for train_ix, test_ix in cv.split(X):
    # split data -- fold-local names on purpose: the previous names
    # (X_train / X_test / y_train / y_test) overwrote the real y_train / y_test
    # returned by wrangle.wrangle_data() earlier in this notebook
    X_fold_train, X_fold_test = X[train_ix, :], X[test_ix, :]
    y_fold_train, y_fold_test = y[train_ix], y[test_ix]
    # fit model (fresh, seeded forest for every fold)
    model = RandomForestClassifier(random_state=1)
    model.fit(X_fold_train, y_fold_train)
    # evaluate model on the single held-out sample
    yhat = model.predict(X_fold_test)
    # store (each fold holds exactly one test sample, hence index 0)
    y_true.append(y_fold_test[0])
    y_pred.append(yhat[0])
# calculate accuracy over all held-out predictions
acc = accuracy_score(y_true, y_pred)
print('Accuracy: %.3f' % acc)

Accuracy: 0.990


In [16]:
# Automatic LOOCV: let cross_val_score drive the leave-one-out splits and
# scoring for a random forest classifier.
from numpy import mean, std
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Synthetic dataset: 100 samples, fixed seed so the run is repeatable.
X, y = make_blobs(n_samples=100, random_state=1)

# One fold per sample, and a seeded forest as the estimator to evaluate.
cv = LeaveOneOut()
model = RandomForestClassifier(random_state=1)

# n_jobs=-1 asks scikit-learn to evaluate the folds in parallel on all cores.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report mean accuracy with the spread across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.990 (0.099)


In [17]:
# Inspect the make_blobs feature matrix.
# NOTE(review): this echoes the entire array; X[:5] would keep the output short.
X

array([[-7.94152277e-01,  2.10495117e+00],
       [-9.15155186e+00, -4.81286449e+00],
       [-1.14418263e+01, -4.45781441e+00],
       [-9.76761777e+00, -3.19133737e+00],
       [-4.53655648e+00, -8.40186288e+00],
       [-6.26302115e+00, -8.10666081e+00],
       [-6.38481234e+00, -8.47302970e+00],
       [-9.20490564e+00, -4.57687928e+00],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.17104176e+00,  4.33091816e+00],
       [-1.00364080e+01, -5.56912090e+00],
       [-9.87589123e+00, -2.82386464e+00],
       [-7.17532921e+00, -8.77059017e+00],
       [-2.40671820e+00,  6.09894447e+00],
       [-4.87418245e+00, -1.00495890e+01],
       [-6.07854700e+00, -7.93969420e+00],
       [-6.83238762e+00, -7.47067670e+00],
       [-2.34673261e+00,  3.56128423e+00],
       [-1.03415662e+01, -3.90975169e+00],
       [-1.10926243e+01, -3.78396611e+00],
       [-6.50212109e+00, -7.91249101e+00],
       [-1.02639310e+01, -3.92073400e+00],
       [-6.81608302e+00, -8.44986926e+00],
       [-1.

In [19]:
# Sonar Example:

# summarize the sonar dataset
from pandas import read_csv
# load dataset (the CSV has no header row, so pandas numbers the columns)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
sonar_df = read_csv(url, header=None)
# split into input and output elements: every column but the last is a
# feature, the last column is the class label.
# The name `data` is deliberately NOT bound here: it would shadow
# `from pydataset import data` imported at the top of the notebook.
# Slicing the frame (instead of the mixed-type `.values` array) also lets X be
# a float array rather than an object array.
X = sonar_df.iloc[:, :-1].to_numpy(dtype=float)
y = sonar_df.iloc[:, -1].to_numpy()
print(X.shape, y.shape)

(208, 60) (208,)


In [20]:
# Inspect the sonar class labels (the last CSV column; 'R' / 'M' strings).
# NOTE(review): this echoes the entire array; y[:5] would keep the output short.
y

array(['R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M

In [21]:
# create loocv procedure: a fresh leave-one-out splitter for the sonar data
# (LeaveOneOut is imported in an earlier cell, which must have run first)
cv = LeaveOneOut()

In [22]:
# create model: random forest with a fixed seed so repeated runs give the same scores
model = RandomForestClassifier(random_state=1)

In [23]:
# evaluate model: leave-one-out accuracy of the random forest on the sonar X, y.
# cross_val_score, mean and std are imported in the In [16] cell, so that cell
# must have run before this one.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance as "mean (std)". Every LOOCV fold scores a single sample,
# so each per-fold accuracy is 0 or 1 and the std is large by construction.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.822 (0.382)
