# Cleaning California Data From USA.com

In [1]:
# Import Dependencies
import json
import pandas as pd
import numpy as np
import glob
import csv
import datetime
from sqlalchemy import create_engine
import psycopg2
from config import db_password, ACCESS_ID, ACCESS_KEY
import logging
from io import StringIO
import boto3
from botocore.exceptions import ClientError

In [2]:
# path = r'../Resources1'
# filenames = glob.glob(path + '/*.csv')
# dfs = []
# for filename in filenames:
#     df=pd.read_csv(filename, index_col='zip', header=0)
#     dfs.append(df)
    
# california_df = pd.concat(dfs, axis= 0, ignore_index=True)
# california_df.drop('Unnamed: 0', axis=1)
# california_df.head()

## Average Education Index

In [3]:
# ETL average education 2000:
# read the CSV, discard the 'Unnamed: 0' column, key the rows by zip,
# and store every remaining value as a float.
avg_edu_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/avg_edu_2000.csv')
    .drop(columns='Unnamed: 0')
    .set_index('zip')
    .astype(float)
)
avg_edu_2000.head()

Unnamed: 0_level_0,avg_edu_2000
zip,Unnamed: 1_level_1
94305,17.76
95041,17.12
94708,17.01
94304,17.0
94707,16.83


In [4]:
# ETL average education 2014:
# read the CSV, discard the 'Unnamed: 0' column, key the rows by zip,
# and store every remaining value as a float.
avg_edu_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/avg_edu_2014.csv')
    .drop(columns='Unnamed: 0')
    .set_index('zip')
    .astype(float)
)
avg_edu_2014.head()

Unnamed: 0_level_0,avg_edu_2014
zip,Unnamed: 1_level_1
95463,19.0
95721,19.0
94305,17.69
95736,17.68
92617,17.38


In [5]:
# Put the 2000 and 2014 education frames side by side; the inner join keeps
# only the zips present in both years.
avg_edu = pd.concat([avg_edu_2000, avg_edu_2014], axis=1, join='inner')
# Index change is the absolute difference: 2014 value minus 2000 value
avg_edu['Index Change'] = avg_edu['avg_edu_2014'].sub(avg_edu['avg_edu_2000'])
# Give the two year columns their presentation names
avg_edu = avg_edu.rename(columns={
    'avg_edu_2000': 'Average Education Index 2000',
    'avg_edu_2014': 'Average Education Index 2014',
})
avg_edu.head()

Unnamed: 0_level_0,Average Education Index 2000,Average Education Index 2014,Index Change
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
94305,17.76,17.69,-0.07
95041,17.12,15.65,-1.47
94708,17.01,17.12,0.11
94304,17.0,17.02,0.02
94707,16.83,17.15,0.32


## Median House Value

In [6]:
# ETL median house value 2000:
# read the CSV, discard the 'Unnamed: 0' column and key the rows by zip.
med_house_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/house_median_value_2000.csv')
    .drop(columns='Unnamed: 0')
    .set_index('zip')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_house_2000['house_median_value_2000'] = (
    med_house_2000['house_median_value_2000']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_house_2000.head()

Unnamed: 0_level_0,house_median_value_2000
zip,Unnamed: 1_level_1
90210,1000001.0
90402,1000001.0
90743,1000001.0
92067,1000001.0
93108,1000001.0


In [7]:
# ETL median house value 2014:
# read the CSV, discard the 'Unnamed: 0' column and key the rows by zip.
med_house_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/house_median_value_2014.csv')
    .drop(columns='Unnamed: 0')
    .set_index('zip')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_house_2014['house_median_value_2014'] = (
    med_house_2014['house_median_value_2014']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_house_2014.head()

Unnamed: 0_level_0,house_median_value_2014
zip,Unnamed: 1_level_1
90024,1000001.0
90049,1000001.0
90077,1000001.0
90210,1000001.0
90211,1000001.0


In [8]:
# Put the 2000 and 2014 house-value frames side by side; the inner join keeps
# only the zips present in both years.
med_house = pd.concat([med_house_2000, med_house_2014], axis=1, join='inner')
# Relative change from 2000 to 2014, stored as a fraction (not multiplied by 100)
med_house['% Change in House Value'] = (
    med_house['house_median_value_2014']
    .sub(med_house['house_median_value_2000'])
    .div(med_house['house_median_value_2000'])
)
# Give the two year columns their presentation names
med_house = med_house.rename(columns={
    'house_median_value_2000': 'Median House Value 2000',
    'house_median_value_2014': 'Median House Value 2014',
})
med_house.head()

Unnamed: 0_level_0,Median House Value 2000,Median House Value 2014,% Change in House Value
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
90210,1000001.0,1000001.0,0.0
90402,1000001.0,1000001.0,0.0
90743,1000001.0,1000001.0,0.0
92067,1000001.0,1000001.0,0.0
93108,1000001.0,1000001.0,0.0


## Median Income

In [9]:
# ETL median income 2000:
# read the CSV, discard the 'Unnamed: 0' column and key the rows by zip.
med_inc_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/median_income_2000.csv')
    .drop(columns='Unnamed: 0')
    .set_index('zip')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_inc_2000['median_income_2000'] = (
    med_inc_2000['median_income_2000']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_inc_2000.head()

Unnamed: 0_level_0,median_income_2000
zip,Unnamed: 1_level_1
94027,200001.0
92067,196298.0
94028,164479.0
94022,145425.0
94506,142459.0


In [10]:
# ETL median income 2014:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
med_inc_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/median_income_2014.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_inc_2014['median_income_2014'] = (
    med_inc_2014['median_income_2014']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_inc_2014.head()

Unnamed: 0_level_0,median_income_2014
zip,Unnamed: 1_level_1
94027,236912.0
92145,228587.0
91980,200325.0
94957,187857.0
94022,182750.0


In [11]:
# Put the 2000 and 2014 income frames side by side; the inner join keeps
# only the zips present in both years.
med_inc = pd.concat([med_inc_2000, med_inc_2014], axis=1, join='inner')
# Relative change from 2000 to 2014, stored as a fraction (not multiplied by 100)
med_inc['% Change in Income'] = (
    med_inc['median_income_2014']
    .sub(med_inc['median_income_2000'])
    .div(med_inc['median_income_2000'])
)
# Round every column to two decimals
med_inc = med_inc.round(2)
# Give the two year columns their presentation names
med_inc = med_inc.rename(columns={
    'median_income_2000': 'Median Income 2000',
    'median_income_2014': 'Median Income 2014',
})
med_inc.head()

Unnamed: 0_level_0,Median Income 2000,Median Income 2014,% Change in Income
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
94027,200001.0,236912.0,0.18
92067,196298.0,119939.0,-0.39
94028,164479.0,180174.0,0.1
94022,145425.0,182750.0,0.26
94506,142459.0,176241.0,0.24


## Median Rent

In [12]:
# ETL median rent 2000:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
med_rent_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/median_rent_2000.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_rent_2000['median_rent_2000'] = (
    med_rent_2000['median_rent_2000']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_rent_2000.head()

Unnamed: 0_level_0,median_rent_2000
zip,Unnamed: 1_level_1
92067,2001.0
92091,2001.0
92602,2001.0
93953,2001.0
94027,2001.0


In [13]:
# ETL median rent 2014:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
med_rent_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/median_rent_2014.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Strip the thousands separators and the dollar sign, then cast to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default value of `regex` is not the same across pandas versions.
med_rent_2014['median_rent_2014'] = (
    med_rent_2014['median_rent_2014']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
med_rent_2014.head()

Unnamed: 0_level_0,median_rent_2014
zip,Unnamed: 1_level_1
90067,2001.0
90077,2001.0
90094,2001.0
90210,2001.0
90265,2001.0


In [14]:
# Put the 2000 and 2014 rent frames side by side; the inner join keeps
# only the zips present in both years.
med_rent = pd.concat([med_rent_2000, med_rent_2014], axis=1, join='inner')
# Relative change from 2000 to 2014, stored as a fraction (not multiplied by 100)
med_rent['% Change in Rent'] = (
    med_rent['median_rent_2014']
    .sub(med_rent['median_rent_2000'])
    .div(med_rent['median_rent_2000'])
)
# Give the two year columns their presentation names
med_rent = med_rent.rename(columns={
    'median_rent_2000': 'Median Rent 2000',
    'median_rent_2014': 'Median Rent 2014',
})
# Round every column to three decimals
med_rent = med_rent.round(3)
med_rent.head()

Unnamed: 0_level_0,Median Rent 2000,Median Rent 2014,% Change in Rent
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
92067,2001.0,2001.0,0.0
92091,2001.0,2001.0,0.0
92602,2001.0,1838.0,-0.081
93953,2001.0,2001.0,0.0
94027,2001.0,2001.0,0.0


## Public Transportation

In [15]:
# ETL public transportation 2000:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
pub_trans_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/public_transportation_2000.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Remove the trailing '%' from each value and convert it to a fraction
pub_trans_2000['public_transportation_2000'] = (
    pub_trans_2000['public_transportation_2000']
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)
pub_trans_2000.head()

Unnamed: 0_level_0,public_transportation_2000
zip,Unnamed: 1_level_1
90017,0.589
90057,0.444
95431,0.43
94102,0.426
94103,0.399


In [16]:
# ETL public transportation 2014:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
pub_trans_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/public_transportation_2014.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Remove the trailing '%' from each value and convert it to a fraction
pub_trans_2014['public_transportation_2014'] = (
    pub_trans_2014['public_transportation_2014']
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)
pub_trans_2014.head()

Unnamed: 0_level_0,public_transportation_2014
zip,Unnamed: 1_level_1
92304,1.0
94128,1.0
93634,0.727
90073,0.478
94130,0.476


In [17]:
# Put the 2000 and 2014 public-transportation frames side by side; the inner
# join keeps only the zips present in both years.
pub_trans = pd.concat([pub_trans_2000, pub_trans_2014], axis=1, join='inner')
# Relative change from 2000 to 2014, stored as a fraction (not multiplied by 100).
# NOTE(review): a zip whose 2000 share is 0 divides by zero here and yields
# inf/NaN — confirm how downstream consumers handle that.
pub_trans['% Change in Public Transp'] = (
    pub_trans['public_transportation_2014']
    .sub(pub_trans['public_transportation_2000'])
    .div(pub_trans['public_transportation_2000'])
)
# Give the two year columns their presentation names
pub_trans = pub_trans.rename(columns={
    'public_transportation_2000': 'Take Public Transp % in 2000',
    'public_transportation_2014': 'Take Public Transp % in 2014',
})
pub_trans.head()

Unnamed: 0_level_0,Take Public Transp % in 2000,Take Public Transp % in 2014,% Change in Public Transp
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
90017,0.589,0.374,-0.365025
90057,0.444,0.451,0.015766
95431,0.43,0.13,-0.697674
94102,0.426,0.473,0.110329
94103,0.399,0.358,-0.102757


## Total Population

In [18]:
# ETL total population 2000:
# read the CSV, key the rows by zip, discard the 'Unnamed: 0' column,
# and store every remaining value as a float.
tot_pop_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/total_pop_2000.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
    .astype(float)
)
tot_pop_2000.head()

Unnamed: 0_level_0,total_pop_2000
zip,Unnamed: 1_level_1
90201,105275.0
90650,103211.0
90011,101214.0
92054,98226.0
91331,97300.0


In [19]:
# ETL total population 2014:
# read the CSV, key the rows by zip, discard the 'Unnamed: 0' column,
# and store every remaining value as a float.
tot_pop_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/total_pop_2014.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
    .astype(float)
)
tot_pop_2014.head()

Unnamed: 0_level_0,total_pop_2014
zip,Unnamed: 1_level_1
90650,106521.0
90011,102926.0
90201,102515.0
91331,102367.0
92335,99580.0


In [20]:
# Put the 2000 and 2014 population frames side by side; the inner join keeps
# only the zips present in both years.
tot_pop = pd.concat([tot_pop_2000, tot_pop_2014], axis=1, join='inner')
# Relative change from 2000 to 2014, stored as a fraction (not multiplied by 100)
tot_pop['% Change in Total Population'] = (
    tot_pop['total_pop_2014']
    .sub(tot_pop['total_pop_2000'])
    .div(tot_pop['total_pop_2000'])
)
# Give the two year columns their presentation names
tot_pop = tot_pop.rename(columns={
    'total_pop_2000': 'Total Population in 2000',
    'total_pop_2014': 'Total Population in 2014',
})
# Round every column to three decimals
tot_pop = tot_pop.round(3)
tot_pop.head()

Unnamed: 0_level_0,Total Population in 2000,Total Population in 2014,% Change in Total Population
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
90201,105275.0,102515.0,-0.026
90650,103211.0,106521.0,0.032
90011,101214.0,102926.0,0.017
92054,98226.0,42992.0,-0.562
91331,97300.0,102367.0,0.052


## White Population

In [21]:
# ETL white population 2000:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
white_pop_2000 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/white_pop_2000.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Remove the trailing '%' from each value and convert it to a fraction
white_pop_2000['white_pop_2000'] = (
    white_pop_2000['white_pop_2000']
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)
white_pop_2000.head()

Unnamed: 0_level_0,white_pop_2000
zip,Unnamed: 1_level_1
90263,1.0
91743,1.0
92338,1.0
95232,1.0
95735,1.0


In [22]:
# ETL white population 2014:
# read the CSV, key the rows by zip and discard the 'Unnamed: 0' column.
white_pop_2014 = (
    pd.read_csv('https://databootcamp-csvfiles.s3.amazonaws.com/white_pop_2014.csv')
    .set_index('zip')
    .drop(columns='Unnamed: 0')
)
# Remove the trailing '%' from each value and convert it to a fraction
white_pop_2014['white_pop_2014'] = (
    white_pop_2014['white_pop_2014']
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)
white_pop_2014.head()

Unnamed: 0_level_0,white_pop_2014
zip,Unnamed: 1_level_1
91948,1.0
92060,1.0
92266,1.0
92304,1.0
92332,1.0


In [23]:
# Put the 2000 and 2014 white-population frames side by side; the inner join
# keeps only the zips present in both years.
white_pop = pd.concat([white_pop_2000, white_pop_2014], axis=1, join='inner')
# Relative change of the share from 2000 to 2014, stored as a fraction
white_pop['% Change in White Population %'] = (
    white_pop['white_pop_2014']
    .sub(white_pop['white_pop_2000'])
    .div(white_pop['white_pop_2000'])
)
# Give the two year columns their presentation names
white_pop = white_pop.rename(columns={
    'white_pop_2000': 'White Population % in 2000',
    'white_pop_2014': 'White Population % in 2014',
})
# Round every column to three decimals
white_pop = white_pop.round(3)
white_pop.head()

Unnamed: 0_level_0,White Population % in 2000,White Population % in 2014,% Change in White Population %
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
90263,1.0,0.536,-0.464
95232,1.0,0.94,-0.06
95735,1.0,1.0,0.0
95736,1.0,0.484,-0.516
96106,0.987,0.84,-0.149


In [24]:
# Group the per-topic frames into the two lists that are concatenated below:
# dfs1 holds the house value / income / rent frames,
# dfs2 holds the education / transit / population frames.
dfs1 = [
    med_house,
    med_inc,
    med_rent,
]
dfs2 = [
    avg_edu,
    pub_trans,
    tot_pop,
    white_pop,
]

In [25]:
# Build econ_df by joining the dfs1 frames column-wise on the zips they share
econ_df = pd.concat(dfs1, axis=1, join='inner')
# econ_blank is a separate copy of econ_df with an extra, empty-string
# 'Outcome' column; econ_df itself is left unchanged.
econ_blank = econ_df.assign(Outcome='')
econ_blank.head()

Unnamed: 0_level_0,Median House Value 2000,Median House Value 2014,% Change in House Value,Median Income 2000,Median Income 2014,% Change in Income,Median Rent 2000,Median Rent 2014,% Change in Rent,Outcome
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
90210,1000001.0,1000001.0,0.0,112572.0,138750.0,0.23,1307.0,2001.0,0.531,
90402,1000001.0,1000001.0,0.0,118553.0,131607.0,0.11,977.0,1849.0,0.893,
90743,1000001.0,1000001.0,0.0,61786.0,106458.0,0.72,850.0,1872.0,1.202,
92067,1000001.0,1000001.0,0.0,196298.0,119939.0,-0.39,2001.0,2001.0,0.0,
93108,1000001.0,1000001.0,0.0,101575.0,125272.0,0.23,1267.0,2001.0,0.579,


In [26]:
# Build demo_df by joining the dfs2 frames column-wise on the zips they share
demo_df = pd.concat(dfs2, axis=1, join='inner')
# demo_blank is a separate copy of demo_df with an extra, empty-string
# 'Outcome' column; demo_df itself is left unchanged.
demo_blank = demo_df.assign(Outcome='')
demo_blank.head()

Unnamed: 0_level_0,Average Education Index 2000,Average Education Index 2014,Index Change,Take Public Transp % in 2000,Take Public Transp % in 2014,% Change in Public Transp,Total Population in 2000,Total Population in 2014,% Change in Total Population,White Population % in 2000,White Population % in 2014,% Change in White Population %,Outcome
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
94305,17.76,17.69,-0.07,0.02,0.034,0.7,13371.0,13538.0,0.012,0.606,0.59,-0.026,
94708,17.01,17.12,0.11,0.131,0.157,0.198473,10730.0,11143.0,0.038,0.839,0.818,-0.025,
94304,17.0,17.02,0.02,0.032,0.046,0.4375,1704.0,3688.0,1.164,0.731,0.704,-0.037,
94707,16.83,17.15,0.32,0.142,0.203,0.429577,11880.0,12402.0,0.044,0.832,0.812,-0.024,
94709,16.61,16.83,0.22,0.193,0.189,-0.020725,10140.0,12030.0,0.186,0.682,0.625,-0.084,


In [27]:
# Write the economics table to CSV with the zip index as the first column.
# `index` is a boolean switch (a string only worked by being truthy), so the
# header for the index column is passed separately through index_label.
econ_blank.to_csv('Resources/economics.csv', index=True, index_label='zip', header=True)
# Write the demographics table to CSV the same way
demo_blank.to_csv('Resources/demographics.csv', index=True, index_label='zip', header=True)

In [28]:
# Create an S3 client using the access id/key imported from config
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_ID,
    aws_secret_access_key=ACCESS_KEY,
)
# Upload the local economics.csv to the gentrificationmldata bucket;
# the file is opened in binary mode and closed when the with-block exits.
with open("Resources/economics.csv", mode="rb") as csv_file:
    s3.upload_fileobj(
        Fileobj=csv_file,
        Bucket="gentrificationmldata",
        Key="economics.csv",
    )

In [29]:
# Create an S3 client using the access id/key imported from config
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_ID,
    aws_secret_access_key=ACCESS_KEY,
)
# Upload the local demographics.csv to the gentrificationmldata bucket;
# the file is opened in binary mode and closed when the with-block exits.
with open("Resources/demographics.csv", mode="rb") as csv_file:
    s3.upload_fileobj(
        Fileobj=csv_file,
        Bucket="gentrificationmldata",
        Key="demographics.csv",
    )

In [30]:
# bucket = 'https://gentrificationmldata.s3.amazonaws.com'
# csv_buffer = StringIO()
# df.to_csv(csv_buffer)
# s3_resource = boto3.resource('s3')
# s3_resource.Object(bucket, 'df.csv').put(Body=csv_buffer.getvalue())