In [1]:
# Import dependencies
import json
import pandas as pd
import numpy as np
import glob
import csv
import datetime
from sqlalchemy import create_engine
import psycopg2
# from config import db_password, ACCESS_ID, ACCESS_KEY
import time

In [2]:
# Pull predicted.csv from the public S3 bucket, key the rows by zip code,
# and capitalise the label column ('outcome' -> 'Outcome')
predicted_df = (
    pd.read_csv('https://gentrificationoutcome.s3.amazonaws.com/predicted.csv')
    .set_index('zip')
    .rename(columns={'outcome': 'Outcome'})
)
# Preview the first rows
predicted_df.head()

Unnamed: 0_level_0,percent_change_house,percent_change_income,percent_change_rent,index_change,percent_change_public_tranp,percent_change_population,percent_change_white,Outcome
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
96094,1.032225,0.34,0.8,0.72,-0.833333,0.083,0.042,1
95815,0.355689,0.14,0.672,0.45,-0.520548,0.017,-0.032,1
93250,0.570118,0.39,0.739,0.03,0.666667,0.292,1.748,0
92705,0.922709,0.33,0.57,0.24,0.166667,0.047,0.071,0
94520,0.542403,0.16,0.489,0.31,0.029703,0.071,0.079,1


In [3]:
# Pull california_blank_tagged.csv from the public S3 bucket and key the rows by zip code
california_df = (
    pd.read_csv('https://gentrificationmldata.s3.amazonaws.com/california_blank_tagged.csv')
    .set_index('zip')
)
# Preview the first rows
california_df.head()

Unnamed: 0_level_0,median_house_value_2000,median_house_value_2014,percent_change_house,median_income_2000,median_income_2014,percent_change_income,median_rent_2000,median_rent_2014,percent_change_rent,average_education_index_2000,...,percent_public_transp_2000,percent_public_transp_2014,percent_change_public_tranp,population_2000,population_2014,percent_change_population,percent_white_2000,percent_white_2014,percent_change_white,Outcome
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,138900,241800,0.740821,24207,34050,0.41,568,975,0.717,9.49,...,0.172,0.167,-0.02907,54481,56314,0.034,0.246,0.499,1.028,0
90002,127700,218800,0.713391,22872,30214,0.32,560,968,0.729,10.06,...,0.123,0.137,0.113821,44584,50098,0.124,0.166,0.484,1.916,0
90003,134400,228600,0.700893,22346,30016,0.34,572,1064,0.86,9.91,...,0.173,0.175,0.011561,58187,66913,0.15,0.193,0.224,0.161,0
90004,371100,724900,0.953382,27591,38493,0.4,600,1070,0.783,11.83,...,0.216,0.231,0.069444,67850,63547,-0.063,0.338,0.345,0.021,0
90005,392200,635500,0.620347,21998,31214,0.42,547,939,0.717,11.37,...,0.342,0.351,0.026316,43014,38638,-0.102,0.278,0.196,-0.295,0


In [4]:
# Work on independent copies so the two loaded frames stay untouched
pred_copy, cali_copy = predicted_df.copy(), california_df.copy()

In [5]:
# Recode the predictions: every Outcome of 1 becomes 2 (other values are left as they are)
pred_copy['Outcome'] = pred_copy['Outcome'].replace(1, 2)

In [6]:
# Sanity check: list the distinct Outcome labels left in pred_copy after the recode
pred_copy['Outcome'].unique()

array([2, 0])

In [7]:
# Overlay pred_copy onto cali_copy in place (rows are matched on the zip index,
# columns by name). join='left' and overwrite=True are the pandas defaults,
# written out here so the merge semantics are visible.
cali_copy.update(pred_copy, join='left', overwrite=True)

In [8]:
# Sanity check: list the distinct Outcome labels now present in cali_copy
cali_copy['Outcome'].unique()

array([0., 1., 2.])

In [9]:
# Sanity check: number of non-null Outcome entries after the update
cali_copy['Outcome'].count()

1090

In [10]:
# Build new_df from cali_copy with the Outcome column cast to 32-bit integers
new_df = cali_copy.astype(dtype={'Outcome': 'int32'})

In [11]:
# Display new_df to eyeball the result; the rendered output elides the
# middle rows and columns with "...", so this is a visual spot check only
new_df

Unnamed: 0_level_0,median_house_value_2000,median_house_value_2014,percent_change_house,median_income_2000,median_income_2014,percent_change_income,median_rent_2000,median_rent_2014,percent_change_rent,average_education_index_2000,...,percent_public_transp_2000,percent_public_transp_2014,percent_change_public_tranp,population_2000,population_2014,percent_change_population,percent_white_2000,percent_white_2014,percent_change_white,Outcome
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90001,138900,241800,0.740821,24207,34050,0.41,568,975,0.717,9.49,...,0.172,0.167,-0.029070,54481,56314,0.034,0.246,0.499,1.028,0
90002,127700,218800,0.713391,22872,30214,0.32,560,968,0.729,10.06,...,0.123,0.137,0.113821,44584,50098,0.124,0.166,0.484,1.916,0
90003,134400,228600,0.700893,22346,30016,0.34,572,1064,0.860,9.91,...,0.173,0.175,0.011561,58187,66913,0.150,0.193,0.224,0.161,0
90004,371100,724900,0.953382,27591,38493,0.40,600,1070,0.783,11.83,...,0.216,0.231,0.069444,67850,63547,-0.063,0.338,0.345,0.021,0
90005,392200,635500,0.620347,21998,31214,0.42,547,939,0.717,11.37,...,0.342,0.351,0.026316,43014,38638,-0.102,0.278,0.196,-0.295,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96141,350000,509100,0.454571,75863,57212,-0.25,725,1221,0.684,14.51,...,0.025,0.089,2.560000,840,669,-0.204,0.973,0.937,-0.037,0
96143,222800,377100,0.692549,38366,40340,0.05,688,985,0.432,12.60,...,0.097,0.043,-0.556701,4802,3855,-0.197,0.740,0.900,0.216,2
96145,381000,589300,0.546719,54545,68887,0.26,885,1340,0.514,14.54,...,0.002,0.062,30.000000,3997,3215,-0.196,0.956,0.956,0.000,1
96150,169000,348800,1.063905,40076,47500,0.19,658,913,0.388,13.29,...,0.037,0.028,-0.243243,33024,28618,-0.133,0.809,0.782,-0.033,0


In [13]:
# Spot-check a single row: label-based lookup of index value 94607 in new_df
new_df.loc[94607]

median_house_value_2000         158600.000000
median_house_value_2014         347500.000000
percent_change_house                 1.191047
median_income_2000               21124.000000
median_income_2014               34192.000000
percent_change_income                0.620000
median_rent_2000                   499.000000
median_rent_2014                   945.000000
percent_change_rent                  0.894000
average_education_index_2000        11.760000
average_education_index_2014        13.260000
index_change                         1.500000
percent_public_transp_2000           0.267000
percent_public_transp_2014           0.267000
percent_change_public_tranp          0.000000
population_2000                  21048.000000
population_2014                  24830.000000
percent_change_population            0.180000
percent_white_2000                   0.113000
percent_white_2014                   0.255000
percent_change_white                 1.257000
Outcome                           

In [13]:
# Write the merged dataframe to final.csv, keeping the index as the first column.
# `index` is a boolean flag (the original passed the string 'zip', which only
# worked because a non-empty string is truthy); the header text for the index
# column belongs in `index_label`.
new_df.to_csv('final.csv', index=True, index_label='zip', header=True)