In [4]:
# Use the explicit %pip magic so the package is installed into the running kernel's
# environment and the cell does not depend on IPython automagic being enabled.
# TODO: pin the s3fs version once the version this analysis was run with is known.
%pip install -q s3fs

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import s3fs
import os
from io import StringIO

In [6]:
# SECURITY: credentials must never be hardcoded in a notebook. They are read from the
# standard AWS environment variables instead. Any key pair that was previously committed
# in this cell should be treated as compromised and revoked/rotated in IAM.
# If the variables are unset these are None, in which case s3fs (and boto3) fall back to
# their default credential lookup (shared credentials file, instance role, ...).
ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
ACCESS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
bucket = "s3://williams-citibike/TripData/"

# Authenticated (non-anonymous) filesystem handle used for all S3 reads below.
fs = s3fs.S3FileSystem(anon=False, key = ACCESS_KEY_ID, secret= ACCESS_SECRET_KEY)

# Keys of the neighborhood files, without the "s3://" prefix.
# NOTE(review): [1:] skips the first listing entry -- presumably the folder placeholder
# object itself; confirm against the bucket contents.
hood_filenames = fs.ls("s3://williams-citibike/HoodData/")[1:]

## Creating a Neighborhood Profile File

In [7]:
def _parse_indicator_value(value):
    """Convert one raw '2018' cell into something ``pd.to_numeric`` accepts.

    Non-string cells (numbers, NaN) are passed through untouched. Strings lose their
    '$' and ',' formatting characters, and strings ending in '%' are converted to a
    decimal fraction (e.g. "45%" -> 0.45).
    """
    if not isinstance(value, str):
        return value

    text = value.replace('$', "").replace(',', "").strip()
    if text.endswith('%'):
        return float(text.strip('%')) / 100
    return text


def flatten_hooddata(datafile: str) -> pd.DataFrame:
    """Grabs the data from the s3 bucket and flattens it to a single row consisting of the neighborhood attributes

    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    pd.DataFrame:
        A single row DataFrame that contains the attributes of the neighborhood.
        The first column is 'code'; the remaining columns are the indicator names
        in the order they appear in the workbook.

    Raises
    ------
    ValueError
        If a '2018' value cannot be interpreted as a number after cleaning.
    """

    cols_lst = [0,3,8]
    names_lst = ["code", "indicator", "2018"]

    # Keep the S3 handle open only for the read itself; everything below works on the
    # in-memory DataFrame.
    with fs.open("s3://"+datafile, 'rb') as file:
        data = pd.read_excel(file, sheet_name=1, usecols = cols_lst, names = names_lst)

    # Prep the '2018' column so that it can be used as the values argument of pivot_table.
    # This is done element-wise on purpose: the `.str` accessor turns non-string cells of
    # a mixed column into NaN (and fails on an all-numeric column), and a chained
    # assignment such as data['2018'][index] = ... is not guaranteed to write through.
    data['2018'] = pd.to_numeric(data['2018'].map(_parse_indicator_value))

    # The pivot_table alphabetizes the columns, but we want to maintain the original order
    column_order = ['code'] + list(data['indicator'])

    data = data.pivot_table(index=['code'],values='2018', columns='indicator', dropna=False)
    data = data.rename_axis(None, axis=1).reset_index()

    # Remove the spaces from the code. Done per row so that every row keeps
    # its own code instead of being overwritten with the first row's code.
    data['code'] = data['code'].str.replace(" ", "", regex=False)
    data = data.reindex(column_order, axis=1)

    return data

In [8]:
# Flatten every neighborhood workbook into a one-row frame, then stack them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop copies
# the accumulated data on every iteration; a single pd.concat avoids both problems.
# This only works if the HoodData folder contains neighborhood Excel files in the
# layout that flatten_hooddata expects.
hood_rows = [flatten_hooddata(hood) for hood in hood_filenames]

# pd.concat raises on an empty list, so fall back to an empty frame as before.
hood_profile_df = pd.concat(hood_rows) if hood_rows else pd.DataFrame()

In [9]:
# Key each row by its 'code' column instead of the positional index.
hood_profile_df = hood_profile_df.set_index(keys='code')

In [34]:
# Discard indicator columns that hold no value for any row.
hood_profile_df = hood_profile_df.dropna(how='all', axis='columns')

In [37]:
# Show the column labels currently present in hood_profile_df.
hood_profile_df.columns

Index(['Born in New York State', 'Disabled population',
       'Foreign-born population', 'Population', 'Population aged 65+',
       'Households with children under 18 years old',
       'Single-person households', 'Percent Asian', 'Percent Black',
       'Percent Hispanic', 'Percent white', 'Racial diversity index',
       'Income diversity ratio', 'Median household income (2019$)',
       'Median household income, homeowners (2019$)',
       'Median household income, renters (2019$)', 'Poverty rate',
       'Poverty rate, population aged 65+',
       'Poverty rate, population under 18 years old',
       'Labor force participation rate',
       'Population aged 25+ without a high school diploma',
       'Unemployment rate', 'Homeownership rate', 'Housing units',
       'Rental vacancy rate',
       'Serious housing code violations (per 1,000 privately owned rental units)',
       'Severe crowding rate (% of renter households)',
       'Total housing code violations (per 1,000 private

## Uploading Neighborhood Profile Data to Personal S3 Bucket

In [8]:
import boto3

In [9]:
# Premade bucket in S3. Note this rebinds `bucket`, which earlier held an s3:// URL,
# to the bare bucket name that boto3 expects.
bucket = 'williams-citibike'

# boto3 resource handle built from the same key pair used for the s3fs filesystem.
s3 = boto3.resource('s3', aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=ACCESS_SECRET_KEY)

In [10]:
# Write the profile table (index included) to a local CSV in the working directory.
hood_profile_df.to_csv(path_or_buf="Hood_Profile_Data.csv")

In [11]:
# Upload the local CSV to the bucket under the same object key.
profile_object = s3.Bucket(bucket).Object("Hood_Profile_Data.csv")
profile_object.upload_file(Filename="Hood_Profile_Data.csv")

In [12]:
# Delete the local copy of the CSV from the working directory.
os.unlink("Hood_Profile_Data.csv")

## *Testing: Look-up Table*

In [19]:
# Read the full second sheet of the first neighborhood workbook as a look-up table.
# The file is opened through the authenticated `fs` handle (the same way
# flatten_hooddata does it) rather than handing pandas a bare s3:// URL, which would
# ignore the credentials given to `fs` and depend on ambient AWS configuration.
with fs.open("s3://" + hood_filenames[0], 'rb') as file:
    lookup = pd.read_excel(file, sheet_name=1)

In [21]:
# Show the column labels of the look-up sheet.
lookup.columns

Index([    'Community District',                   'Name',
           'Indicator Category',              'Indicator',
       ' Indicator Description',                     2000,
                           2006,                     2010,
                           2018,                     2019,
                    '2000 Rank',              '2006 Rank',
                    '2010 Rank',           '2018/19 Rank'],
      dtype='object')

In [32]:
# Widen the column display limit so the long description strings are not truncated.
pd.set_option("display.max_colwidth", 175)

# The column label really does start with a space in the workbook header.
lookup.loc[:, " Indicator Description"]

0                                                                                                                   The percentage of city residents who were born in New York State.
1                                  The percentage of the adult population who have disabilities that impair hearing, vision, ambulation, cognition, self-care, or independent living.
2                                                                                                  The share of the population that is born outside the United States or Puerto Rico.
3                                                                                                            All people, both children and adults, living in a given geographic area.
4                                                                                                                        The percentage of residents who are aged 65 years and older.
                                                                                          