# Initial Exploratory Data Analysis: NYCHA Residents

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Import and load data

In [2]:
# Read the raw NYCHA resident summary CSV from the raw-data folder (relative path).
nycha_res_raw = pd.read_csv(
    filepath_or_buffer='../../data/00_raw/NYCHA_Resident_Data_Book_Summary.csv',
)

In [3]:
# Work on a deep copy so that nycha_res_raw keeps the data exactly as loaded.
nycha_res = nycha_res_raw.copy(deep=True)

## Inspect data frame

In [4]:
# Dimensions of the working copy as a (rows, columns) tuple.
nycha_res.shape

(33, 43)

In [5]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
nycha_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 43 columns):
PROGRAM                                                         33 non-null object
STATECITY_SECTION8_FLAG                                         33 non-null object
Total Families                                                  33 non-null int64
Total Female Headed Families                                    33 non-null int64
Total Male Headed Families                                      33 non-null int64
Total Population                                                33 non-null int64
Average Family Size                                             33 non-null float64
Total Minors Under 18                                           33 non-null int64
Average Minors per Family                                       33 non-null float64
Total Minors as Percent of Population                           33 non-null float64
All Average Total Gross Income                                  33 no

In [6]:
# Display the two label columns (PROGRAM and STATECITY_SECTION8_FLAG) for every row.
label_columns = ['PROGRAM', 'STATECITY_SECTION8_FLAG']
nycha_res.loc[:, label_columns]

Unnamed: 0,PROGRAM,STATECITY_SECTION8_FLAG
0,FEDERAL,TOTAL HOUSEHOLDS
1,FORMER NEW YORK STATE,TOTAL HOUSEHOLDS
2,FORMER NEW YORK STATE,PUBLIC HOUSING HOUSEHOLDS
3,FORMER NEW YORK STATE,SECTION 8 TRANSITION HOUSEHOLDS
4,FORMER NEW YORK CITY,TOTAL HOUSEHOLDS
5,FORMER NEW YORK CITY,PUBLIC HOUSING HOUSEHOLDS
6,FORMER NEW YORK CITY,SECTION 8 TRANSITION HOUSEHOLDS
7,MIXED FINANCE (LLC1 AND LLC2),TOTAL HOUSEHOLDS
8,MIXED FINANCE (LLC1 AND LLC2),PUBLIC HOUSING HOUSEHOLDS
9,MIXED FINANCE (LLC1 AND LLC2),SECTION 8 TRANSITION HOUSEHOLDS


## Data cleaning

Convert all feature names, and the values of the `program` and `statecity_section8_flag` columns, to lower case:

In [7]:
# Normalise every column label to lower case.
nycha_res.columns = [label.lower() for label in nycha_res.columns]

# Lower-case the text values held in the two label columns as well.
for text_col in ('program', 'statecity_section8_flag'):
    nycha_res[text_col] = nycha_res[text_col].str.lower()

Check for missing values:

In [8]:
# Count the nulls in each column once, then show only the columns that have any.
null_counts = nycha_res.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

Select data by neighborhood (keep only the rows from position 20 onward):

In [9]:
# Keep only the rows from position 20 onward (positional slice, not label-based).
# NOTE(review): the cut-off 20 is tied to the current row order of the CSV;
# presumably these are the neighborhood-level rows -- confirm against the source file.
# .copy() makes the subset an independent frame, so the column added in the
# feature-generation step is written to it directly rather than to a slice of
# the earlier frame (which raises pandas' SettingWithCopyWarning).
nycha_res = nycha_res.iloc[20:].copy()

## Data preprocessing

### Feature generation

Combine the three working-age resident counts (18 to 20, 21 to 49 and 50 to 61) into a new feature:

In [10]:
# Working-age residents = sum of the 18-20, 21-49 and 50-61 age-band counts.
nycha_res['total residents of working age'] = (
    nycha_res['residents 18 to 20']
    + nycha_res['residents 21 to 49']
    + nycha_res['residents 50 to 61']
)

Rename features:

In [13]:
# Give the two label columns more descriptive names.
renamed_labels = {
    'program': 'borough',
    'statecity_section8_flag': 'nycha_household_type',
}
nycha_res = nycha_res.rename(columns=renamed_labels)

### Investigating multicollinearity

In [11]:
from rfpimp import feature_corr_matrix, plot_corr_heatmap

In [15]:
# Keep the rows whose household type begins with 'total households'
# (str.match anchors the pattern at the start of each string), then drop
# the two label columns from the result.
is_total_row = nycha_res['nycha_household_type'].str.match('total households')
res_df = nycha_res.loc[is_total_row].drop(columns=['borough', 'nycha_household_type'])

Separate the data frame into two by type of information:

In [None]:
# nycha_res_nb_dem = nycha_res_nb[['program','statecity_section8_flag',
#                                  'all families average years in public housing','total families','total population',
#                                  'average family size',
#                                  'total minors under 18','total residents of working age','residents 62 plus',
#                                  'total families - 1 or more employed','total families - 2nd adult employed']]
# nycha_res_nb_dem = nycha_res_nb_dem.rename(columns={'program':'borough','statecity_section8_flag':'nycha_household_type'}).set_index(['borough', 'nycha_household_type'])

In [None]:
# nycha_res_nb_fin = nycha_res_nb[['program','statecity_section8_flag',
#                                  'all average total gross income','all average gross rent',
#                                 'total families on welfare','total families on full welfare',
#                                  'total families on welfare as percent of families',
#                                 'total single parent/grandparent families on welfare',
#                                 'total single parent/grandparent with minors as % of families',
#                                 'total fixed income families','total fixed income families as percent of families']]
# nycha_res_nb_fin = nycha_res_nb_fin.rename(columns={'program':'borough','statecity_section8_flag':'nycha_household_type'}).set_index(['borough', 'nycha_household_type'])

## Data visualization

In [None]:
# import matplotlib.pyplot as plt
# from pandas.api.types import CategoricalDtype
# from plotnine import *
# %matplotlib inline

In [None]:
# employed = nycha_res_nb_dem[['total families - 1 or more employed']]
# # employed.columns = employed.columns.get_level_values(0)
# # employed = employed.reset_index()
# # employed.columns = ['borough', 'public housing households',
# #                    'section 8 households','total households']
# # for col in ['public housing households', 'section 8 households', 'total households']:
# #     employed[col] = employed[col].astype('Int64')

In [None]:
# employed_pivot = employed.reset_index().pivot(index='borough',
#                                               columns='nycha_household_type',
#                                               values='total families - 1 or more employed')
# employed_pivot.plot.barh(stacked=False,
#                          figsize=(16,10),
#                          fontsize=20,
#                          title='Total Families with 1 or More Employed by Borough').set(xlabel='Total Families with 1 or More Employed',
#                                                                                         ylabel='Borough')

In [None]:
# employed_pivot.plot.barh(subplots=True,
#                          figsize=(16,10),
#                          fontsize=20)