In [2]:

# Dependencies for statistical analysis

import pandas as pd
import numpy as np
import scipy.stats as stats
from pathlib import Path
from py_scripts.functions import Master_NC_Dataframe
from py_scripts.functions import column_list
from py_scripts.functions import data_categories



In [3]:

#Building the dataframe for data relevant to probabilities and proportions, part 1

race_and_infant_death = data_categories['per_1000']['Female_Other_Age'] + data_categories['per_1000']['Female_White_Age'] + data_categories['per_1000']['Poverty and Welfare'] + data_categories['per_1000']['Life Events and Family Planning']
racebyinfant_df = Master_NC_Dataframe[race_and_infant_death]

#Filtering down to the most populous counties

racebyinfant_df = racebyinfant_df.filter(items = [91, 59, 40, 33, 25], axis=0)



In [4]:
#Cleaning the dataframe and appropriately renaming the index to county names
finalinfantdf = racebyinfant_df.T
finalinfantdf.columns = ['Wake', 'Mecklenburg', 'Guilford', 'Forsyth', 'Cumberland']

finalinfantdf= finalinfantdf.T


In [5]:
#Using data from 

Female_Demo = { 'Counties' : ['Wake County', 'Mecklenburg County', 'Guilford County', 'Forsyth County', 'Cumberland County'],
                  'Percentages (White)': [0.571*0.517, 0.447*0.519, 0.472*0.527, 0.544*0.526, 0.398*0.505],
                  'Percentages (Black)': [0.181*0.517, 0.291*0.519, 0.331*0.527, 0.245*0.526, 0.371*0.505],
                  'White Population': [1197936*0.571*0.517, 1174237*0.447*0.519, 557166*0.472*0.527,
                                       392166*0.544*0.526, 339318*0.398*0.505],
                  'Black Population': [1197936*0.181*0.517, 1174237*0.291*0.519, 557166*0.331*0.527, 
                                       392166*0.245*0.526, 339318*0.371*0.505],
                  }

Female_top5_df = pd.DataFrame(Female_Demo).round(3)

Female_top5_df

Unnamed: 0,Counties,Percentages (White),Percentages (Black),White Population,Black Population
0,Wake County,0.295,0.094,353639.093,112099.257
1,Mecklenburg County,0.232,0.151,272414.764,177343.84
2,Guilford County,0.249,0.174,138591.7,97190.366
3,Forsyth County,0.286,0.129,112215.948,50538.432
4,Cumberland County,0.201,0.187,68199.525,63572.924


In [6]:
whiteblack_df = finalinfantdf[['White Persons in Poverty', 'Black Persons in Poverty']]

hello = whiteblack_df.reset_index()

hello2 = hello.rename(columns={'index':'Counties'})

hello2.replace('Wake', 'Wake County', inplace=True)
hello2.replace('Mecklenburg', 'Mecklenburg County', inplace=True)   
hello2.replace('Guilford', 'Guilford County', inplace=True) 
hello2.replace('Forsyth', 'Forsyth County', inplace=True)   
hello2.replace('Cumberland', 'Cumberland County', inplace=True)

poverty_db = pd.DataFrame.merge(Female_top5_df, hello2, how = 'outer', on='Counties')

poverty_db = poverty_db.set_index('Counties')
poverty_db['White Persons in Poverty'] = poverty_db['White Persons in Poverty'].astype(float)
poverty_db['Black Persons in Poverty'] = poverty_db['Black Persons in Poverty'].astype(float)
poverty_db['White Population'] = poverty_db['White Population'].astype(float)
poverty_db['Black Population'] = poverty_db['Black Population'].astype(float)

poverty_db['Percentage White in Poverty'] = poverty_db['White Persons in Poverty'] / poverty_db['White Population']
poverty_db['Percentage Black in Poverty'] = poverty_db['Black Persons in Poverty'] / poverty_db['Black Population']

poverty_db

Unnamed: 0_level_0,Percentages (White),Percentages (Black),White Population,Black Population,White Persons in Poverty,Black Persons in Poverty,Percentage White in Poverty,Percentage Black in Poverty
Counties,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wake County,0.295,0.094,353639.093,112099.257,41457.0,28820.0,0.11723,0.257094
Mecklenburg County,0.232,0.151,272414.764,177343.84,40657.0,47419.0,0.149247,0.267385
Guilford County,0.249,0.174,138591.7,97190.366,26870.0,39837.0,0.193879,0.409886
Forsyth County,0.286,0.129,112215.948,50538.432,28969.0,22023.0,0.258154,0.435767
Cumberland County,0.201,0.187,68199.525,63572.924,21133.0,28593.0,0.30987,0.449767


In [7]:
#In lieu of a Chi-Square test (since we are stuck with proportions and not pure counts), we compare the proportions.
# The null would be that the proportion of white to black populations should be the same as the proportion of white to black populations in poverty.
# A quick division of the two proportions shows that the proportion of white to black populations does not follow the same proportion as the proportion of white to black populations in poverty.

chi_groups = pd.DataFrame(poverty_db[['Percentages (White)', 'Percentages (Black)', 'Percentage White in Poverty' ,'Percentage Black in Poverty']])
chi_groups["Proportion of White to Black"] = chi_groups['Percentages (White)']/chi_groups['Percentages (Black)']
chi_groups["Proportion of White in Poverty to Black in Poverty"] = chi_groups['Percentage White in Poverty']/chi_groups['Percentage Black in Poverty']
Proportions = chi_groups[['Proportion of White to Black', 'Proportion of White in Poverty to Black in Poverty']]

Proportions




Unnamed: 0_level_0,Proportion of White to Black,Proportion of White in Poverty to Black in Poverty
Counties,Unnamed: 1_level_1,Unnamed: 2_level_1
Wake County,3.138298,0.455981
Mecklenburg County,1.536424,0.558172
Guilford County,1.431034,0.473006
Forsyth County,2.217054,0.592412
Cumberland County,1.074866,0.688957


In [8]:
#Now, building a dataframe for conditional probabilities 

probab_df = data_categories['per_1000']['Birth and Death Statistics'][:1] + data_categories['per_1000']['Female_White_Age'][3:19:1] + data_categories['per_1000']['Female_Other_Age'][3:19:1] + data_categories['per_1000']['Poverty and Welfare'] + data_categories['per_1000']['Life Events and Family Planning'][3:4]
prob_df = Master_NC_Dataframe[probab_df]


prob_df = prob_df.filter(items = [91, 59, 40, 33, 25], axis=0)

prob_df_final = prob_df[['Infant Deaths', 'Pregnancies for Females of All Ages', 'Persons in Poverty', 'Black Persons in Poverty', 'White Persons in Poverty', 'Am. Indian Alaska Native Persons in Poverty',
                         'Black Population','Hispanic Persons in Poverty', 'Families in Poverty', 'Families in Poverty with Female Householder', 'Families in Poverty/Female Householder & Children']]


prob_df_final


Unnamed: 0,Infant Deaths,Pregnancies for Females of All Ages,Persons in Poverty,Black Persons in Poverty,White Persons in Poverty,Am. Indian Alaska Native Persons in Poverty,Black Population,Hispanic Persons in Poverty,Families in Poverty,Families in Poverty with Female Householder,Families in Poverty/Female Householder & Children
91,69.0,15921.0,91083.0,28820.0,41457.0,316.0,208493.0,17637.0,14718.0,7230.0,6355.0
59,75.0,19326.0,117474.0,47419.0,40657.0,870.0,330458.0,29629.0,19972.0,11959.0,10563.0
40,45.0,8005.0,78408.0,39837.0,26870.0,696.0,181848.0,10701.0,14556.0,8949.0,7931.0
33,31.0,5401.0,58180.0,22023.0,28969.0,321.0,95324.0,14833.0,10181.0,5724.0,4861.0
25,50.0,6631.0,59484.0,28593.0,21133.0,1146.0,127610.0,7778.0,11950.0,7113.0,6225.0


In [9]:
#Renaming the index to the county names
prob_df_final["White Female Population"] = [663832*0.513, 520567*0.519, 263428*0.527, 214877*0.526, 141912*0.505]
prob_df_final["Black Female Population"] = [208439*0.513, 330458*0.519, 181848*0.527, 95324*0.526, 127610*0.505]
prob_df_final["White Population"] = [663832, 520567, 263428, 214877, 141912]
prob_df_final = prob_df_final.T
prob_df_final.columns = ['Wake', 'Mecklenburg', 'Guilford', 'Forsyth', 'Cumberland']

prob_df_final= prob_df_final.T

prob_df_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prob_df_final["White Female Population"] = [663832*0.513, 520567*0.519, 263428*0.527, 214877*0.526, 141912*0.505]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prob_df_final["Black Female Population"] = [208439*0.513, 330458*0.519, 181848*0.527, 95324*0.526, 127610*0.505]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

Unnamed: 0,Infant Deaths,Pregnancies for Females of All Ages,Persons in Poverty,Black Persons in Poverty,White Persons in Poverty,Am. Indian Alaska Native Persons in Poverty,Black Population,Hispanic Persons in Poverty,Families in Poverty,Families in Poverty with Female Householder,Families in Poverty/Female Householder & Children,White Female Population,Black Female Population,White Population
Wake,69.0,15921.0,91083.0,28820.0,41457.0,316.0,208493.0,17637.0,14718.0,7230.0,6355.0,340545.816,106929.207,663832.0
Mecklenburg,75.0,19326.0,117474.0,47419.0,40657.0,870.0,330458.0,29629.0,19972.0,11959.0,10563.0,270174.273,171507.702,520567.0
Guilford,45.0,8005.0,78408.0,39837.0,26870.0,696.0,181848.0,10701.0,14556.0,8949.0,7931.0,138826.556,95833.896,263428.0
Forsyth,31.0,5401.0,58180.0,22023.0,28969.0,321.0,95324.0,14833.0,10181.0,5724.0,4861.0,113025.302,50140.424,214877.0
Cumberland,50.0,6631.0,59484.0,28593.0,21133.0,1146.0,127610.0,7778.0,11950.0,7113.0,6225.0,71665.56,64443.05,141912.0


In [10]:
#Finding the conditional probabilities relevant 



ProbInf_by_Preg = round(prob_df_final['Infant Deaths'].sum() / prob_df_final['Pregnancies for Females of All Ages'].sum(), 4)
Prob_INF_by_Poverty = round(prob_df_final['Infant Deaths'].sum() / prob_df_final['Persons in Poverty'].sum(), 4)
Prob_Black_Poverty = round(prob_df_final['Black Persons in Poverty'].sum() / prob_df_final['Black Female Population'].sum(), 3)
Prob_White_Poverty = round(prob_df_final['White Persons in Poverty'].sum() / prob_df_final['White Female Population'].sum(),3)
Prob_INF_by_Race_W = round(prob_df_final['Infant Deaths'].sum() / prob_df_final['White Persons in Poverty'].sum(), 6) 
Prob_INF_by_Race_B = round(prob_df_final['Infant Deaths'].sum() / prob_df_final['Black Persons in Poverty'].sum(), 6)

print(f'The likelihood that you will be in poverty, given you are a black woman is {Prob_Black_Poverty}, \n which is much greater than the probability of being a white woman in poverty, or {Prob_White_Poverty}, even with the disparity in population.')
print("------------------------------------")
print(f"The probability of infant death given any pregnancy is {ProbInf_by_Preg}.")
print("------------------------------------")
print(f'The probabililty of a white woman in poverty having an infant death is {Prob_INF_by_Race_W}, \n whereas it is a {Prob_INF_by_Race_B} for a black woman in poverty. However, this is a slight reach \n as we do not have data on the racial makeup of those infant deaths.')



The likelihood that you will be in poverty, given you are a black woman is 0.341, 
 which is much greater than the probability of being a white woman in poverty, or 0.17, even with the disparity in population.
------------------------------------
The probability of infant death given any pregnancy is 0.0049.
------------------------------------
The probabililty of a white woman in poverty having an infant death is 0.001697, 
 whereas it is a 0.00162 for a black woman in poverty. However, this is a slight reach 
 as we do not have data on the racial makeup of those infant deaths.


In [11]:
#Conclusions and interpretations of the data


prob_df_final_county = prob_df_final.copy([['Black Population', 'White Population']])

prob_df_final_county['Black percentage in Poverty'] = prob_df_final['Black Persons in Poverty']/prob_df_final['Black Population']
prob_df_final_county['White percentage in Poverty'] = prob_df_final['White Persons in Poverty']/prob_df_final['White Population']
prob_df_final_county['Factor Difference (B/W)'] = prob_df_final_county['Black percentage in Poverty']/prob_df_final_county['White percentage in Poverty']


print(f'------------------------------------')

print(f'The following show the likelihood of a Black person being in poverty compared to a White person \n in poverty in the top 5 most populous counties in NC. The factor difference is stark in comparison to the percentage of population within poverty.')
prob_df_final_county['Factor Difference (B/W)']


------------------------------------
The following show the likelihood of a Black person being in poverty compared to a White person 
 in poverty in the top 5 most populous counties in NC. The factor difference is stark in comparison to the percentage of population within poverty.


Wake           2.213415
Mecklenburg    1.837289
Guilford       2.147694
Forsyth        1.713684
Cumberland     1.504641
Name: Factor Difference (B/W), dtype: float64