In [82]:
import pandas as pd
import numpy as np
import requests
import io
import wbgapi as wb
from bs4 import BeautifulSoup
import cache_magic 

# Old Femicide data import

In [83]:
# Raw-file URL of the previously processed femicide CSV; it is fetched with
# requests in the next cell and parsed into df_old_fem after that.
url = 'https://raw.githubusercontent.com/Tom-Whittington/Femicide_analysis/main/processed_data/original_femicide_data.csv'

In [84]:
%cache download = requests.get(url).content

loading cached value for variable 'download'. Time since pickling  146 days, 15:39:13.332832


In [85]:
# Columns stored as pandas categoricals; 'Year' is read as the pandas
# 'string' dtype rather than being parsed as a number.
old_fem_category_cols = ['iso3_code', 'Region', 'Subregion', 'country',
                         'Indicator', 'Disaggregation', 'Gender', 'Source',
                         'Unit']
old_fem_dtypes = {**dict.fromkeys(old_fem_category_cols, 'category'),
                  'Year': 'string'}

# `download` holds the raw bytes fetched above; decode them as UTF-8 and let
# pandas read the text through an in-memory buffer.
old_fem_csv_text = download.decode('utf-8')
df_old_fem = pd.read_csv(io.StringIO(old_fem_csv_text), dtype=old_fem_dtypes)
df_old_fem

Unnamed: 0,Region,Subregion,country,iso3_code,Indicator,Disaggregation,Gender,Source,Unit,Year,Value,Footnote
0,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2009,1115,
1,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2009,3.93,
2,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2010,983,
3,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2010,3.37,
4,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2011,1231,
...,...,...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2006,8.82,
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2010,711,
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2010,5.6,
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2012,981,


1. NY.GNP.PCAP.CD = Gross national income per capita
2. SI.POV.GINI = GINI coefficient
3. VC.IHR.PSRC.FE.P5 = Intentional homicides, female (per 100,000 female)
4. SH.STA.SUIC.FE.P5 = Suicide mortality rate, female (per 100,000 female population)
5. SH.MMR.DTHS = Number of maternal deaths (raw)
6. SH.PRV.SMOK.FE = Prevalence of current tobacco use, females (% of female adults)
7. SP.POP.TOTL.FE.IN = Population, female

In [86]:
%cache df_wb = wb.data.DataFrame(['NY.GNP.PCAP.CD','SI.POV.GINI','VC.IHR.PSRC.FE.P5', 'SH.STA.SUIC.FE.P5', 'SH.MMR.DTHS','SP.POP.TOTL.FE.IN' ], time=range(1990,2023), labels=True, skipBlanks =True, columns='series', skipAggs=True)
df_wb

loading cached value for variable 'df_wb'. Time since pickling  0:21:46.688520


Unnamed: 0_level_0,Unnamed: 1_level_0,Country,Time,NY.GNP.PCAP.CD,SH.MMR.DTHS,SH.PRV.SMOK.FE,SH.STA.SUIC.FE.P5,SI.POV.GINI,SP.POP.TOTL.FE.IN,VC.IHR.PSRC.FE.P5
economy,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ZWE,YR2022,Zimbabwe,2022,1500.0,,,,,8614935.0,
ZWE,YR2021,Zimbabwe,2021,1530.0,,,,,8449834.0,1.869859
ZWE,YR2020,Zimbabwe,2020,1460.0,1700.0,1.5,,,8284447.0,1.762339
ZWE,YR2019,Zimbabwe,2019,1450.0,1900.0,1.6,8.8,50.3,8122618.0,2.055987
ZWE,YR2018,Zimbabwe,2018,1550.0,1700.0,1.7,8.7,,7966181.0,2.046150
...,...,...,...,...,...,...,...,...,...,...
AFG,YR1994,Afghanistan,1994,,,,,,7722096.0,
AFG,YR1993,Afghanistan,1993,,,,,,7000119.0,
AFG,YR1992,Afghanistan,1992,,,,,,6028939.0,
AFG,YR1991,Afghanistan,1991,,,,,,5372208.0,


In [87]:
# Readable names for the index-derived 'economy' column, the label columns
# and the World Bank series codes.
wb_column_names = {
    'economy': 'iso3',
    'Time': 'year',
    'NY.GNP.PCAP.CD': 'gnipc',
    'SI.POV.GINI': 'gini',
    'Country': 'country',
    'SH.MMR.DTHS': 'maternal deaths (count)',
    'SH.PRV.SMOK.FE': '% tobacco',
    'VC.IHR.PSRC.FE.P5': 'femicide/100k_fem',
    'SH.STA.SUIC.FE.P5': 'suicide mortality/100k_fem',
    'SP.POP.TOTL.FE.IN': 'fem_pop',
}

# Move the (economy, time) index levels into ordinary columns, rename, then
# discard the lower-case 'time' level (values like 'YR2022'); the 'Time'
# label column survives as 'year'.
df_wb = (
    df_wb
    .reset_index()
    .rename(columns=wb_column_names)
    .drop(columns='time')
)
df_wb

Unnamed: 0,iso3,country,year,gnipc,maternal deaths (count),% tobacco,suicide mortality/100k_fem,gini,fem_pop,femicide/100k_fem
0,ZWE,Zimbabwe,2022,1500.0,,,,,8614935.0,
1,ZWE,Zimbabwe,2021,1530.0,,,,,8449834.0,1.869859
2,ZWE,Zimbabwe,2020,1460.0,1700.0,1.5,,,8284447.0,1.762339
3,ZWE,Zimbabwe,2019,1450.0,1900.0,1.6,8.8,50.3,8122618.0,2.055987
4,ZWE,Zimbabwe,2018,1550.0,1700.0,1.7,8.7,,7966181.0,2.046150
...,...,...,...,...,...,...,...,...,...,...
7156,AFG,Afghanistan,1994,,,,,,7722096.0,
7157,AFG,Afghanistan,1993,,,,,,7000119.0,
7158,AFG,Afghanistan,1992,,,,,,6028939.0,
7159,AFG,Afghanistan,1991,,,,,,5372208.0,


In [88]:
# Target dtypes per column. 'maternal deaths (count)' is not listed, so it
# keeps whatever dtype it arrived with.
# NOTE(review): a plain 'int32' cannot hold missing values -- confirm that
# fem_pop never has gaps in the downloaded data.
wb_float_cols = ['gnipc', '% tobacco', 'suicide mortality/100k_fem', 'gini',
                 'femicide/100k_fem']
wb_dtypes = {
    **dict.fromkeys(['iso3', 'country'], 'category'),
    **dict.fromkeys(wb_float_cols, 'float'),
    'fem_pop': 'int32',
}

# Cast the columns, then turn the four-digit year into a datetime (the
# displayed result shows each year as 1 January of that year).
df_wb = (
    df_wb
    .astype(wb_dtypes)
    .assign(year=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
)

df_wb

Unnamed: 0,iso3,country,year,gnipc,maternal deaths (count),% tobacco,suicide mortality/100k_fem,gini,fem_pop,femicide/100k_fem
0,ZWE,Zimbabwe,2022-01-01,1500.0,,,,,8614935,
1,ZWE,Zimbabwe,2021-01-01,1530.0,,,,,8449834,1.869859
2,ZWE,Zimbabwe,2020-01-01,1460.0,1700.0,1.5,,,8284447,1.762339
3,ZWE,Zimbabwe,2019-01-01,1450.0,1900.0,1.6,8.8,50.3,8122618,2.055987
4,ZWE,Zimbabwe,2018-01-01,1550.0,1700.0,1.7,8.7,,7966181,2.046150
...,...,...,...,...,...,...,...,...,...,...
7156,AFG,Afghanistan,1994-01-01,,,,,,7722096,
7157,AFG,Afghanistan,1993-01-01,,,,,,7000119,
7158,AFG,Afghanistan,1992-01-01,,,,,,6028939,
7159,AFG,Afghanistan,1991-01-01,,,,,,5372208,


In [89]:
# Key every row by country code and year (a two-level index).
wb_index_cols = ['iso3', 'year']
df_wb = df_wb.set_index(wb_index_cols)

df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,maternal deaths (count),% tobacco,suicide mortality/100k_fem,gini,fem_pop,femicide/100k_fem
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ZWE,2022-01-01,Zimbabwe,1500.0,,,,,8614935,
ZWE,2021-01-01,Zimbabwe,1530.0,,,,,8449834,1.869859
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,1.5,,,8284447,1.762339
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,1.6,8.8,50.3,8122618,2.055987
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,1.7,8.7,,7966181,2.046150
...,...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,,7722096,
AFG,1993-01-01,Afghanistan,,,,,,7000119,
AFG,1992-01-01,Afghanistan,,,,,,6028939,
AFG,1991-01-01,Afghanistan,,,,,,5372208,
