# Dataset Creation

In [635]:
import pandas as pd
import numpy as np
import requests
import io
import wbgapi as wb
from bs4 import BeautifulSoup
import cache_magic #pip install ipython_cache
import country_converter as coco

# Femicide Data

### Femicide data is downloaded from github and loaded as df_fem

In [636]:
url = 'https://raw.githubusercontent.com/Tom-Whittington/Femicide_analysis/main/processed_data/original_femicide_data.csv'

In [637]:
%cache download = requests.get(url).content

loading cached value for variable 'download'. Time since pickling  148 days, 1:47:15.801017


In [638]:
df_fem = pd.read_csv(io.StringIO(download.decode('utf-8')),
                dtype={'iso3_code' : 'category',
                           'Region' : 'category',
                           'Subregion' : 'category',
                           'country' : 'category',
                           'Indicator' : 'category',
                           'Gender' : 'category',
                           'Source' : 'category',
                           'Unit' : 'category',
                          'Year' : 'string'})


In [639]:
df_fem

Unnamed: 0,Region,Subregion,country,iso3_code,Indicator,Disaggregation,Gender,Source,Unit,Year,Value,Footnote
0,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2009,1115,
1,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2009,3.93,
2,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2010,983,
3,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2010,3.37,
4,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2011,1231,
...,...,...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2006,8.82,
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2010,711,
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2010,5.6,
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2012,981,


In [640]:
df_fem.dtypes

Region            category
Subregion         category
country           category
iso3_code         category
Indicator         category
Disaggregation      object
Gender            category
Source            category
Unit              category
Year                string
Value               object
Footnote            object
dtype: object

### Columns are renamed

In [641]:
df_fem.columns = [col.lower() for col in df_fem.columns]
df_fem = df_fem.rename(columns={'iso3_code' : 'iso3'})
df_fem['year'] = pd.to_datetime(df_fem['year'], format='%Y')
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
0,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2009-01-01,1115,
1,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2009-01-01,3.93,
2,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2010-01-01,983,
3,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2010-01-01,3.37,
4,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2011-01-01,1231,
...,...,...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2006-01-01,8.82,
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2010-01-01,711,
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2010-01-01,5.6,
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2012-01-01,981,


In [642]:
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
0,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2009-01-01,1115,
1,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2009-01-01,3.93,
2,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2010-01-01,983,
3,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2010-01-01,3.37,
4,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2011-01-01,1231,
...,...,...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2006-01-01,8.82,
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2010-01-01,711,
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2010-01-01,5.6,
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2012-01-01,981,


### Filters out totals and only keeps male and female data

In [643]:
df_fem = df_fem.loc[df_fem['gender'] == 'Female']
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
8,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2015-01-01,93,
9,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2015-01-01,0.56,
14,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2016-01-01,101,
15,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2016-01-01,0.59,
20,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2017-01-01,133,
...,...,...,...,...,...,...,...,...,...,...,...,...
24285,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,"Rate per 100,000 population",2016-01-01,5.36,
24290,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,Count,2017-01-01,788,
24291,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,"Rate per 100,000 population",2017-01-01,5.31,
24372,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,Count,1990-01-01,183,


### Removes killed by intimate partner disaggregation

In [644]:
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
8,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2015-01-01,93,
9,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2015-01-01,0.56,
14,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2016-01-01,101,
15,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2016-01-01,0.59,
20,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2017-01-01,133,
...,...,...,...,...,...,...,...,...,...,...,...,...
24285,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,"Rate per 100,000 population",2016-01-01,5.36,
24290,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,Count,2017-01-01,788,
24291,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Female,CTS,"Rate per 100,000 population",2017-01-01,5.31,
24372,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,Count,1990-01-01,183,


### Filters out counts and leaves only count, sorts values and converts value column to numeric

In [645]:
df_fem = df_fem[df_fem['unit'] != 'Rate per  100,000 population']
df_fem = df_fem.sort_values(['year', 'gender', 'iso3'])
df_fem['value'] = pd.to_numeric(df_fem['value'], errors='coerce')
df_fem['iso3'] = df_fem['iso3'].replace({'KOS' : 'XKX'})
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
1192,Americas,Latin America and the Caribbean,Aruba,ABW,Homicide: # of victims,-,Female,MD,Count,1990-01-01,0.0,
898,Asia,Western Asia,Armenia,ARM,Homicide: # of victims,-,Female,UNECE,Count,1990-01-01,22.0,
674,Americas,Latin America and the Caribbean,Antigua and Barbuda,ATG,Homicide: # of victims,-,Female,MD,Count,1990-01-01,1.0,
1294,Oceania,Australia and New Zealand,Australia,AUS,Homicide: # of victims,-,Female,MD,Count,1990-01-01,138.0,
1560,Europe,Western Europe,Austria,AUT,Homicide: # of victims,-,Female,MD,Count,1990-01-01,39.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
22436,Asia,Central Asia,Tajikistan,TJK,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2020-01-01,4.0,IPFM - Intimate partner or family member
22846,Africa,Northern Africa,Tunisia,TUN,Homicide: # of victims,-,Female,External,Count,2020-01-01,115.0,
23730,Americas,Northern America,United States of America,USA,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,FBI,Count,2020-01-01,1420.0,IPFM - Intimate partner or family member
24138,Oceania,Melanesia,Vanuatu,VUT,Homicide: # of victims,-,Female,External,Count,2020-01-01,1.0,


In [646]:
df_IPV = df_fem[df_fem['disaggregation'].str.contains('Killed')]
df_IPV

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
120,Europe,Southern Europe,Albania,ALB,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2005-01-01,7.0,IPFM - Intimate partner or family member
992,Asia,Western Asia,Armenia,ARM,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2005-01-01,2.0,IPFM - Intimate partner or family member
1390,Oceania,Australia and New Zealand,Australia,AUS,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2005-01-01,70.0,IPFM - Intimate partner or family member
1654,Europe,Western Europe,Austria,AUT,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2005-01-01,39.0,IPFM - Intimate partner or family member
2058,Americas,Latin America and the Caribbean,Bahamas,BHS,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2005-01-01,4.0,IPFM - Intimate partner or family member
...,...,...,...,...,...,...,...,...,...,...,...,...
21990,Europe,Northern Europe,Sweden,SWE,Homicide: # of victims - perpetrator IP,Killed by Intimate partner,Female,CTS,Count,2020-01-01,13.0,IP - Intimate partner
21996,Europe,Northern Europe,Sweden,SWE,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,External,Count,2020-01-01,13.0,IPFM - Intimate partner or family member
22436,Asia,Central Asia,Tajikistan,TJK,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,2020-01-01,4.0,IPFM - Intimate partner or family member
23730,Americas,Northern America,United States of America,USA,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,FBI,Count,2020-01-01,1420.0,IPFM - Intimate partner or family member


In [647]:
df_fem = df_fem[df_fem['disaggregation'] == '-']
df_fem = df_fem.set_index(['iso3','year'])

In [648]:
df_fem

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,indicator,disaggregation,gender,source,unit,value,footnote
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABW,1990-01-01,Americas,Latin America and the Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,0.0,
ARM,1990-01-01,Asia,Western Asia,Armenia,Homicide: # of victims,-,Female,UNECE,Count,22.0,
ATG,1990-01-01,Americas,Latin America and the Caribbean,Antigua and Barbuda,Homicide: # of victims,-,Female,MD,Count,1.0,
AUS,1990-01-01,Oceania,Australia and New Zealand,Australia,Homicide: # of victims,-,Female,MD,Count,138.0,
AUT,1990-01-01,Europe,Western Europe,Austria,Homicide: # of victims,-,Female,MD,Count,39.0,
...,...,...,...,...,...,...,...,...,...,...,...
SVN,2020-01-01,Europe,Southern Europe,Slovenia,Homicide: # of victims,-,Female,Covid-19,Count,6.0,
SWE,2020-01-01,Europe,Northern Europe,Sweden,Homicide: # of victims,-,Female,External,Count,25.0,
TJK,2020-01-01,Asia,Central Asia,Tajikistan,Homicide: # of victims,-,Female,CTS,Count,10.0,
TUN,2020-01-01,Africa,Northern Africa,Tunisia,Homicide: # of victims,-,Female,External,Count,115.0,


# Economic data

## Worldbank data import

### Economic data is imported from worldbank API
1. NY.GNP.PCAP.CD represents gross national income per capita

2. SI.POV.GINI represents the gini coefficient of each country

3. Years 2000-2023 are selected

4. Labels also returns country code

5. skipBlanks is turned off so we have a row for each year (easier merging)

6. each series is added to column

7. skipAggs returns only countries and not regions/aggregations

8. Result is cached as can be slow to return

In [649]:
%cache df_wb = wb.data.DataFrame(['NY.GNP.PCAP.CD','SI.POV.GINI', 'SP.POP.TOTL.FE.IN', 'SH.STA.SUIC.FE.P5', 'SH.MMR.DTHS', 'VC.IHR.PSRC.FE.P5'], time=range(1990,2023), labels=True, skipBlanks =False, columns='series', skipAggs=True)

loading cached value for variable 'df_wb'. Time since pickling  1 day, 7:15:48.231817


### Index is reset, columns renamed and converted to datetime format. Extra time column is dropped and index is set to country code and year

In [650]:
df_wb = df_wb.reset_index()

df_wb = df_wb.rename(columns={'economy':'iso3',
                        'Time':'year',
                        'NY.GNP.PCAP.CD':'gnipc',
                        'SI.POV.GINI':'gini',
                        'Country':'country',
                        'SP.POP.TOTL.FE.IN' : 'fem_pop',
                        'SH.STA.SUIC.FE.P5' : 'suicide_rate',
                        'SH.MMR.DTHS' : 'mat_deaths',
                        'VC.IHR.PSRC.FE.P5' : 'femicide_rate'})

df_wb['year'] = pd.to_datetime(df_wb['year'], format='%Y')

df_wb['gnipc'] = pd.to_numeric(df_wb['gnipc'], errors='coerce')
df_wb['gini'] = pd.to_numeric(df_wb['gini'], errors='coerce')

df_wb = df_wb.drop(columns='time')

df_wb = df_wb.set_index(['iso3', 'year'])
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZWE,2022-01-01,Zimbabwe,1500.0,,,,8614935.0,
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834.0,1.869859
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447.0,1.762339
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618.0,2.055987
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181.0,2.046150
...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096.0,
AFG,1993-01-01,Afghanistan,,,,,7000119.0,
AFG,1992-01-01,Afghanistan,,,,,6028939.0,
AFG,1991-01-01,Afghanistan,,,,,5372208.0,


In [651]:
df_wb['fem_pop'] = df_wb['fem_pop'].astype('int64')

In [652]:
df_wb.isna().sum()

country             0
gnipc            1108
mat_deaths       3276
suicide_rate     3501
gini             5353
fem_pop             0
femicide_rate    4622
dtype: int64

In [653]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZWE,2022-01-01,Zimbabwe,1500.0,,,,8614935,
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150
...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,
AFG,1993-01-01,Afghanistan,,,,,7000119,
AFG,1992-01-01,Afghanistan,,,,,6028939,
AFG,1991-01-01,Afghanistan,,,,,5372208,


In [654]:
df_wb['suicide_count'] =round((df_wb['suicide_rate'] / 100000) * df_wb['fem_pop'], 0)
df_wb['femicide_count'] = round((df_wb['femicide_rate'] / 100000) * df_wb['fem_pop'], 0)
df_wb['source'] = 'WB'

In [655]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ZWE,2022-01-01,Zimbabwe,1500.0,,,,8614935,,,,WB
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859,,158.0,WB
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339,,146.0,WB
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987,715.0,167.0,WB
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150,693.0,163.0,WB
...,...,...,...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,,,,WB
AFG,1993-01-01,Afghanistan,,,,,7000119,,,,WB
AFG,1992-01-01,Afghanistan,,,,,6028939,,,,WB
AFG,1991-01-01,Afghanistan,,,,,5372208,,,,WB


In [656]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ZWE,2022-01-01,Zimbabwe,1500.0,,,,8614935,,,,WB
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859,,158.0,WB
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339,,146.0,WB
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987,715.0,167.0,WB
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150,693.0,163.0,WB
...,...,...,...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,,,,WB
AFG,1993-01-01,Afghanistan,,,,,7000119,,,,WB
AFG,1992-01-01,Afghanistan,,,,,6028939,,,,WB
AFG,1991-01-01,Afghanistan,,,,,5372208,,,,WB


In [657]:
df_wb_femicide =df_wb.reset_index()
df_wb_femicide = df_wb_femicide[['iso3', 'year', 'femicide_count', 'source']]
df_wb_femicide = df_wb_femicide.set_index(['iso3', 'year'])
df_wb_femicide

Unnamed: 0_level_0,Unnamed: 1_level_0,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,2022-01-01,,WB
ZWE,2021-01-01,158.0,WB
ZWE,2020-01-01,146.0,WB
ZWE,2019-01-01,167.0,WB
ZWE,2018-01-01,163.0,WB
...,...,...,...
AFG,1994-01-01,,WB
AFG,1993-01-01,,WB
AFG,1992-01-01,,WB
AFG,1991-01-01,,WB


In [658]:
df_wb_femicide = df_wb_femicide.dropna()
df_wb_femicide

Unnamed: 0_level_0,Unnamed: 1_level_0,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,2021-01-01,158.0,WB
ZWE,2020-01-01,146.0,WB
ZWE,2019-01-01,167.0,WB
ZWE,2018-01-01,163.0,WB
ZWE,1990-01-01,183.0,WB
...,...,...,...
ALB,1994-01-01,12.0,WB
ALB,1993-01-01,16.0,WB
ALB,1992-01-01,14.0,WB
AFG,2019-01-01,238.0,WB


Filter out all rows that are already in the previous dataset. Important to note that some years have differences in the values and frequently 2021 is significantly different (maybe due to reporting being done mid year?)

In [659]:
df_wb_femicide = df_wb_femicide[~df_wb_femicide.index.isin(df_fem.index)]
df_wb_femicide = df_wb_femicide.rename(columns={'femicide_count' : 'value'})
df_wb_femicide

Unnamed: 0_level_0,Unnamed: 1_level_0,value,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,2021-01-01,158.0,WB
ZWE,2020-01-01,146.0,WB
ZWE,2019-01-01,167.0,WB
ZWE,2018-01-01,163.0,WB
PSE,2021-01-01,9.0,WB
...,...,...,...
DZA,2018-01-01,86.0,WB
DZA,2017-01-01,56.0,WB
DZA,2016-01-01,75.0,WB
ALB,2021-01-01,12.0,WB


In [660]:
df_fem = pd.concat([df_fem, df_wb_femicide])
df_fem = df_fem.reset_index()
df_fem[['gender', 'disaggregation', 'indicator', 'unit']] = ['Female', '-', 'Homicide: # of victims', 'Count']
df_fem = pd.concat([df_fem, df_IPV])
df_fem

Unnamed: 0,iso3,year,region,subregion,country,indicator,disaggregation,gender,source,unit,value,footnote
0,ABW,1990-01-01,Americas,Latin America and the Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,0.0,
1,ARM,1990-01-01,Asia,Western Asia,Armenia,Homicide: # of victims,-,Female,UNECE,Count,22.0,
2,ATG,1990-01-01,Americas,Latin America and the Caribbean,Antigua and Barbuda,Homicide: # of victims,-,Female,MD,Count,1.0,
3,AUS,1990-01-01,Oceania,Australia and New Zealand,Australia,Homicide: # of victims,-,Female,MD,Count,138.0,
4,AUT,1990-01-01,Europe,Western Europe,Austria,Homicide: # of victims,-,Female,MD,Count,39.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
21990,SWE,2020-01-01,Europe,Northern Europe,Sweden,Homicide: # of victims - perpetrator IP,Killed by Intimate partner,Female,CTS,Count,13.0,IP - Intimate partner
21996,SWE,2020-01-01,Europe,Northern Europe,Sweden,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,External,Count,13.0,IPFM - Intimate partner or family member
22436,TJK,2020-01-01,Asia,Central Asia,Tajikistan,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,4.0,IPFM - Intimate partner or family member
23730,USA,2020-01-01,Americas,Northern America,United States of America,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,FBI,Count,1420.0,IPFM - Intimate partner or family member


In [661]:
converter = coco.CountryConverter()
df_fem['region'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="continent")
df_fem['subregion'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="UNregion")
df_fem['country'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="name_short")

In [662]:
df_fem = df_fem.dropna(subset='value')
df_fem

Unnamed: 0,iso3,year,region,subregion,country,indicator,disaggregation,gender,source,unit,value,footnote
0,ABW,1990-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,0.0,
1,ARM,1990-01-01,Asia,Western Asia,Armenia,Homicide: # of victims,-,Female,UNECE,Count,22.0,
2,ATG,1990-01-01,America,Caribbean,Antigua and Barbuda,Homicide: # of victims,-,Female,MD,Count,1.0,
3,AUS,1990-01-01,Oceania,Australia and New Zealand,Australia,Homicide: # of victims,-,Female,MD,Count,138.0,
4,AUT,1990-01-01,Europe,Western Europe,Austria,Homicide: # of victims,-,Female,MD,Count,39.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
21990,SWE,2020-01-01,Europe,Northern Europe,Sweden,Homicide: # of victims - perpetrator IP,Killed by Intimate partner,Female,CTS,Count,13.0,IP - Intimate partner
21996,SWE,2020-01-01,Europe,Northern Europe,Sweden,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,External,Count,13.0,IPFM - Intimate partner or family member
22436,TJK,2020-01-01,Asia,Central Asia,Tajikistan,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,CTS,Count,4.0,IPFM - Intimate partner or family member
23730,USA,2020-01-01,America,Northern America,United States,Homicide: # of victims - perpetrator IPFM,Killed by Intimate partner or family member (I...,Female,FBI,Count,1420.0,IPFM - Intimate partner or family member


## CIA data

### Gets webpage from url

In [663]:
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

%cache r = requests.get(url)

soup = BeautifulSoup(r.text, 'lxml')

loading cached value for variable 'r'. Time since pickling  148 days, 1:47:01.230914


### Finds first table on page, extracts headers from first row and then loops over table and creates dataframe from values

In [664]:
table = soup.findAll('table')[0]

headers = [x.text for x in table.findChildren('th')[1:]]

row_list = headers
table_list = []
for row in table.findChildren('tr'):

    for cell in row.findChildren('td')[1:]:
        row_list.append(cell.text)
    table_list.append(row_list)
    row_list = []

df_cia = pd.DataFrame(table_list)

### Adds gini label to first row and then promotes first row to headers 

In [665]:
df_cia.iloc[0,1]='gini'

df_cia.columns= df_cia.iloc[0,:]

df_cia = df_cia[1:]

df_cia

Unnamed: 0,Country,gini,Date of Information
1,South Africa,63.0,2014 est.
2,Namibia,59.1,2015 est.
3,Zambia,57.1,2015 est.
4,Central African Republic,56.2,2008 est.
5,Eswatini,54.6,2016 est.
...,...,...,...
173,Slovakia,25.2,2016 est.
174,Belarus,24.4,2020 est.
175,Slovenia,24.2,2017 est.
176,Faroe Islands,22.7,2013 est.


### iso3 codes are generated for CIA data set by the worldbank api, Nas are dropped, year and iso3 set as index ready for filling in blanks 

In [666]:
df_cia['iso3'] = wb.economy.coder(df_cia['Country'])
df_cia = df_cia.dropna(subset='iso3')
df_cia['year'] =df_cia['Date of Information'].str[:4]
df_cia = df_cia[['iso3', 'year', 'Country', 'gini']]
df_cia['year'] = pd.to_datetime(df_cia['year'], format='%Y')
df_cia = df_cia.set_index(['iso3', 'year'])
df_cia

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cia['year'] =df_cia['Date of Information'].str[:4]


Unnamed: 0_level_0,Unnamed: 1_level_0,Country,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZAF,2014-01-01,South Africa,63.0
NAM,2015-01-01,Namibia,59.1
ZMB,2015-01-01,Zambia,57.1
CAF,2008-01-01,Central African Republic,56.2
SWZ,2016-01-01,Eswatini,54.6
...,...,...,...
ARM,2020-01-01,Armenia,25.2
SVK,2016-01-01,Slovakia,25.2
BLR,2020-01-01,Belarus,24.4
SVN,2017-01-01,Slovenia,24.2


In [667]:
df_wb.isna().sum()

country              0
gnipc             1108
mat_deaths        3276
suicide_rate      3501
gini              5353
fem_pop              0
femicide_rate     4622
suicide_count     3501
femicide_count    4622
source               0
dtype: int64

### Fills in any empty cells in wb dataset with values from cia dataset ~17 extra countries. gini column set back to float

In [668]:
df_eco = df_wb.copy()

df_eco['gini'] = df_eco['gini'].fillna(df_cia['gini'])

df_eco = df_eco.sort_index()

df_eco['gini'] =df_eco['gini'].astype('float64')

df_eco

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABW,1990-01-01,Aruba,11570.0,,,,33480,0.000000,,0.0,WB
ABW,1991-01-01,Aruba,12560.0,,,,34657,,,,WB
ABW,1992-01-01,Aruba,13610.0,,,,35941,,,,WB
ABW,1993-01-01,Aruba,14640.0,,,,37137,,,,WB
ABW,1994-01-01,Aruba,16060.0,,,,38437,,,,WB
...,...,...,...,...,...,...,...,...,...,...,...
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150,693.0,163.0,WB
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987,715.0,167.0,WB
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339,,146.0,WB
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859,,158.0,WB


# Merging

### The dichotamised economic dataframe is merged with the femicide dataframe on the shared iso3 index

In [669]:
df_fem = df_fem.set_index(['iso3', 'year'])


In [670]:
df_fem.index.dtypes

iso3            object
year    datetime64[ns]
dtype: object

In [671]:
df = df_fem.merge(df_eco, how='left', left_index=True, right_index=True).rename(columns={'country_x' : 'country'}).drop(columns=['country_y'])
df = df.sort_index()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,indicator,disaggregation,gender,source_x,unit,value,footnote,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count,source_y
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ABW,1990-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,0.0,,11570.0,,,,33480.0,0.000000,,0.0,WB
ABW,1995-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,0.0,,16930.0,,,,39724.0,0.000000,,0.0,WB
ABW,1999-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,1.0,,18320.0,,,,45050.0,2.219731,,1.0,WB
ABW,2001-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,2.0,,20510.0,,,,47178.0,4.239264,,2.0,WB
ABW,2002-01-01,America,Caribbean,Aruba,Homicide: # of victims,-,Female,MD,Count,1.0,,19290.0,,,,47831.0,2.090672,,1.0,WB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWE,1990-01-01,Africa,Eastern Africa,Zimbabwe,Homicide: # of victims,-,Female,MD,Count,183.0,,740.0,,,,5247850.0,3.487142,,183.0,WB
ZWE,2018-01-01,Africa,Eastern Africa,Zimbabwe,Homicide: # of victims,-,Female,WB,Count,163.0,,1550.0,1700.0,8.7,,7966181.0,2.046150,693.0,163.0,WB
ZWE,2019-01-01,Africa,Eastern Africa,Zimbabwe,Homicide: # of victims,-,Female,WB,Count,167.0,,1450.0,1900.0,8.8,50.3,8122618.0,2.055987,715.0,167.0,WB
ZWE,2020-01-01,Africa,Eastern Africa,Zimbabwe,Homicide: # of victims,-,Female,WB,Count,146.0,,1460.0,1700.0,,,8284447.0,1.762339,,146.0,WB
