# Dataset Creation

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import wbgapi as wb
from bs4 import BeautifulSoup
import cache_magic #pip install ipython_cache
import country_converter as coco

%cache magic is now registered in ipython


# Femicide Data

### Femicide data is downloaded from github and loaded as df_fem

In [2]:
url = 'https://raw.githubusercontent.com/Tom-Whittington/Femicide_analysis/main/processed_data/original_femicide_data.csv'

In [3]:
%cache download = requests.get(url).content

loading cached value for variable 'download'. Time since pickling  149 days, 21:55:12.330008


In [4]:
df_fem_old = pd.read_csv(io.StringIO(download.decode('utf-8')),
                     usecols =['Region',
                               'Subregion',
                               'country',
                               'iso3_code',
                               'Disaggregation',
                               'Gender',
                               'Source',
                               'Unit',
                               'Year',
                               'Value'],
                     dtype={'iso3_code' : 'category',
                           'Region' : 'category',
                           'Subregion' : 'category',
                           'country' : 'category',
                           'Gender' : 'category',
                           'Source' : 'category',
                           'Unit' : 'category',
                          'Year' : 'string'})


In [5]:
df_fem_old

Unnamed: 0,Region,Subregion,country,iso3_code,Disaggregation,Gender,Source,Unit,Year,Value
0,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2009,1115
1,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2009,3.93
2,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2010,983
3,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2010,3.37
4,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2011,1231
...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2006,8.82
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2010,711
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2010,5.6
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2012,981


In [6]:
df_fem_old.dtypes

Region            category
Subregion         category
country           category
iso3_code         category
Disaggregation      object
Gender            category
Source            category
Unit              category
Year                string
Value               object
dtype: object

### Columns are renamed

In [7]:
df_fem_old.columns = [col.lower() for col in df_fem_old.columns]
df_fem_old = df_fem_old.rename(columns={'iso3_code' : 'iso3'})
df_fem_old['year'] = pd.to_datetime(df_fem_old['year'], format='%Y')

In [8]:
df_fem_old

Unnamed: 0,region,subregion,country,iso3,disaggregation,gender,source,unit,year,value
0,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2009-01-01,1115
1,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2009-01-01,3.93
2,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2010-01-01,983
3,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2010-01-01,3.37
4,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2011-01-01,1231
...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2006-01-01,8.82
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2010-01-01,711
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2010-01-01,5.6
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2012-01-01,981


### Filters out totals and only keeps male and female data

In [9]:
df_fem_old = df_fem_old.loc[df_fem_old['gender'] == 'Female']
df_fem_old = df_fem_old.drop(columns='gender')
df_fem_old

Unnamed: 0,region,subregion,country,iso3,disaggregation,source,unit,year,value
8,Asia,Southern Asia,Afghanistan,AFG,-,NSO,Count,2015-01-01,93
9,Asia,Southern Asia,Afghanistan,AFG,-,NSO,"Rate per 100,000 population",2015-01-01,0.56
14,Asia,Southern Asia,Afghanistan,AFG,-,NSO,Count,2016-01-01,101
15,Asia,Southern Asia,Afghanistan,AFG,-,NSO,"Rate per 100,000 population",2016-01-01,0.59
20,Asia,Southern Asia,Afghanistan,AFG,-,NSO,Count,2017-01-01,133
...,...,...,...,...,...,...,...,...,...
24285,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,-,CTS,"Rate per 100,000 population",2016-01-01,5.36
24290,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,-,CTS,Count,2017-01-01,788
24291,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,-,CTS,"Rate per 100,000 population",2017-01-01,5.31
24372,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,MD,Count,1990-01-01,183


### Filters out counts and leaves only count, sorts values and converts value column to numeric

In [10]:
df_fem_old = df_fem_old[df_fem_old['unit'] != 'Rate per  100,000 population']
df_fem_old = df_fem_old.drop(columns='unit')
df_fem_old['value'] = pd.to_numeric(df_fem_old['value'], errors='coerce')
df_fem_old['iso3'] = df_fem_old['iso3'].replace({'KOS' : 'XKX'})
df_fem_old = df_fem_old.sort_values(['iso3', 'year'])
df_fem_old

Unnamed: 0,region,subregion,country,iso3,disaggregation,source,year,value
1192,Americas,Latin America and the Caribbean,Aruba,ABW,-,MD,1990-01-01,0.0
1198,Americas,Latin America and the Caribbean,Aruba,ABW,-,MD,1995-01-01,0.0
1204,Americas,Latin America and the Caribbean,Aruba,ABW,-,MD,1999-01-01,1.0
1210,Americas,Latin America and the Caribbean,Aruba,ABW,-,MD,2001-01-01,2.0
1216,Americas,Latin America and the Caribbean,Aruba,ABW,-,MD,2002-01-01,1.0
...,...,...,...,...,...,...,...,...
21172,Africa,Sub-Saharan Africa,South Africa,ZAF,-,NP,2011-01-01,2399.0
21188,Africa,Sub-Saharan Africa,South Africa,ZAF,-,NP,2016-01-01,2639.0
21194,Africa,Sub-Saharan Africa,South Africa,ZAF,-,NP,2017-01-01,2930.0
21198,Africa,Sub-Saharan Africa,South Africa,ZAF,-,NP,2018-01-01,2771.0


In [11]:
df_ipv = df_fem_old[df_fem_old['disaggregation'].str.contains('Killed')]
df_ipv = df_ipv.set_index(['iso3','year'])
df_ipv = df_ipv.replace({'Killed by Intimate partner or family member (IPFM)' : 'IPFM',
                        'Killed by Intimate partner' : 'IP'})

df_ipv = df_ipv.dropna(subset='value')

In [12]:
df_ipv

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,disaggregation,source,value
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,CTS,7.0
ALB,2006-01-01,Europe,Southern Europe,Albania,IPFM,CTS,14.0
ALB,2007-01-01,Europe,Southern Europe,Albania,IPFM,CTS,8.0
ALB,2008-01-01,Europe,Southern Europe,Albania,IPFM,CTS,5.0
ALB,2009-01-01,Europe,Southern Europe,Albania,IPFM,CTS,7.0
...,...,...,...,...,...,...,...
VCT,2007-01-01,Americas,Latin America and the Caribbean,Saint Vincent and the Grenadines,IPFM,CTS,6.0
VCT,2008-01-01,Americas,Latin America and the Caribbean,Saint Vincent and the Grenadines,IPFM,CTS,3.0
VCT,2009-01-01,Americas,Latin America and the Caribbean,Saint Vincent and the Grenadines,IPFM,CTS,1.0
VCT,2012-01-01,Americas,Latin America and the Caribbean,Saint Vincent and the Grenadines,IP,CTS,1.0


In [13]:
df_fem_old = df_fem_old[df_fem_old['disaggregation'] == '-']
df_fem_old = df_fem_old.set_index(['iso3','year'])
df_fem_old = df_fem_old.drop(columns='disaggregation')

In [14]:
df_fem_old

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,source,value
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABW,1990-01-01,Americas,Latin America and the Caribbean,Aruba,MD,0.0
ABW,1995-01-01,Americas,Latin America and the Caribbean,Aruba,MD,0.0
ABW,1999-01-01,Americas,Latin America and the Caribbean,Aruba,MD,1.0
ABW,2001-01-01,Americas,Latin America and the Caribbean,Aruba,MD,2.0
ABW,2002-01-01,Americas,Latin America and the Caribbean,Aruba,MD,1.0
...,...,...,...,...,...,...
ZAF,2011-01-01,Africa,Sub-Saharan Africa,South Africa,NP,2399.0
ZAF,2016-01-01,Africa,Sub-Saharan Africa,South Africa,NP,2639.0
ZAF,2017-01-01,Africa,Sub-Saharan Africa,South Africa,NP,2930.0
ZAF,2018-01-01,Africa,Sub-Saharan Africa,South Africa,NP,2771.0


# Economic data

## Worldbank data import

### Economic data is imported from worldbank API
1. NY.GNP.PCAP.CD represents gross national income per capita

2. SI.POV.GINI represents the gini coefficient of each country

3. Years 2000-2023 are selected

4. Labels also returns country code

5. skipBlanks is turned off so we have a row for each year (easier merging)

6. each series is added to column

7. skipAggs returns only countries and not regions/aggregations

8. Result is cached as can be slow to return

9. NY.GNP.PCAP.CD = Gross national income per capita
10. SI.POV.GINI = GINI coefficient
11. VC.IHR.PSRC.FE.P5 = Intentional homicides, female (per 100,000 female)
12. SH.STA.SUIC.FE.P5 = Suicide mortality rate, female (per 100,000 female population)
13. SH.MMR.DTHS = Number of maternal deaths (raw)
14. SH.PRV.SMOK.FE = Prevalence of current tobacco use, females (% of female adults)
15. SP.POP.TOTL.FE.IN = Population, female

In [15]:
%cache df_wb = wb.data.DataFrame(['NY.GNP.PCAP.CD','SI.POV.GINI', 'SP.POP.TOTL.FE.IN', 'SH.STA.SUIC.FE.P5', 'SH.MMR.DTHS', 'VC.IHR.PSRC.FE.P5'], time=range(1990,2022), labels=True, skipBlanks =False, columns='series', skipAggs=True)

loading cached value for variable 'df_wb'. Time since pickling  1 day, 4:35:42.572864


### Index is reset, columns renamed and converted to datetime format. Extra time column is dropped and index is set to country code and year

In [16]:
df_wb = df_wb.reset_index()

df_wb = df_wb.rename(columns={'economy':'iso3',
                        'Time':'year',
                        'NY.GNP.PCAP.CD':'gnipc',
                        'SI.POV.GINI':'gini',
                        'Country':'country',
                        'SP.POP.TOTL.FE.IN' : 'fem_pop',
                        'SH.STA.SUIC.FE.P5' : 'suicide_rate',
                        'SH.MMR.DTHS' : 'mat_deaths',
                        'VC.IHR.PSRC.FE.P5' : 'femicide_rate'})

df_wb['year'] = pd.to_datetime(df_wb['year'], format='%Y')

df_wb['gnipc'] = pd.to_numeric(df_wb['gnipc'], errors='coerce')
df_wb['gini'] = pd.to_numeric(df_wb['gini'], errors='coerce')

df_wb = df_wb.drop(columns='time')

df_wb = df_wb.set_index(['iso3', 'year'])
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834.0,1.869859
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447.0,1.762339
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618.0,2.055987
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181.0,2.046150
ZWE,2017-01-01,Zimbabwe,1170.0,1800.0,9.6,44.3,7810471.0,
...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096.0,
AFG,1993-01-01,Afghanistan,,,,,7000119.0,
AFG,1992-01-01,Afghanistan,,,,,6028939.0,
AFG,1991-01-01,Afghanistan,,,,,5372208.0,


In [17]:
df_wb['fem_pop'] = df_wb['fem_pop'].astype('int64')

In [18]:
df_wb.isna().sum()

country             0
gnipc            1073
mat_deaths       3059
suicide_rate     3284
gini             5137
fem_pop             0
femicide_rate    4405
dtype: int64

In [19]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150
ZWE,2017-01-01,Zimbabwe,1170.0,1800.0,9.6,44.3,7810471,
...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,
AFG,1993-01-01,Afghanistan,,,,,7000119,
AFG,1992-01-01,Afghanistan,,,,,6028939,
AFG,1991-01-01,Afghanistan,,,,,5372208,


In [20]:
df_wb['suicide_count'] =round((df_wb['suicide_rate'] / 100000) * df_wb['fem_pop'], 0)
df_wb['femicide_count'] = round((df_wb['femicide_rate'] / 100000) * df_wb['fem_pop'], 0)


In [21]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859,,158.0
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339,,146.0
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987,715.0,167.0
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150,693.0,163.0
ZWE,2017-01-01,Zimbabwe,1170.0,1800.0,9.6,44.3,7810471,,750.0,
...,...,...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,,,
AFG,1993-01-01,Afghanistan,,,,,7000119,,,
AFG,1992-01-01,Afghanistan,,,,,6028939,,,
AFG,1991-01-01,Afghanistan,,,,,5372208,,,


In [22]:
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,mat_deaths,suicide_rate,gini,fem_pop,femicide_rate,suicide_count,femicide_count
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,,,,8449834,1.869859,,158.0
ZWE,2020-01-01,Zimbabwe,1460.0,1700.0,,,8284447,1.762339,,146.0
ZWE,2019-01-01,Zimbabwe,1450.0,1900.0,8.8,50.3,8122618,2.055987,715.0,167.0
ZWE,2018-01-01,Zimbabwe,1550.0,1700.0,8.7,,7966181,2.046150,693.0,163.0
ZWE,2017-01-01,Zimbabwe,1170.0,1800.0,9.6,44.3,7810471,,750.0,
...,...,...,...,...,...,...,...,...,...,...
AFG,1994-01-01,Afghanistan,,,,,7722096,,,
AFG,1993-01-01,Afghanistan,,,,,7000119,,,
AFG,1992-01-01,Afghanistan,,,,,6028939,,,
AFG,1991-01-01,Afghanistan,,,,,5372208,,,


In [23]:
df_wb_femicide =df_wb.reset_index()
df_wb = df_wb.drop(columns='femicide_count')
df_wb_femicide = df_wb_femicide[['iso3', 'year', 'femicide_count']]
df_wb_femicide['source'] = 'WB'
df_wb_femicide = df_wb_femicide.set_index(['iso3', 'year'])
df_wb_femicide

Unnamed: 0_level_0,Unnamed: 1_level_0,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,2021-01-01,158.0,WB
ZWE,2020-01-01,146.0,WB
ZWE,2019-01-01,167.0,WB
ZWE,2018-01-01,163.0,WB
ZWE,2017-01-01,,WB
...,...,...,...
AFG,1994-01-01,,WB
AFG,1993-01-01,,WB
AFG,1992-01-01,,WB
AFG,1991-01-01,,WB


In [24]:
df_wb_femicide = df_wb_femicide.dropna()
df_wb_femicide

Unnamed: 0_level_0,Unnamed: 1_level_0,femicide_count,source
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,2021-01-01,158.0,WB
ZWE,2020-01-01,146.0,WB
ZWE,2019-01-01,167.0,WB
ZWE,2018-01-01,163.0,WB
ZWE,1990-01-01,183.0,WB
...,...,...,...
ALB,1994-01-01,12.0,WB
ALB,1993-01-01,16.0,WB
ALB,1992-01-01,14.0,WB
AFG,2019-01-01,238.0,WB


Filter out all rows that are already in the previous dataset. Important to note that some years have differences in the values and frequently 2021 is significantly different (maybe due to reporting being done mid year?)

In [25]:
df_fem = df_fem[~df_fem.index.isin(df_wb_femicide.index)]
df_fem = df_fem.rename(columns={'value' : 'femicide_count'})
df_fem

NameError: name 'df_fem' is not defined

In [None]:
df_fem = pd.concat([df_fem, df_wb_femicide])
df_fem = df_fem.reset_index()
df_fem[['gender', 'disaggregation', 'indicator', 'unit']] = ['Female', '-', 'Homicide: # of victims', 'Count']
df_fem

In [None]:
converter = coco.CountryConverter()
df_fem['region'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="continent")
df_fem['subregion'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="UNregion")
df_fem['country'] = converter.convert(names=df_fem['iso3'], src="ISO3", to="name_short")

In [None]:
df_fem = df_fem.dropna(subset='femicide_count')
df_fem

## CIA data

### Gets webpage from url

In [None]:
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

%cache r = requests.get(url)

soup = BeautifulSoup(r.text, 'lxml')

### Finds first table on page, extracts headers from first row and then loops over table and creates dataframe from values

In [None]:
table = soup.findAll('table')[0]

headers = [x.text for x in table.findChildren('th')[1:]]

row_list = headers
table_list = []
for row in table.findChildren('tr'):

    for cell in row.findChildren('td')[1:]:
        row_list.append(cell.text)
    table_list.append(row_list)
    row_list = []

df_cia = pd.DataFrame(table_list)

### Adds gini label to first row and then promotes first row to headers 

In [None]:
df_cia.iloc[0,1]='gini'

df_cia.columns= df_cia.iloc[0,:]

df_cia = df_cia[1:]

df_cia

### iso3 codes are generated for CIA data set by the worldbank api, Nas are dropped, year and iso3 set as index ready for filling in blanks 

In [None]:
df_cia['iso3'] = wb.economy.coder(df_cia['Country'])
df_cia = df_cia.dropna(subset='iso3')
df_cia['year'] =df_cia['Date of Information'].str[:4]
df_cia = df_cia[['iso3', 'year', 'Country', 'gini']]
df_cia['year'] = pd.to_datetime(df_cia['year'], format='%Y')
df_cia = df_cia.set_index(['iso3', 'year'])
df_cia

In [None]:
df_wb.isna().sum()

In [None]:
df_wb

### Fills in any empty cells in wb dataset with values from cia dataset ~17 extra countries. gini column set back to float

In [None]:
df_wb['gini'] = df_wb['gini'].fillna(df_cia['gini'])

df_wb = df_wb.sort_index()

df_wb['gini'] =df_wb['gini'].astype('float64')

df_wb

In [None]:
df_wb.isna().sum()

# Merging

### The dichotamised economic dataframe is merged with the femicide dataframe on the shared iso3 index

In [None]:
df_fem = df_fem.set_index(['iso3', 'year'])


In [None]:
df_fem.index.dtypes

In [None]:
df = df_fem.merge(df_wb, how='left', left_index=True, right_index=True).rename(columns={'country_x' : 'country'}).drop(columns=['country_y'])
df = df.sort_index()
df

In [None]:
df.isna().sum()

In [None]:
df.to_csv('test.csv')

femicide count globally over time, suicide count
maps of femicide rate with year slider
mean femicide rate over decades, see biggest gains/losses

regression time series regression analysis on femicide rate compared with other variables. 
logistic regression on action plans

worldbank used in articles datasets

report on how dataset was created, why we did analyses 