# Dataset Creation

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import wbgapi as wb
from bs4 import BeautifulSoup
import cache_magic #pip install ipython_cache
import country_converter as coco

%cache magic is now registered in ipython


# Femicide Data

### Femicide data is downloaded from github and loaded as df_fem

In [2]:
# Raw GitHub location of the original femicide CSV; it is fetched in the next cell.
url = 'https://raw.githubusercontent.com/Tom-Whittington/Femicide_analysis/main/processed_data/original_femicide_data.csv'

In [3]:
# Fetch the CSV as raw bytes. The %cache magic pickles `download` and reloads the
# pickled value on later runs instead of repeating the request.
# NOTE(review): the reloaded value can be much older than the file at `url` -
# confirm whether the cache should be refreshed before regenerating the dataset.
%cache download = requests.get(url).content

loading cached value for variable 'download'. Time since pickling  150 days, 21:51:09.786785


In [4]:
# Columns of the femicide CSV that are kept.
fem_usecols = ['Region', 'Subregion', 'country', 'iso3_code', 'Disaggregation',
               'Gender', 'Source', 'Unit', 'Year', 'Value']

# Repeated text columns are read as categoricals; Year stays a string here and
# Value is left for pandas to infer.
fem_dtypes = dict.fromkeys(
    ['iso3_code', 'Region', 'Subregion', 'country', 'Gender', 'Source', 'Unit'],
    'category',
)
fem_dtypes['Year'] = 'string'

# The download is raw bytes, so decode it and hand pandas a text buffer.
fem_csv_text = download.decode('utf-8')
df_fem_old = pd.read_csv(io.StringIO(fem_csv_text),
                         usecols=fem_usecols,
                         dtype=fem_dtypes)


In [5]:
df_fem_old

Unnamed: 0,Region,Subregion,country,iso3_code,Disaggregation,Gender,Source,Unit,Year,Value
0,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2009,1115
1,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2009,3.93
2,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2010,983
3,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2010,3.37
4,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2011,1231
...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2006,8.82
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2010,711
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2010,5.6
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2012,981


In [6]:
df_fem_old.dtypes

Region            category
Subregion         category
country           category
iso3_code         category
Disaggregation      object
Gender            category
Source            category
Unit              category
Year                string
Value               object
dtype: object

### Columns are renamed

In [7]:
# Lower-case every header and shorten 'iso3_code' to 'iso3'.
df_fem_old = (
    df_fem_old
    .rename(columns=str.lower)
    .rename(columns={'iso3_code': 'iso3'})
)

# Parse the year strings into datetimes and replace the 'KOS' code with 'XKX'.
df_fem_old = df_fem_old.assign(
    year=pd.to_datetime(df_fem_old['year'], format='%Y'),
    iso3=df_fem_old['iso3'].replace({'KOS': 'XKX'}),
)
df_fem_old

Unnamed: 0,region,subregion,country,iso3,disaggregation,gender,source,unit,year,value
0,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2009-01-01,1115
1,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2009-01-01,3.93
2,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2010-01-01,983
3,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,"Rate per 100,000 population",2010-01-01,3.37
4,Asia,Southern Asia,Afghanistan,AFG,-,Total (all ages),NSO,Count,2011-01-01,1231
...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2006-01-01,8.82
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2010-01-01,711
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,"Rate per 100,000 population",2010-01-01,5.6
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,-,Total (all ages),WHO,Count,2012-01-01,981


In [8]:
# Rows whose disaggregation label mentions 'Killed' (the intimate partner / family
# member breakdowns), keyed by country code and year.
killed_rows = df_fem_old['disaggregation'].str.contains('Killed')

# Long labels are shortened to IPFM / IP, and rows without a value are dropped.
short_labels = {'Killed by Intimate partner or family member (IPFM)' : 'IPFM',
                'Killed by Intimate partner' : 'IP'}

df_ipv = (
    df_fem_old[killed_rows]
    .set_index(['iso3', 'year'])
    .replace(short_labels)
    .dropna(subset='value')
)
df_ipv

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,disaggregation,gender,source,unit,value
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,All,CTS,Count,19
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,All,CTS,"Rate per 100,000 population",0.62
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,Male,CTS,Count,12
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,Male,CTS,"Rate per 100,000 population",0.77
ALB,2005-01-01,Europe,Southern Europe,Albania,IPFM,Female,CTS,Count,7
...,...,...,...,...,...,...,...,...,...
VUT,2020-01-01,Oceania,Melanesia,Vanuatu,IPFM,All,External,"Rate per 100,000 population",0.33
VUT,2020-01-01,Oceania,Melanesia,Vanuatu,IPFM,Male,External,Count,0
VUT,2020-01-01,Oceania,Melanesia,Vanuatu,IPFM,Male,External,"Rate per 100,000 population",0
VUT,2020-01-01,Oceania,Melanesia,Vanuatu,IPFM,Female,External,Count,1


### Keeps only the non-disaggregated rows (disaggregation == '-') for females

In [9]:
# Keep the rows with no disaggregation ('-') that refer to females, then key the
# frame by country code and year.
no_breakdown = df_fem_old['disaggregation'].eq('-')
female_only = df_fem_old['gender'].eq('Female')

df_fem_old = df_fem_old.loc[no_breakdown & female_only].set_index(['iso3', 'year'])
df_fem_old

Unnamed: 0_level_0,Unnamed: 1_level_0,region,subregion,country,disaggregation,gender,source,unit,value
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AFG,2015-01-01,Asia,Southern Asia,Afghanistan,-,Female,NSO,Count,93
AFG,2015-01-01,Asia,Southern Asia,Afghanistan,-,Female,NSO,"Rate per 100,000 population",0.56
AFG,2016-01-01,Asia,Southern Asia,Afghanistan,-,Female,NSO,Count,101
AFG,2016-01-01,Asia,Southern Asia,Afghanistan,-,Female,NSO,"Rate per 100,000 population",0.59
AFG,2017-01-01,Asia,Southern Asia,Afghanistan,-,Female,NSO,Count,133
...,...,...,...,...,...,...,...,...,...
VEN,2016-01-01,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),-,Female,CTS,"Rate per 100,000 population",5.36
VEN,2017-01-01,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),-,Female,CTS,Count,788
VEN,2017-01-01,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),-,Female,CTS,"Rate per 100,000 population",5.31
ZWE,1990-01-01,Africa,Sub-Saharan Africa,Zimbabwe,-,Female,MD,Count,183


### Pivots counts and rates into separate columns

In [10]:
# Spread the 'unit' values into one column each (filled from 'value'), then clear
# the leftover 'unit' name from the column axis.
df_fem_old = (
    df_fem_old
    .pivot(columns='unit', values='value')
    .rename_axis(None, axis='columns')
)
df_fem_old

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,"Rate per 100,000 population"
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,2015-01-01,93,0.56
AFG,2016-01-01,101,0.59
AFG,2017-01-01,133,0.75
AFG,2018-01-01,153,0.85
ALB,1992-01-01,14,0.87
...,...,...,...
VEN,2012-01-01,940,6.39
VEN,2015-01-01,731,4.83
VEN,2016-01-01,806,5.36
VEN,2017-01-01,788,5.31


### Renames and sets datatypes of new columns

In [11]:
# Collapse runs of whitespace in the pivoted labels before renaming. The rate label
# is displayed as 'Rate per 100,000 population', but it was previously matched with
# a double space after 'per'; DataFrame.rename silently ignores keys it cannot find,
# so a spacing mismatch would leave 'femicide_rate' undefined for later cells.
df_fem_old = df_fem_old.rename(columns=lambda label: ' '.join(str(label).split()))

df_fem_old = df_fem_old.rename(columns={'Count' : 'femicide_count',
                                        'Rate per 100,000 population' : 'femicide_rate'})

# Values were read as text; anything that is not a number becomes NaN.
df_fem_old = df_fem_old.apply(pd.to_numeric, errors='coerce')
df_fem_old

Unnamed: 0_level_0,Unnamed: 1_level_0,femicide_count,femicide_rate
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,2015-01-01,93.0,0.56
AFG,2016-01-01,101.0,0.59
AFG,2017-01-01,133.0,0.75
AFG,2018-01-01,153.0,0.85
ALB,1992-01-01,14.0,0.87
...,...,...,...
VEN,2012-01-01,940.0,6.39
VEN,2015-01-01,731.0,4.83
VEN,2016-01-01,806.0,5.36
VEN,2017-01-01,788.0,5.31


# World bank data

In [12]:
# Lookup from wbgapi column labels / World Bank series codes to the short names
# used in the rest of the notebook. Passed to DataFrame.rename after the download.
columns = dict([
    # dimension labels returned by wbgapi
    ('economy', 'iso3'),
    ('Time', 'year'),
    # indicator series
    ('NY.GNP.PCAP.CD', 'gnipc'),
    ('SI.POV.GINI', 'gini'),
    ('Country', 'country'),
    ('SP.POP.TOTL.FE.IN', 'fem_pop'),
    ('SH.STA.SUIC.FE.P5', 'suicide_rate'),
    ('SH.MMR.DTHS', 'mat_deaths'),
    ('VC.IHR.PSRC.FE.P5', 'femicide_rate'),
    ('SP.HOU.FEMA.ZS', '%house_head'),
    ('SP.DYN.LE60.FE.IN', 'life_exp60'),
    ('SG.LAW.EQRM.WK', 'eq_renum'),
    ('SG.LAW.NODC.HR', 'prohib_discr'),
    ('SG.HLD.HEAD.EQ', 'allow_house_head'),
    ('SG.GET.JOBS.EQ', 'allow_job'),
    ('SG.LEG.DVAW', 'dv_law'),
    ('SG.VAW.1549.ME.ZS', '%dv_1y'),
    ('SH.PRV.SMOK.FE', '%smoke'),
    ('SH.STA.OB18.FE.ZS', '%obese'),
    ('SH.ALC.PCAP.FE.LI', 'alc_vol'),
    ('SG.DMK.PRCH.FN.ZS', '%big_buy'),
])

### Data is imported from worldbank API


1. indicators can be selected in the list below

2. Years 1990-2021 are selected (`range(1990, 2022)`; the end of the range is exclusive)

3. `labels=True` also returns the country name alongside the country code

4. skipBlanks is turned off so we have a row for each year (easier merging)

5. Each series is placed in its own column

6. skipAggs returns only countries and not regions/aggregations

7. The result is cached because the API can be slow to return

In [None]:
# World Bank series codes requested from the API.
indicators = [
    'NY.GNP.PCAP.CD',     # Gross national income per capita
    'SI.POV.GINI',        # Gini coefficient
    'SP.POP.TOTL.FE.IN',  # Female population
    'SH.STA.SUIC.FE.P5',  # Suicide mortality rate, female (per 100,000 female)
    'SH.MMR.DTHS',        # Number of maternal deaths (raw)
    'VC.IHR.PSRC.FE.P5',  # Intentional homicides, female (per 100,000 female)

    # Currently disabled - uncomment a line to include that series in the download.
    # 'SP.HOU.FEMA.ZS',     # Female headed households (% of households with a female head)
    # 'SP.DYN.LE60.FE.IN',  # Life expectancy at age 60, female (years)
    # 'SG.LAW.EQRM.WK',     # Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
    # 'SG.LAW.NODC.HR',     # The law prohibits discrimination in employment based on gender (1=yes; 0=no)
    # 'SG.HLD.HEAD.EQ',     # A woman can be head of household in the same way as a man (1=yes; 0=no)
    # 'SG.GET.JOBS.EQ',     # A woman can get a job in the same way as a man (1=yes; 0=no)
    # 'SG.LEG.DVAW',        # There is legislation specifically addressing domestic violence (1=yes; 0=no)
    # 'SG.VAW.1549.ME.ZS',  # Proportion of women subjected to physical and/or sexual violence in the last 12 months (modeled estimate, % of ever partnered women ages 15-49)
    # 'SH.PRV.SMOK.FE',     # Prevalence of current tobacco use, females (% of female adults)
    # 'SH.STA.OB18.FE.ZS',  # Prevalence of obesity, female (% of female population ages 18+)
    # 'SH.ALC.PCAP.FE.LI',  # Total alcohol consumption per capita, female (liters of pure alcohol, projected estimates, female 15+ years of age)
    # 'SG.DMK.PRCH.FN.ZS',  # Women participating in making major household purchase decisions (% of women age 15-49)
]

In [None]:
# Download every series in `indicators` for 1990-2021 (the range end is exclusive),
# one column per series, with label columns added and aggregates skipped.
# skipBlanks=False keeps rows that have no data. The result is stored by %cache.
%cache df_wb = wb.data.DataFrame(indicators, time=range(1990,2022), labels=True, skipBlanks =False, columns='series', skipAggs=True)

### Index is reset, columns are renamed and the year column is converted to datetime. The extra time column is dropped, rows with the code CHI are removed, and the index is set to country code and year

In [None]:
# Tidy the downloaded frame in one pass:
#   - move the index levels into columns and apply the short names from `columns`
#   - parse the year label into a datetime and drop the leftover 'time' column
#   - remove rows whose iso3 code contains 'CHI'
#   - key the frame by (iso3, year)
df_wb = (
    df_wb
    .reset_index()
    .rename(columns=columns)
    .assign(year=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
    .drop(columns='time')
    .loc[lambda frame: ~frame['iso3'].str.contains('CHI')]
    .set_index(['iso3', 'year'])
)

df_wb

In [None]:
# Store the female population as whole numbers.
# NOTE(review): casting to int64 raises if any fem_pop value is missing, and the
# download keeps blank rows (skipBlanks=False) - confirm every row has a population.
df_wb = df_wb.astype({'fem_pop': 'int64'})

In [None]:
# Work on a copy so the columns added below do not alter df_wb.
df = df_wb.copy()

In [None]:
df.isna().sum()

In [None]:
# Rates are per 100,000 females, so rate / 100000 * female population gives a count,
# rounded to the nearest whole number.
female_population = df['fem_pop']
df['suicide_count'] = ((df['suicide_rate'] / 100000) * female_population).round(0)
df['femicide_count'] = ((df['femicide_rate'] / 100000) * female_population).round(0)

# Where the World Bank has no figure, fall back to the femicide dataset
# (values are matched on the shared iso3/year index).
for fem_column in ('femicide_count', 'femicide_rate'):
    df[fem_column] = df[fem_column].fillna(df_fem_old[fem_column])

In [None]:
# Femicide rows whose (iso3, year) key does not appear in the World Bank frame
# are appended to it.
missing_from_wb = ~df_fem_old.index.isin(df.index)
df_extra_countries = df_fem_old.loc[missing_from_wb]
df = pd.concat([df, df_extra_countries])

In [None]:
df.isna().sum()

In [None]:
# Move iso3 and year out of the index and back into ordinary columns; the iso3
# column is read by the country converter further down.
df = df.reset_index()
df

Filter out all rows that are already in the previous dataset. Important to note that some years have differences in the values and frequently 2021 is significantly different (maybe due to reporting being done mid year?)

In [None]:
# Derive continent, UN region and short country name from the ISO3 code.
converter = coco.CountryConverter()

coco_targets = {'continent': 'continent',
                'region': 'UNregion',
                'country': 'name_short'}
for new_column, classification in coco_targets.items():
    df[new_column] = converter.convert(names=df['iso3'], src="ISO3", to=classification)

In [None]:
# Columns kept in the output, in display order.
output_columns = ['continent', 'region', 'country',
                  'fem_pop', 'femicide_count', 'femicide_rate',
                  'suicide_rate', 'gnipc', 'gini', 'mat_deaths']

# Key by (iso3, year), sort on that key and keep only the columns listed above.
df = (
    df
    .set_index(['iso3', 'year'])
    .sort_index()
    .loc[:, output_columns]
)
df

## CIA data

### Gets webpage from url

In [None]:
# CIA World Factbook country comparison page for the Gini index.
# Note that this rebinds `url`, which previously held the femicide CSV address.
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

# Fetch the page; %cache stores the response in `r`.
%cache r = requests.get(url)

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')

### Finds first table on page, extracts headers from first row and then loops over table and creates dataframe from values

In [None]:
# First table on the page. The leading cell of the header row and of every data
# row is skipped.
table = soup.find_all('table')[0]

headers = [th.text for th in table.find_all('th')[1:]]

# One list of cell texts per <tr>.
table_list = [
    [td.text for td in tr.find_all('td')[1:]]
    for tr in table.find_all('tr')
]

# The header labels go in front of whatever the first <tr> produced, so they
# end up as the first row of the frame.
if table_list:
    table_list[0] = headers + table_list[0]

df_cia = pd.DataFrame(table_list)

### Adds gini label to first row and then promotes first row to headers 

In [None]:
# Overwrite the second header cell with 'gini', then promote the first row to
# column labels and drop it from the data.
df_cia.iloc[0, 1] = 'gini'

header_row = df_cia.iloc[0, :]
df_cia = df_cia.iloc[1:].set_axis(header_row, axis='columns')

df_cia

### ISO3 codes are generated for the CIA dataset using the World Bank API coder, NAs are dropped, and iso3 and year are set as the index, ready for filling in blanks

In [None]:
# df_cia was produced by slicing, so take a real copy BEFORE adding a column.
# Copying only after the first assignment still triggers SettingWithCopyWarning.
df_cia = df_cia.copy()

# Look up a country code for every country name; names with no code are dropped below.
df_cia['iso3'] = wb.economy.coder(df_cia['Country'])

df_cia = (
    df_cia
    .dropna(subset='iso3')
    # The first four characters of 'Date of Information' are taken as the year.
    .assign(year=lambda frame: pd.to_datetime(frame['Date of Information'].str[:4], format='%Y'))
    [['iso3', 'year', 'Country', 'gini']]
    # Same (iso3, year) key as the main frame so the gini values can be aligned to it.
    .set_index(['iso3', 'year'])
)
df_cia

In [None]:
df.isna().sum()

### Fills in any empty cells in wb dataset with values from cia dataset ~17 extra countries. gini column set back to float

In [None]:
# The CIA values are scraped cell text, so convert them to numbers before filling.
# Filling a float column with strings would leave 'gini' as a mixed text/number
# object column. Text that is not a number becomes NaN and fills nothing.
cia_gini = pd.to_numeric(df_cia['gini'].str.strip(), errors='coerce')

# Fill only the gaps in the World Bank gini (matched on the iso3/year index) and
# keep the column as float.
df['gini'] = df['gini'].fillna(cia_gini).astype('float64')
df

In [None]:
df.isna().sum()

In [None]:
# Write the combined dataset, including its index, to test.csv in the working directory.
df.to_csv('test.csv')

femicide count globally over time, suicide count
maps of femicide rate with year slider
mean femicide rate over decades, see biggest gains/losses

regression time series regression analysis on femicide rate compared with other variables. 
logistic regression on action plans

worldbank used in articles datasets

report on how dataset was created, why we did analyses 