# Dataset Creation

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import wbgapi as wb
from bs4 import BeautifulSoup
import cache_magic #pip install ipython_cache

%cache magic is now registered in ipython


# Femicide Data

### Femicide data is downloaded from github and loaded as df_fem

In [2]:
url = 'https://raw.githubusercontent.com/Tom-Whittington/Femicide_analysis/main/processed_data/original_femicide_data.csv'

In [3]:
%cache download = requests.get(url).content

loading cached value for variable 'download'. Time since pickling  1 day, 19:43:41.015817


In [4]:
df_fem = pd.read_csv(io.StringIO(download.decode('utf-8')),
                dtype={'iso3_code' : 'category',
                           'Region' : 'category',
                           'Subregion' : 'category',
                           'country' : 'category',
                           'Indicator' : 'category',
                           'Disaggregation' : 'category',
                           'Gender' : 'category',
                           'Source' : 'category',
                           'Unit' : 'category',
                          'Year' : 'string'})

In [5]:
df_fem

Unnamed: 0,Region,Subregion,country,iso3_code,Indicator,Disaggregation,Gender,Source,Unit,Year,Value,Footnote
0,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2009,1115,
1,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2009,3.93,
2,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2010,983,
3,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,"Rate per 100,000 population",2010,3.37,
4,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Total (all ages),NSO,Count,2011,1231,
...,...,...,...,...,...,...,...,...,...,...,...,...
24387,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2006,8.82,
24388,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2010,711,
24389,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,"Rate per 100,000 population",2010,5.6,
24390,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Total (all ages),WHO,Count,2012,981,


In [6]:
df_fem.dtypes

Region            category
Subregion         category
country           category
iso3_code         category
Indicator         category
Disaggregation    category
Gender            category
Source            category
Unit              category
Year                string
Value               object
Footnote            object
dtype: object

### Columns are renamed

In [7]:
df_fem.columns = [col.lower() for col in df_fem.columns]
df_fem = df_fem.rename(columns={'iso3_code' : 'iso3'})
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
24338,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,Count,1990,550.0,
24339,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,"Rate per 100,000 population",1990,6.84,
24340,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,Count,1991,684.0,
24341,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,"Rate per 100,000 population",1991,8.29,
24342,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,Count,1992,768.0,
24343,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,"Rate per 100,000 population",1992,9.09,
24344,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,Count,1993,745.0,
24345,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,"Rate per 100,000 population",1993,8.61,
24346,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,Count,1994,831.0,
24347,Africa,Sub-Saharan Africa,Zambia,ZMB,Homicide: # of victims,-,Total (all ages),UNSDC,"Rate per 100,000 population",1994,9.37,


### Filters out totals and only keeps male and female data

In [8]:
df_fem = df_fem.loc[(df_fem['gender'] == 'Male') | (df_fem['gender'] == 'Female')]
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
8,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2015,93,
9,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2015,0.56,
10,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Male,NSO,Count,2015,3274,
11,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Male,NSO,"Rate per 100,000 population",2015,18.51,
14,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2016,101,
...,...,...,...,...,...,...,...,...,...,...,...,...
24293,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Male,CTS,"Rate per 100,000 population",2017,93.88,
24372,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,Count,1990,183,
24373,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,3.49,
24374,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Male,MD,Count,1990,404,


### Removes killed by intimate partner disaggregation

In [9]:
df_fem = df_fem[df_fem['disaggregation'] == '-']
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
8,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2015,93,
9,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,"Rate per 100,000 population",2015,0.56,
10,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Male,NSO,Count,2015,3274,
11,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Male,NSO,"Rate per 100,000 population",2015,18.51,
14,Asia,Southern Asia,Afghanistan,AFG,Homicide: # of victims,-,Female,NSO,Count,2016,101,
...,...,...,...,...,...,...,...,...,...,...,...,...
24293,Americas,Latin America and the Caribbean,Venezuela (Bolivarian Republic of),VEN,Homicide: # of victims,-,Male,CTS,"Rate per 100,000 population",2017,93.88,
24372,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,Count,1990,183,
24373,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,3.49,
24374,Africa,Sub-Saharan Africa,Zimbabwe,ZWE,Homicide: # of victims,-,Male,MD,Count,1990,404,


### Filters out counts and leaves only rate per 100k, sorts values and converts value column to numeric

In [10]:
df_fem = df_fem[df_fem['unit'] != 'Count']
df_fem = df_fem.sort_values(['year', 'gender', 'iso3'])
df_fem['value'] = pd.to_numeric(df_fem['value'], errors='coerce')
df_fem

Unnamed: 0,region,subregion,country,iso3,indicator,disaggregation,gender,source,unit,year,value,footnote
1193,Americas,Latin America and the Caribbean,Aruba,ABW,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,0.00,
899,Asia,Western Asia,Armenia,ARM,Homicide: # of victims,-,Female,UNECE,"Rate per 100,000 population",1990,1.25,
675,Americas,Latin America and the Caribbean,Antigua and Barbuda,ATG,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,3.10,
1295,Oceania,Australia and New Zealand,Australia,AUS,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,1.62,
1561,Europe,Western Europe,Austria,AUT,Homicide: # of victims,-,Female,MD,"Rate per 100,000 population",1990,0.99,
...,...,...,...,...,...,...,...,...,...,...,...,...
22001,Europe,Northern Europe,Sweden,SWE,Homicide: # of victims,-,Male,External,"Rate per 100,000 population",2020,1.96,
22427,Asia,Central Asia,Tajikistan,TJK,Homicide: # of victims,-,Male,CTS,"Rate per 100,000 population",2020,1.81,
22665,Oceania,Polynesia,Tonga,TON,Homicide: # of victims,-,Male,External,"Rate per 100,000 population",2020,3.78,
22845,Africa,Northern Africa,Tunisia,TUN,Homicide: # of victims,-,Male,External,"Rate per 100,000 population",2020,7.63,


### Pivots df, using year and gender as index. Then formats new columns

In [11]:
df_fem = df_fem.pivot(index='iso3', columns=['year', 'gender'], values='value')
df_fem.columns = ['-'.join(col) for col in df_fem.columns]
df_fem

Unnamed: 0_level_0,1990-Female,1990-Male,1991-Female,1991-Male,1992-Female,1992-Male,1993-Female,1993-Male,1994-Female,1994-Male,...,2016-Female,2016-Male,2017-Female,2017-Male,2018-Female,2018-Male,2019-Female,2019-Male,2020-Female,2020-Male
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,0.00,0.00,,,,,,,,,...,,,,,,,,,,
ARM,1.25,9.05,1.64,9.23,,,3.08,13.31,1.43,8.48,...,0.90,5.00,0.71,4.26,0.70,2.81,0.64,4.53,1.02,2.58
ATG,3.10,0.00,6.12,3.26,3.00,3.21,0.00,12.64,2.86,12.42,...,4.08,15.38,4.05,39.14,6.02,19.39,1.99,4.27,,
AUS,1.62,2.79,1.66,2.30,1.34,2.12,1.34,2.45,1.28,2.32,...,0.66,1.22,0.51,1.18,0.67,1.09,0.54,1.24,0.58,1.18
AUT,0.99,1.33,1.16,1.51,1.05,1.44,0.85,1.19,1.27,0.93,...,0.70,0.54,0.82,0.76,0.97,0.96,0.88,0.84,0.81,0.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FSM,,,,,,,,,,,...,,,,,,,1.79,0.00,,
SYR,,,,,,,,,,,...,,,,,,,8.77,30.59,,
TUN,,,,,,,,,,,...,,,,,,,1.75,8.07,1.93,7.63
VUT,,,,,,,,,,,...,,,,,,,0.68,0.66,0.66,0.00


In [12]:
df_fem.isna().sum()

1990-Female    114
1990-Male      114
1991-Female    117
1991-Male      117
1992-Female    115
              ... 
2018-Male       60
2019-Female     57
2019-Male       57
2020-Female     86
2020-Male       87
Length: 62, dtype: int64

In [13]:
df_fem.dtypes

1990-Female    float64
1990-Male      float64
1991-Female    float64
1991-Male      float64
1992-Female    float64
                ...   
2018-Male      float64
2019-Female    float64
2019-Male      float64
2020-Female    float64
2020-Male      float64
Length: 62, dtype: object

# Economic data

## Worldbank data import

### Economic data is imported from worldbank API
1. NY.GNP.PCAP.CD represents gross national income per capita

2. SI.POV.GINI represents the gini coefficient of each country

3. Years 2000-2023 are selected

4. Labels also returns country code

5. skipBlanks is turned off so we have a row for each year (easier merging)

6. each series is added to column

7. skipAggs returns only countries and not regions/aggregations

8. Result is cached as can be slow to return

In [14]:
%cache df_wb = wb.data.DataFrame(['NY.GNP.PCAP.CD','SI.POV.GINI'], time=range(2000,2023), labels=True, skipBlanks =False, columns='series', skipAggs=True)

loading cached value for variable 'df_wb'. Time since pickling  1:16:47.020425


### Index is reset, columns renamed and converted to datetime format. Extra time column is dropped and index is set to country code and year

In [15]:
df_wb = df_wb.reset_index()

df_wb = df_wb.rename(columns={'economy':'iso3',
                        'Time':'year',
                        'NY.GNP.PCAP.CD':'gnipc',
                        'SI.POV.GINI':'gini',
                        'Country':'country'})

df_wb['year'] = pd.to_datetime(df_wb['year'], format='%Y')

df_wb['gnipc'] = pd.to_numeric(df_wb['gnipc'], errors='coerce')
df_wb['gini'] = pd.to_numeric(df_wb['gini'], errors='coerce')

df_wb = df_wb.drop(columns='time')

df_wb = df_wb.set_index(['iso3', 'year'])
df_wb

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,
ZWE,2020-01-01,Zimbabwe,1460.0,
ZWE,2019-01-01,Zimbabwe,1450.0,50.3
ZWE,2018-01-01,Zimbabwe,1550.0,
ZWE,2017-01-01,Zimbabwe,1170.0,44.3
...,...,...,...,...
AFG,2004-01-01,Afghanistan,,
AFG,2003-01-01,Afghanistan,,
AFG,2002-01-01,Afghanistan,,
AFG,2001-01-01,Afghanistan,,


In [16]:
df_wb.isna().sum()

country       0
gnipc       539
gini       3317
dtype: int64

In [17]:
df_wb.dtypes

country     object
gnipc      float64
gini       float64
dtype: object

## CIA data

### Gets webpage from url

In [18]:
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

%cache r = requests.get(url)

soup = BeautifulSoup(r.text, 'lxml')

loading cached value for variable 'r'. Time since pickling  1 day, 19:43:17.842068


### Finds first table on page, extracts headers from first row and then loops over table and creates dataframe from values

In [19]:
table = soup.findAll('table')[0]

headers = [x.text for x in table.findChildren('th')[1:]]

row_list = headers
table_list = []
for row in table.findChildren('tr'):

    for cell in row.findChildren('td')[1:]:
        row_list.append(cell.text)
    table_list.append(row_list)
    row_list = []

df_cia = pd.DataFrame(table_list)

### Adds gini label to first row and then promotes first row to headers 

In [20]:
df_cia.iloc[0,1]='gini'

df_cia.columns= df_cia.iloc[0,:]

df_cia = df_cia[1:]

df_cia

Unnamed: 0,Country,gini,Date of Information
1,South Africa,63.0,2014 est.
2,Namibia,59.1,2015 est.
3,Zambia,57.1,2015 est.
4,Central African Republic,56.2,2008 est.
5,Eswatini,54.6,2016 est.
...,...,...,...
173,Slovakia,25.2,2016 est.
174,Belarus,24.4,2020 est.
175,Slovenia,24.2,2017 est.
176,Faroe Islands,22.7,2013 est.


### iso3 codes are generated for CIA data set by the worldbank api, Nas are dropped, year and iso3 set as index ready for filling in blanks 

In [21]:
df_cia['iso3'] = wb.economy.coder(df_cia['Country'])
df_cia = df_cia.dropna(subset='iso3')
df_cia['year'] =df_cia['Date of Information'].str[:4]
df_cia = df_cia[['iso3', 'year', 'Country', 'gini']]
df_cia['year'] = pd.to_datetime(df_cia['year'], format='%Y')
df_cia = df_cia.set_index(['iso3', 'year'])
df_cia

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cia['year'] =df_cia['Date of Information'].str[:4]


Unnamed: 0_level_0,Unnamed: 1_level_0,Country,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZAF,2014-01-01,South Africa,63.0
NAM,2015-01-01,Namibia,59.1
ZMB,2015-01-01,Zambia,57.1
CAF,2008-01-01,Central African Republic,56.2
SWZ,2016-01-01,Eswatini,54.6
...,...,...,...
ARM,2020-01-01,Armenia,25.2
SVK,2016-01-01,Slovakia,25.2
BLR,2020-01-01,Belarus,24.4
SVN,2017-01-01,Slovenia,24.2


In [22]:
df_wb.isna().sum()

country       0
gnipc       539
gini       3317
dtype: int64

### Fills in any empty cells in wb dataset with values from cia dataset ~17 extra countries. gini column set back to float

In [23]:
df_eco = df_wb.copy()

df_eco['gini'] = df_eco['gini'].fillna(df_cia['gini'])

df_eco = df_eco.sort_index()

df_eco['gini'] =df_eco['gini'].astype('float64')

df_eco

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2000-01-01,Aruba,20390.0,
ABW,2001-01-01,Aruba,20510.0,
ABW,2002-01-01,Aruba,19290.0,
ABW,2003-01-01,Aruba,21120.0,
ABW,2004-01-01,Aruba,24010.0,
...,...,...,...,...
ZWE,2017-01-01,Zimbabwe,1170.0,44.3
ZWE,2018-01-01,Zimbabwe,1550.0,
ZWE,2019-01-01,Zimbabwe,1450.0,50.3
ZWE,2020-01-01,Zimbabwe,1460.0,


In [24]:
df_eco.isna().sum()

country       0
gnipc       539
gini       3300
dtype: int64

In [25]:
df_eco.dtypes

country     object
gnipc      float64
gini       float64
dtype: object

In [26]:
df_eco.to_csv(r'processed_data/wb_cia_economic_all_rows.csv')

## Dichotamising eco data

### df is split between gini and gnipc dataframes for dichotamising

In [27]:
df_eco = df_eco.reset_index()
df_gini = df_eco[['iso3', 'year', 'country', 'gini']]
df_gnipc = df_eco[['iso3', 'year', 'country', 'gnipc']]

### Nas are dropped, results are grouped by iso3 and returns last year that there is a value for gnipc (latest year that there is data for. Columns are then renamed

In [28]:
df_gnipc = df_gnipc.dropna(subset='gnipc')
df_gnipc = df_gnipc.sort_values('year').groupby('iso3').last()
df_gnipc = df_gnipc.rename(columns={'year' : 'gnipc_year'})
df_gnipc

Unnamed: 0_level_0,gnipc_year,country,gnipc
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,2021-01-01,Aruba,29390.0
AFG,2021-01-01,Afghanistan,390.0
AGO,2021-01-01,Angola,1710.0
ALB,2021-01-01,Albania,6110.0
AND,2019-01-01,Andorra,46530.0
...,...,...,...
XKX,2021-01-01,Kosovo,5130.0
YEM,2018-01-01,"Yemen, Rep.",840.0
ZAF,2021-01-01,South Africa,6530.0
ZMB,2021-01-01,Zambia,1030.0


### Nas are dropped, results are grouped by iso3 and returns last year that there is a gini value for (latest year that there is data for. Columns are then renamed

In [29]:
df_gini = df_gini.dropna(subset='gini')
df_gini = df_gini.sort_values('year').groupby('iso3').last()
df_gini = df_gini.rename(columns={'year' : 'gini_year'})
df_gini

Unnamed: 0_level_0,gini_year,country,gini
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,2008-01-01,Afghanistan,29.4
AGO,2018-01-01,Angola,51.3
ALB,2019-01-01,Albania,30.8
ARE,2018-01-01,United Arab Emirates,26.0
ARG,2020-01-01,Argentina,42.3
...,...,...,...
XKX,2017-01-01,Kosovo,29.0
YEM,2014-01-01,"Yemen, Rep.",36.7
ZAF,2014-01-01,South Africa,63.0
ZMB,2015-01-01,Zambia,57.1


### World median gnipc is calculated to be used in dichotamisation

In [30]:
world_median_gnipc = df_gnipc['gnipc'].median()
world_median_gnipc

6460.0

### World median gini is calculated to be used in dichotamisation

In [31]:
world_median_gini = df_gini['gini'].median()
world_median_gini

35.9

### If country's gnipc is higher than the world median gnipc then it is classed as high income_level

In [32]:
df_gnipc['income_level'] = np.where(df_gnipc['gnipc'] > world_median_gnipc, 'high', 'low')
df_gnipc

Unnamed: 0_level_0,gnipc_year,country,gnipc,income_level
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2021-01-01,Aruba,29390.0,high
AFG,2021-01-01,Afghanistan,390.0,low
AGO,2021-01-01,Angola,1710.0,low
ALB,2021-01-01,Albania,6110.0,low
AND,2019-01-01,Andorra,46530.0,high
...,...,...,...,...
XKX,2021-01-01,Kosovo,5130.0,low
YEM,2018-01-01,"Yemen, Rep.",840.0,low
ZAF,2021-01-01,South Africa,6530.0,high
ZMB,2021-01-01,Zambia,1030.0,low


### If country's gini is higher than the world median gini then it is classed as high inequality_level

In [33]:
df_gini['inequality_level'] = np.where(df_gini['gini'] > world_median_gini, 'high', 'low')
df_gini

Unnamed: 0_level_0,gini_year,country,gini,inequality_level
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFG,2008-01-01,Afghanistan,29.4,low
AGO,2018-01-01,Angola,51.3,high
ALB,2019-01-01,Albania,30.8,low
ARE,2018-01-01,United Arab Emirates,26.0,low
ARG,2020-01-01,Argentina,42.3,high
...,...,...,...,...
XKX,2017-01-01,Kosovo,29.0,low
YEM,2014-01-01,"Yemen, Rep.",36.7,high
ZAF,2014-01-01,South Africa,63.0,high
ZMB,2015-01-01,Zambia,57.1,high


### gini and gnipc dataframes are then joined to form the full dichotamised dataframe. Uneeded columns are dropped and columns are renamed

In [34]:
df_eco_dichot = df_gnipc.merge(df_gini, how='outer', on='iso3')
df_eco_dichot = df_eco_dichot[['country_x', 'income_level', 'inequality_level', 'gnipc', 'gini', 'gnipc_year', 'gini_year' ]]
df_eco_dichot = df_eco_dichot.dropna(subset='country_x')
df_eco_dichot = df_eco_dichot.rename(columns={'country_x' : 'country'})
df_eco_dichot

Unnamed: 0_level_0,country,income_level,inequality_level,gnipc,gini,gnipc_year,gini_year
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ABW,Aruba,high,,29390.0,,2021-01-01,NaT
AFG,Afghanistan,low,low,390.0,29.4,2021-01-01,2008-01-01
AGO,Angola,low,high,1710.0,51.3,2021-01-01,2018-01-01
ALB,Albania,low,low,6110.0,30.8,2021-01-01,2019-01-01
AND,Andorra,high,,46530.0,,2019-01-01,NaT
...,...,...,...,...,...,...,...
XKX,Kosovo,low,low,5130.0,29.0,2021-01-01,2017-01-01
YEM,"Yemen, Rep.",low,high,840.0,36.7,2018-01-01,2014-01-01
ZAF,South Africa,high,high,6530.0,63.0,2021-01-01,2014-01-01
ZMB,Zambia,low,high,1030.0,57.1,2021-01-01,2015-01-01


In [35]:
df_eco_dichot.isna().sum()

country              0
income_level         0
inequality_level    35
gnipc                0
gini                35
gnipc_year           0
gini_year           35
dtype: int64

In [36]:
df_eco_dichot.dtypes

country                     object
income_level                object
inequality_level            object
gnipc                      float64
gini                       float64
gnipc_year          datetime64[ns]
gini_year           datetime64[ns]
dtype: object

In [37]:
df_eco_dichot.to_csv(r'processed_data/eco_dichot.csv')

# Merging

### The dichotamised economic dataframe is merged with the femicide dataframe on the shared iso3 index

In [38]:
df = df_eco_dichot.merge(df_fem, how='left', left_index=True, right_index=True)
df

Unnamed: 0_level_0,country,income_level,inequality_level,gnipc,gini,gnipc_year,gini_year,1990-Female,1990-Male,1991-Female,...,2016-Female,2016-Male,2017-Female,2017-Male,2018-Female,2018-Male,2019-Female,2019-Male,2020-Female,2020-Male
iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,Aruba,high,,29390.0,,2021-01-01,NaT,0.00,0.00,,...,,,,,,,,,,
AFG,Afghanistan,low,low,390.0,29.4,2021-01-01,2008-01-01,,,,...,0.59,12.19,0.75,12.28,0.85,12.16,,,,
AGO,Angola,low,high,1710.0,51.3,2021-01-01,2018-01-01,,,,...,,,,,,,,,,
ALB,Albania,low,low,6110.0,30.8,2021-01-01,2019-01-01,,,,...,1.20,4.22,0.92,3.06,1.06,3.47,0.92,3.55,0.64,3.55
AND,Andorra,high,,46530.0,,2019-01-01,NaT,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XKX,Kosovo,low,low,5130.0,29.0,2021-01-01,2017-01-01,,,,...,,,,,,,,,,
YEM,"Yemen, Rep.",low,high,840.0,36.7,2018-01-01,2014-01-01,,,,...,,,,,,,,,,
ZAF,South Africa,high,high,6530.0,63.0,2021-01-01,2014-01-01,,,,...,9.27,59.05,10.14,61.96,9.46,64.10,,,,
ZMB,Zambia,low,high,1030.0,57.1,2021-01-01,2015-01-01,,,,...,,,,,,,,,,


In [39]:
df.to_csv(r'processed_data/fin_df.csv')