In [1]:
# This script takes the Maddison and Polity datasets, and interpolates or imputes missing values
# It then generates a dataframe in which each row is a year since 1900 for a country, with Maddison, Polity and combat data

In [2]:
import pandas as pd
import numpy as np
import datetime
import dateutil.parser
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm_notebook
pd.options.mode.chained_assignment = None

## Step 1: import and interpolate Maddison regional data ##

In [3]:
# Dataframe containing Maddison estimates for each region in each year, with some adjustments (see methodology)
# Later cells read its "region", "year", "gdp_per_capita" and "population" columns
maddison_region_df = pd.read_csv("source_data/maddison_economic_data/maddison_cleaned_regions.csv") 

In [4]:
from scipy.interpolate import interp1d

In [5]:
# Create linear interpolations for Maddison population and GDP per capita figures in each region

# Years covered by the analysis; later cells reuse this list, so it is defined once at notebook level
years = list(range(1900, 2018))

# Start from empty columns, then fill each region's rows by label rather than by position,
# so the result does not depend on how the rows of the source file are ordered
maddison_region_df["interpolated_gdp_per_capita"] = np.nan
maddison_region_df["interpolated_population"] = np.nan

# Loop through each region
for r in maddison_region_df["region"].unique():
    region_mask = maddison_region_df["region"] == r
    region_rows = maddison_region_df[region_mask]

    # The same steps apply to GDP per capita and to population
    for source_column, target_column in [("gdp_per_capita", "interpolated_gdp_per_capita"),
                                         ("population", "interpolated_population")]:

        # Rows with an actual figure (missing values fail the >= 0 comparison) anchor the interpolation
        known_rows = region_rows[region_rows[source_column] >= 0]

        # Linear interpolation between known figures, linear extrapolation outside them
        interpolate = interp1d(known_rows["year"], known_rows[source_column], fill_value="extrapolate")

        # Evaluate the function at the year of each of the region's rows and write it back to those rows
        maddison_region_df.loc[region_mask, target_column] = interpolate(region_rows["year"].values)

In [6]:
maddison_region_df

Unnamed: 0,year,region,gdp_per_capita,population,interpolated_gdp_per_capita,interpolated_population
0,1900,Africa,958.0,135290.0,958.00,135290.00
1,1901,Africa,,,970.76,137093.88
2,1902,Africa,,,983.52,138897.76
3,1903,Africa,,,996.28,140701.64
4,1904,Africa,,,1009.04,142505.52
5,1905,Africa,,,1021.80,144309.40
6,1906,Africa,,,1034.56,146113.28
7,1907,Africa,,,1047.32,147917.16
8,1908,Africa,,,1060.08,149721.04
9,1909,Africa,,,1072.84,151524.92


## Step 2: import and interpolate Maddison country data ##

In [7]:
# Dataframe containing Maddison estimates for each country in each year, with some adjustments (see methodology)
# The sheet is selected with `sheet_name`; the older `sheetname` spelling is rejected by current pandas releases
maddison_country_df = pd.read_excel("source_data/maddison_economic_data/maddison_country_data.xlsx",sheet_name="Full data")

In [8]:
# This dataframe lists all countries that have Maddison economic data and Polity regime data, with both Maddison and COW codes
# Later cells read its "participant_maddison_code", "participant_region" and "gwn_id" columns
country_converter_df = pd.read_csv("source_data/maddison_economic_data/country_converter.csv")

In [9]:
# Create a list of lists, where each row contains a country's region and existing Maddison data
country_year_lists = []
for c in tqdm_notebook(list(maddison_country_df["countrycode"].unique())): # Loop through countries

    # Select the country's rows once, rather than re-filtering the whole dataframe for every year
    country_rows = maddison_country_df[maddison_country_df["countrycode"]==c]
    country_name = country_rows.iloc[0]["country"]
    country_region = country_converter_df[country_converter_df["participant_maddison_code"]==c].iloc[0]["participant_region"]

    for y in years: # Loop through years

        # Defaults for years in which the country has no Maddison row
        country_population = None
        country_gdp_per_capita = None

        # Find relevant country row, if it exists
        # An explicit emptiness check is used so that genuine errors (e.g. a renamed column) are not silently ignored
        country_year_rows = country_rows[country_rows["year"]==y]
        if not country_year_rows.empty:
            country_year_row = country_year_rows.iloc[0]
            country_population = country_year_row["pop"] * 1000 # "pop" is multiplied by 1,000 here
            country_gdp_per_capita = country_year_row["cgdppc"]

        country_year_lists.append([c,country_name,country_region,y,country_population,country_gdp_per_capita])




In [10]:
# One row per country per year; population and gdp_per_capita are empty where Maddison has no figure
country_year_columns = ["maddison_code","country_name","country_region","year","population","gdp_per_capita"]
maddison_country_years_df = pd.DataFrame(data=country_year_lists, columns=country_year_columns)

In [11]:
# Function that interpolates a country's GDP per capita
def get_country_historical_gdp_per_capita(code):
    """Return the country's GDP per capita for every entry of `years`, as a list.

    Reads the notebook-level `maddison_country_years_df`, `maddison_region_df` and `years`.
    Known figures (values > 0) are joined by linear interpolation and extended by linear
    extrapolation. If the country has no figure for 1900, one is estimated by scaling its
    earliest known figure with the regional change between 1900 and that year.

    code -- the country's Maddison code, as stored in the "maddison_code" column
    """

    # Rows for this country, and the interpolated rows for the region it belongs to
    rows = maddison_country_years_df[maddison_country_years_df["maddison_code"]==code]
    region_name = rows.iloc[0]["country_region"]
    region = maddison_region_df[maddison_region_df["region"]==region_name]

    # Which years carry a usable figure, and which of them is the earliest
    has_figure = rows["gdp_per_capita"]>0
    earliest_year = np.min(rows[has_figure]["year"])
    figure_1900 = rows[rows["year"]==1900].iloc[0]["gdp_per_capita"]

    if figure_1900 > 0:

        # A 1900 figure exists: the known figures are used as they are
        xs = list(rows[has_figure]["year"])
        ys = list(rows[has_figure]["gdp_per_capita"])

    else:

        # No 1900 figure: scale the earliest known figure by the region's 1900-to-earliest-year ratio (see methodology)
        earliest_figure = rows[rows["year"]==earliest_year].iloc[0]["gdp_per_capita"]
        region_1900 = region[region["year"]==1900].iloc[0]["interpolated_gdp_per_capita"]
        region_earliest = region[region["year"]==earliest_year].iloc[0]["interpolated_gdp_per_capita"]
        estimate_1900 = earliest_figure*(region_1900 / region_earliest)

        # The estimate becomes the first point, followed by every known figure after 1900
        after_1900 = rows[has_figure & (rows["year"]>1900)]
        xs = [1900] + list(after_1900["year"])
        ys = [estimate_1900] + list(after_1900["gdp_per_capita"])

    # Build the interpolation function and evaluate it over all years
    return list(interp1d(xs, ys, fill_value="extrapolate")(years))

In [12]:
# Function that interpolates a country's population
def get_country_historical_population(code):
    """Return the country's population for every entry of `years`, as a list.

    Reads the notebook-level `maddison_country_years_df`, `maddison_region_df` and `years`.
    Known figures (values > 0) are joined by linear interpolation and extended by linear
    extrapolation. If the country has no figure for 1900, one is estimated by scaling its
    earliest known figure with the regional change between 1900 and that year.

    code -- the country's Maddison code, as stored in the "maddison_code" column
    """

    # Rows for this country, and the interpolated rows for the region it belongs to
    rows = maddison_country_years_df[maddison_country_years_df["maddison_code"]==code]
    region_name = rows.iloc[0]["country_region"]
    region = maddison_region_df[maddison_region_df["region"]==region_name]

    # Rows with a usable population figure, and the earliest year among them
    known = rows[rows["population"]>0]
    earliest_year = np.min(known["year"])

    # Does the country have a population figure for 1900?
    missing_1900 = not rows[rows["year"]==1900].iloc[0]["population"] > 0

    if missing_1900:

        # Scale the earliest known figure by the region's 1900-to-earliest-year population ratio (see methodology)
        earliest_figure = rows[rows["year"]==earliest_year].iloc[0]["population"]
        region_1900 = region[region["year"]==1900].iloc[0]["interpolated_population"]
        region_earliest = region[region["year"]==earliest_year].iloc[0]["interpolated_population"]
        estimate_1900 = earliest_figure*(region_1900 / region_earliest)

        # The estimate becomes the first point, followed by every known figure after 1900
        later = rows[(rows["population"]>0) & (rows["year"]>1900)]
        xs = [1900] + list(later["year"])
        ys = [estimate_1900] + list(later["population"])

    else:

        # The country has a population figure for 1900, so the known figures are used as they are
        xs = list(known["year"])
        ys = list(known["population"])

    # Build the interpolation function and evaluate it over all years
    interpolate = interp1d(xs, ys, fill_value="extrapolate")
    return list(interpolate(years))

In [13]:
# Run both interpolation functions for every country, collecting the results in country order
# The two lists end up with one value per country per year, and are then attached as columns
country_years_gdp_per_capita_list = []
country_years_population_list = []
for c in maddison_country_years_df["maddison_code"].unique():
    country_years_gdp_per_capita_list.extend(get_country_historical_gdp_per_capita(c))
    country_years_population_list.extend(get_country_historical_population(c))

maddison_country_years_df["interpolated_gdp_per_capita"] = country_years_gdp_per_capita_list
maddison_country_years_df["interpolated_population"] = country_years_population_list

In [14]:
maddison_country_years_df

Unnamed: 0,maddison_code,country_name,country_region,year,population,gdp_per_capita,interpolated_gdp_per_capita,interpolated_population
0,AFG,Afghanistan,Middle East,1900,,,1435.601006,4.883533e+06
1,AFG,Afghanistan,Middle East,1901,,,1454.728986,4.948646e+06
2,AFG,Afghanistan,Middle East,1902,,,1473.856966,5.013758e+06
3,AFG,Afghanistan,Middle East,1903,,,1492.984946,5.078871e+06
4,AFG,Afghanistan,Middle East,1904,,,1512.112925,5.143984e+06
5,AFG,Afghanistan,Middle East,1905,,,1531.240905,5.209097e+06
6,AFG,Afghanistan,Middle East,1906,,,1550.368885,5.274210e+06
7,AFG,Afghanistan,Middle East,1907,,,1569.496865,5.339323e+06
8,AFG,Afghanistan,Middle East,1908,,,1588.624845,5.404436e+06
9,AFG,Afghanistan,Middle East,1909,,,1607.752825,5.469549e+06


## Step 3: join historical conflict data to Maddison country data ##

In [15]:
# Dataframe which contains the death tolls for each country in each conflict in each year
# Later cells read its "state_conflict_type", "participant_maddison_code", "year" and "year_deaths" columns
country_conflict_years_df = pd.read_csv("output_data/country_conflict_years_df.csv",encoding="latin1")

In [16]:
# Edit robustness_check_conflict_types to restrict the analysis to particular types of conflict (for robustness checks)
robustness_check_conflict_types = ["Interstate", "Extrasystemic", "Internal", "Internationalised internal"]

# Keep only the conflict rows whose type is in the list above
conflict_type_is_selected = country_conflict_years_df["state_conflict_type"].isin(robustness_check_conflict_types)
country_conflict_years_df = country_conflict_years_df[conflict_type_is_selected]

In [17]:
# Create series of death tolls in each country in each year, to append to the Maddison country years dataframe

# Sum the deaths once per (country, year) pair instead of filtering country_conflict_years_df for every row;
# any problem with the data (e.g. a missing column) now raises instead of silently producing zero deaths
country_year_deaths_lookup = (
    country_conflict_years_df
    .groupby(["participant_maddison_code","year"])["year_deaths"]
    .sum()
    .to_dict()
)

# Look up each row of maddison_country_years_df, which is a country in a year; pairs with no conflict rows get 0 deaths
maddison_country_year_conflict_deaths_list = [
    country_year_deaths_lookup.get((country_code, country_year), 0)
    for country_code, country_year in zip(maddison_country_years_df["maddison_code"], maddison_country_years_df["year"])
]

# Append the series to maddison_country_years_df
maddison_country_years_df["country_year_conflict_deaths"] = maddison_country_year_conflict_deaths_list




In [18]:
# Create column for maddison_country_years_df holding conflict deaths divided by interpolated population
# NOTE(review): this is deaths per person; multiply by 100,000 if a rate per 100,000 people is wanted downstream
conflict_deaths = maddison_country_years_df["country_year_conflict_deaths"]
maddison_country_years_df["country_year_conflict_death_rate"] = conflict_deaths.div(maddison_country_years_df["interpolated_population"])

# Create column for maddison_country_years_df that is 1 when a country had at least 100 deaths in a year, else 0
# This threshold will be used to measure whether a country fights in a conflict or not
maddison_country_years_df["country_involved_in_conflict"] = (conflict_deaths >= 100) * 1
maddison_country_years_df

Unnamed: 0,maddison_code,country_name,country_region,year,population,gdp_per_capita,interpolated_gdp_per_capita,interpolated_population,country_year_conflict_deaths,country_year_conflict_death_rate,country_involved_in_conflict
0,AFG,Afghanistan,Middle East,1900,,,1435.601006,4.883533e+06,0.0,0.000000,0
1,AFG,Afghanistan,Middle East,1901,,,1454.728986,4.948646e+06,0.0,0.000000,0
2,AFG,Afghanistan,Middle East,1902,,,1473.856966,5.013758e+06,0.0,0.000000,0
3,AFG,Afghanistan,Middle East,1903,,,1492.984946,5.078871e+06,0.0,0.000000,0
4,AFG,Afghanistan,Middle East,1904,,,1512.112925,5.143984e+06,0.0,0.000000,0
5,AFG,Afghanistan,Middle East,1905,,,1531.240905,5.209097e+06,0.0,0.000000,0
6,AFG,Afghanistan,Middle East,1906,,,1550.368885,5.274210e+06,0.0,0.000000,0
7,AFG,Afghanistan,Middle East,1907,,,1569.496865,5.339323e+06,0.0,0.000000,0
8,AFG,Afghanistan,Middle East,1908,,,1588.624845,5.404436e+06,0.0,0.000000,0
9,AFG,Afghanistan,Middle East,1909,,,1607.752825,5.469549e+06,0.0,0.000000,0


## Step 4: join Polity 4 democracy scores to the country-year conflict data ##

In [19]:
# Dataframe containing Polity scores of democracy and autocracy for each country in each year
# Later cells read its "ccode", "year", "democ" and "polity2" columns
polity_df = pd.read_csv("source_data/polity_4_democracy_data/polity_4_democracy_scores.csv")

In [20]:
polity_df

Unnamed: 0,cyear,ccode,scode,country,year,flag,fragment,democ,autoc,polity,...,interim,bmonth,bday,byear,bprec,post,change,d4,sf,regtrans
0,21800,2,USA,United States,1800,0,,7,3,4,...,,1.0,1.0,1800.0,1.0,4.0,88.0,1.0,,
1,21801,2,USA,United States,1801,0,,7,3,4,...,,,,,,,,,,
2,21802,2,USA,United States,1802,0,,7,3,4,...,,,,,,,,,,
3,21803,2,USA,United States,1803,0,,7,3,4,...,,,,,,,,,,
4,21804,2,USA,United States,1804,0,,7,3,4,...,,,,,,,,,,
5,21805,2,USA,United States,1805,0,,7,3,4,...,,,,,,,,,,
6,21806,2,USA,United States,1806,0,,7,3,4,...,,,,,,,,,,
7,21807,2,USA,United States,1807,0,,7,3,4,...,,,,,,,,,,
8,21808,2,USA,United States,1808,0,,7,3,4,...,,,,,,,,,,
9,21809,2,USA,United States,1809,0,,9,0,9,...,,3.0,5.0,1809.0,1.0,9.0,5.0,1.0,,2.0


In [21]:
# Create a dictionary for all Eastern European blocs, which specifies which bloc / country_id to use in each year
# Structure: {country code: {start year: code whose Polity rows are used from that year onwards}}
# The scoring function only consults this dictionary for years in which the country's own code has no Polity row
# All former Soviet states are mapped to Russia (365) from 1900 and to the USSR (364) from 1922
polity_bloc_dict = {364:{1900:365}, # USSR
                      371:{1900:365,1922:364}, # Armenia
                      373:{1900:365,1922:364}, # Azerbaijan
                      370:{1900:365,1922:364}, # Belarus
                      366:{1900:365,1922:364}, # Estonia
                      372:{1900:365,1922:364}, # Georgia
                      705:{1900:365,1922:364}, # Kazakhstan
                      703:{1900:365,1922:364}, # Kyrgyzstan
                      367:{1900:365,1922:364}, # Latvia
                      368:{1900:365,1922:364}, # Lithuania
                      359:{1900:365,1922:364}, # Moldova
                      365:{1922:364}, # Russia
                      702:{1900:365,1922:364}, # Tajikistan
                      701:{1900:365,1922:364}, # Turkmenistan
                      369:{1900:365,1922:364}, # Ukraine
                      704:{1900:365,1922:364}, # Uzbekistan
                      346:{1900:342,1921:345}, # Bosnia
                      344:{1900:342,1921:345}, # Croatia
                      343:{1900:342,1921:345}, # Macedonia
                      348:{1900:342,1921:345,1991:347}, # Montenegro
                      342:{1921:345,1991:347}, # Serbia
                      349:{1900:342,1921:345}, # Slovenia
                      316:{1918:315}, # Czech Republic
                      317:{1918:315}, # Slovakia
                      }

In [22]:
# Helper that looks up a single Polity row
def _get_polity_year_row(gwn_id, year):
    """Return the first polity_df row for the given ccode and year, or None when there is no such row."""
    matching_rows = polity_df[(polity_df["ccode"]==gwn_id) & (polity_df["year"]==year)]
    if matching_rows.empty:
        return None
    return matching_rows.iloc[0]


# Function that generates a Polity score for each country in each year
def get_country_historical_democracy_scores(code):
    """Return one polity2-based score per entry of `years` for a Maddison country code, as a list.

    Reads the notebook-level `country_converter_df`, `polity_df`, `polity_bloc_dict` and `years`.
    For each gwn_id linked to the code, the score for a year comes from the country's own Polity
    row or, when it has none, from the bloc that `polity_bloc_dict` assigns to it for that year.
    Rows whose "democ" is neither >= 0 nor <= -88 (the -66 / -77 codes) reuse the last score seen.
    Scores are averaged across gwn_ids, and years without any score are returned as -11.

    code -- the country's Maddison code, as stored in "participant_maddison_code"
    """

    # Get all gwn_ids for the Maddison country code (some Maddison codes have multiple COW states)
    country_gwn_ids = list(country_converter_df[country_converter_df["participant_maddison_code"]==code]["gwn_id"])
    country_gwn_ids_years_democracy_scores_list = []

    # Loop through each gwn_id
    for i in country_gwn_ids:
        country_years_democracy_scores_list = []
        country_last_democracy_score = None # For interregnum years with no polity score

        # Bloc assignments for this gwn_id, ordered by start year so that the latest applicable one wins
        # regardless of the order in which the dictionary was written; empty for countries outside any bloc
        bloc_start_years = sorted(polity_bloc_dict.get(i, {}).items())

        # Loop through all years from 1900 to 2017
        for y in years:
            country_democracy_score = None # The default value if a country has never previously had a polity2 score

            # Find the Polity row for the gwn_id and year
            country_year_row = _get_polity_year_row(i, y)

            if country_year_row is None:

                # The country has no row of its own: use the bloc whose start year is the latest one not after this year
                alternative_country_gwn_id = None
                for bloc_start_year, bloc_gwn_id in bloc_start_years:
                    if y >= bloc_start_year:
                        alternative_country_gwn_id = bloc_gwn_id
                if alternative_country_gwn_id is not None:
                    country_year_row = _get_polity_year_row(alternative_country_gwn_id, y)

            if country_year_row is not None:

                # If the row has a democracy score or -88 code (signalling transition), use its polity2 score
                if country_year_row["democ"] >= 0 or country_year_row["democ"] <= -88:
                    country_democracy_score = country_year_row["polity2"]
                    country_last_democracy_score = country_democracy_score # Remember it for any interregnum that follows
                else:
                    # If the row has a -66 or -77 code, signalling interruption or interregnum, use the last polity2 score
                    country_democracy_score = country_last_democracy_score

            country_years_democracy_scores_list.append(country_democracy_score) # Add the year's polity2 score to the gwn_id's list

        # If a country has multiple gwn_ids, add the polity2 scores for all of those to the list of lists
        country_gwn_ids_years_democracy_scores_list.append(country_years_democracy_scores_list)

    # Concatenate all gwn-id scores into a dataframe, and then take the annual average across gwn-ids (for use in some blocs, like Vietnam)
    country_gwn_ids_years_democracy_scores_df = pd.DataFrame(country_gwn_ids_years_democracy_scores_list).T
    country_gwn_ids_years_democracy_scores = list(country_gwn_ids_years_democracy_scores_df.apply(lambda x:np.mean(x),axis=1))

    # Add a -11 polity score for countries that have no data in a given year (usually colonies)
    country_gwn_ids_years_democracy_scores = [-11 if not s >= -10 else s for s in country_gwn_ids_years_democracy_scores]

    return country_gwn_ids_years_democracy_scores

In [23]:
# Score every country in maddison_country_years_df, collecting the yearly scores in country order
country_democracy_scores = []
maddison_codes = list(maddison_country_years_df["maddison_code"].unique())
for c in tqdm_notebook(maddison_codes):
    country_democracy_scores.extend(get_country_historical_democracy_scores(c))




In [24]:
# Append these scores as a column to maddison_country_years_df
# The list holds one score per year for each maddison_code, in order of first appearance,
# so it must have exactly one entry per dataframe row for this assignment to succeed
maddison_country_years_df["country_democracy_scores"] = country_democracy_scores

## Step 5: tidy up blocs and export data ##

In [25]:
# For countries that split or merge over the period, keep only the relevant rows
# Structure: {Maddison code: {"start_year": first year kept, "end_year": last year kept}}, both inclusive;
# the next cell drops that code's rows for years before start_year or after end_year
# Soviet states are treated as part of the Russian Empire / Soviet Union from 1900 to 1990
# Yugoslav states are treated as part of Yugoslavia from 1900 to 1990
# Czechoslovak states are treated as part of Czechoslovakia from 1900 to 1992
# Germany, Vietnam and Yemen are treated as their dual parts for the relevant years
year_bloc_dict = {"SUN":{"start_year":1900,"end_year":1990}, # Soviet Union
                  "ARM":{"start_year":1991,"end_year":2017}, # Armenia
                  "AZE":{"start_year":1991,"end_year":2017}, # Azerbaijan
                  "BLR":{"start_year":1991,"end_year":2017}, # Belarus
                  "EST":{"start_year":1991,"end_year":2017}, # Estonia
                  "GEO":{"start_year":1991,"end_year":2017}, # Georgia
                  "KAZ":{"start_year":1991,"end_year":2017}, # Kazakhstan
                  "KGZ":{"start_year":1991,"end_year":2017}, # Kyrgyzstan
                  "LVA":{"start_year":1991,"end_year":2017}, # Latvia
                  "LTU":{"start_year":1991,"end_year":2017}, # Lithuania
                  "MDA":{"start_year":1991,"end_year":2017}, # Moldova
                  "RUS":{"start_year":1991,"end_year":2017}, # Russia
                  "TJK":{"start_year":1991,"end_year":2017}, # Tajikistan
                  "TKM":{"start_year":1991,"end_year":2017}, # Turkmenistan
                  "UKR":{"start_year":1991,"end_year":2017}, # Ukraine
                  "UZB":{"start_year":1991,"end_year":2017}, # Uzbekistan
                  "YUG":{"start_year":1900,"end_year":1990}, # Yugoslavia
                  "BIH":{"start_year":1991,"end_year":2017}, # Bosnia
                  "HRV":{"start_year":1991,"end_year":2017}, # Croatia
                  "MKD":{"start_year":1991,"end_year":2017}, # Macedonia
                  "MNE":{"start_year":1991,"end_year":2017}, # Montenegro
                  "SRB":{"start_year":1991,"end_year":2017}, # Serbia
                  "SVN":{"start_year":1991,"end_year":2017}, # Slovenia
                  "CSK":{"start_year":1900,"end_year":1992}, # Czechoslovakia
                  "CZE":{"start_year":1993,"end_year":2017}, # Czech Republic
                  "SVK":{"start_year":1993,"end_year":2017}, # Slovakia
                  "GDR":{"start_year":1946,"end_year":1989}, # East Germany
                  "GFR":{"start_year":1946,"end_year":1989}, # West Germany
                  "DRV":{"start_year":1955,"end_year":1975}, # North Vietnam
                  "RVN":{"start_year":1955,"end_year":1975}, # South Vietnam
                  "YAR":{"start_year":1900,"end_year":1989}, # North Yemen
                  "YPR":{"start_year":1900,"end_year":1989}, # South Yemen
                  "YEM":{"start_year":1990,"end_year":2017} # Yemen
}

In [26]:
# For every code in year_bloc_dict, drop its rows that fall outside the code's start_year..end_year window,
# i.e. the years in which the country was part of a different bloc
for bloc_code, bloc_years in year_bloc_dict.items():
    is_bloc_code = maddison_country_years_df["maddison_code"]==bloc_code
    before_start = maddison_country_years_df["year"] < bloc_years["start_year"]
    after_end = maddison_country_years_df["year"] > bloc_years["end_year"]
    maddison_country_years_df = maddison_country_years_df[~(is_bloc_code & (before_start | after_end))]

In [27]:
# Maddison codes of countries that have no Polity scores at all
polity_no_data_list = [
    "BRB", # Barbados
    "DMA", # Dominica
    "HKG", # Hong Kong
    "ISL", # Iceland
    "LCA", # Saint Lucia
    "MLT", # Malta
    "PRI", # Puerto Rico
    "PSE", # Palestine
    "STP", # Sao Tome and Principe
    "SYC", # Seychelles
]

In [28]:
# Remove every row whose country is listed in polity_no_data_list
has_no_polity_data = maddison_country_years_df["maddison_code"].isin(polity_no_data_list)
maddison_country_years_df = maddison_country_years_df[~has_no_polity_data]

In [29]:
# Drop unified Germany (1946-1989) and unified Vietnam (1955-1975) from maddison_country_years_df,
# i.e. the years in which they were divided
for unified_code, first_divided_year, last_divided_year in [("DEU",1946,1989), ("VNM",1955,1975)]:
    is_unified_code = maddison_country_years_df["maddison_code"]==unified_code
    in_divided_years = (maddison_country_years_df["year"]>=first_divided_year) & (maddison_country_years_df["year"]<=last_divided_year)
    maddison_country_years_df = maddison_country_years_df[~(is_unified_code & in_divided_years)]

In [30]:
# Renumber the rows from 0 now that rows have been dropped, discarding the old index
maddison_country_years_df = maddison_country_years_df.reset_index(drop=True)
maddison_country_years_df

Unnamed: 0,maddison_code,country_name,country_region,year,population,gdp_per_capita,interpolated_gdp_per_capita,interpolated_population,country_year_conflict_deaths,country_year_conflict_death_rate,country_involved_in_conflict,country_democracy_scores
0,AFG,Afghanistan,Middle East,1900,,,1435.601006,4.883533e+06,0.0,0.000000,0,-6.0
1,AFG,Afghanistan,Middle East,1901,,,1454.728986,4.948646e+06,0.0,0.000000,0,-6.0
2,AFG,Afghanistan,Middle East,1902,,,1473.856966,5.013758e+06,0.0,0.000000,0,-6.0
3,AFG,Afghanistan,Middle East,1903,,,1492.984946,5.078871e+06,0.0,0.000000,0,-6.0
4,AFG,Afghanistan,Middle East,1904,,,1512.112925,5.143984e+06,0.0,0.000000,0,-6.0
5,AFG,Afghanistan,Middle East,1905,,,1531.240905,5.209097e+06,0.0,0.000000,0,-6.0
6,AFG,Afghanistan,Middle East,1906,,,1550.368885,5.274210e+06,0.0,0.000000,0,-6.0
7,AFG,Afghanistan,Middle East,1907,,,1569.496865,5.339323e+06,0.0,0.000000,0,-6.0
8,AFG,Afghanistan,Middle East,1908,,,1588.624845,5.404436e+06,0.0,0.000000,0,-6.0
9,AFG,Afghanistan,Middle East,1909,,,1607.752825,5.469549e+06,0.0,0.000000,0,-6.0


In [31]:
# Export the final country-year dataframe; index=False is not passed, so the row index is written as the first column
maddison_country_years_df.to_csv("output_data/maddison_and_polity_country_years_df.csv")