## Feature engineering, feature creation and feature storage

- Data normalization (min-max scaling of each feature)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Alternative to set svg for newer versions
%matplotlib inline


# Display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Min-max scale `ccode` and the three categorical entry/exit columns.
# The bounds are kept in `col_lo` / `col_hi` instead of `min` / `max`: rebinding
# those names in the notebook namespace hides the builtins from every later cell.

# ccode int
col_lo, col_hi = data_all['ccode'].min(), data_all['ccode'].max()
data_all['ccode'] = (data_all['ccode'] - col_lo) / (col_hi - col_lo)

# Ordinal code for each label, per column (regular 1...)
category_codes = {
    'entry': {'Regular': 1, 'Irregular': 2, 'Foreign Imposition': 3},
    'exit': {
        'Regular': 1,
        'Irregular': 2,
        'Foreign': 3,
        'Natural Death': 4,
        'Retired Due to Ill Health': 5,
        'Suicide': 6,
        'Still in Office': 7,
    },
    'exitcode': {
        'Assassination by Unsupported Individual': 1,
        'Irregular, Other': 2,
        'Popular Protest, with Foreign Support': 3,
        'Popular Protest, without Foreign Support': 4,
        'Regular': 5,
        'Removed by Military, with Foreign Support': 6,
        'Removed by Military, without Foreign Support': 7,
        'Removed by Other Government Actors, with Foreign Support': 8,
        'Removed by Other Government Actors, without Foreign Support': 9,
        'Removed by Rebels, with Foreign Support': 10,
        'Removed by Rebels, without Foreign Support': 11,
        'Removed in Military Power Struggle Short of Coup': 12,
        'Removed through Threat of Foreign Force': 13,
        'Unknown': 14,
        'Still in Office': 15,
    },
}

for column, codes in category_codes.items():
    # A label missing from the mapping is left untouched by replace(), so the
    # float cast below fails loudly instead of producing a silent bad code.
    encoded = data_all[column].replace(codes).astype(float, errors='raise')
    col_lo, col_hi = encoded.min(), encoded.max()
    data_all[column] = (encoded - col_lo) / (col_hi - col_lo)

In [None]:
# posttenurefate: ordinal-encode the post-tenure fate labels, then min-max scale.
#
# With regex=True and a numeric replacement, pandas replaces the whole cell as
# soon as the pattern is found anywhere inside it, and the pairs are applied in
# dict order. Unanchored, 'Death' therefore also captured
# 'Missing: Natural Death within Six Months of Losing Office' (coded 1 instead
# of 4). Every exact label is now anchored with ^...$ so only full matches count.
posttenurefate_codes = {
    r'^Death$': 1,
    r'^Exile$': 2,
    r'^Imprisonment$': 3,
    r'^Missing: Natural Death within Six Months of Losing Office$': 4,
    r'^Missing: No Information Found$': 5,
    r'^OK$': 6,
    r'^Suicide$': 7,
    r'^Leader Still in Office$': 8,
    # Prefix match. The original 'after*' looks like a glob-style wildcard for a
    # trailing suffix on this label -- TODO confirm against the raw values.
    r'^Missing: Left Office after': 9,
}
data_all['posttenurefate'] = data_all['posttenurefate'].replace(posttenurefate_codes, regex=True)

# Any label that matched no pattern is still a string here, so this cast raises.
data_all['posttenurefate'] = data_all['posttenurefate'].astype(float, errors='raise')

# Bounds are named col_lo / col_hi so the builtins min / max are not rebound.
col_lo, col_hi = data_all['posttenurefate'].min(), data_all['posttenurefate'].max()
data_all['posttenurefate'] = (data_all['posttenurefate'] - col_lo) / (col_hi - col_lo)

# Min-max scale the remaining numeric columns, in the same order as before.
# One loop replaces the thirteen copy-pasted stanzas, and the bounds live in
# col_lo / col_hi so the builtins min / max are no longer rebound.
numeric_columns = [
    'prevtimesinoffice',
    'yrborn',
    'yrdied',
    'numentry',
    'numexit',
    'yrbegin',
    'yrend',
    'age',
    'tenure',
    'end_gdppc',
    'pop_x',
    'begin_gdppc',
    'pop_y',
]
for column in numeric_columns:
    col_lo, col_hi = data_all[column].min(), data_all[column].max()
    data_all[column] = (data_all[column] - col_lo) / (col_hi - col_lo)

# Drop the rows whose growth_rate is missing, then show the remaining shape
data_all.dropna(subset=['growth_rate'], inplace=True)
data_all.shape

In [None]:
# Remove the columns that are not used from here on
for unused_column in ('end_gdppc', 'country'):
    del data_all[unused_column]

# Fig.2: correlation plot
# Pairwise correlations of all the variables used in this study sample
# (leaders' age, leaders' tenure, ...)
corr = data_all.corr()

# Draw the correlation matrix as a colour-coded grid with one labelled
# row and column per variable
fig = plt.figure(figsize=(8, 8))
corr_plot = plt.matshow(corr, fignum=fig.number, cmap='RdBu')
fig.colorbar(corr_plot)

column_labels = corr.columns
tick_positions = range(len(column_labels))
plt.xticks(tick_positions, column_labels, rotation='vertical')
plt.yticks(tick_positions, column_labels)

### Group the tenure into five possible values

- 1 for a single term - 4 years or less;
- 2 for two terms - between 5 and 8 years;
- 3 for three terms - between 9 and 12 years;
- 4 for four terms - between 13 and 16 years;
- 5 for five or more terms - more than 16 years.

In [None]:
# data.dtypes

# Visualize the differences about leaders' gender for the whole data set
# ax= sns.histplot(data.gender, shrink = 0.8, bins=10, color='grey') 
# import numpy as np
# data['fties_range'] = np.where(data['fties'].isnull(), 0.5, 1)
# data['gender'] = np.where(data['gender']=='M', 0.5, 1)
#data['logarithm_gdppc'] = np.log(data['gdppc'])
#data['normal_tenure'] = (data['tenure'] - data['tenure'].mean()) / data['tenure'].std()#normalize tenure with z-score
# corr= data.corr()

# # Visualize the correlation between each column
# fig = plt.figure(figsize =(8,8))
# #plt.matshow(corr, cmap='RdBu', fignum = fig.number)
# plt.xticks(range(len(corr.columns)), corr.columns, rotation = 'vertical');
# sns.heatmap(corr, cmap='RdBu')
# plt.yticks(range(len(corr.columns)), corr.columns)
# fig.savefig('heatmap.png',dpi=600)

In [None]:
from scipy.stats import norm

# Group growth_rate into seven bands measured in standard deviations around the
# mean of a normal distribution fitted to the column:
#   7: below mu-3*std      6: [mu-3*std, mu-2*std)   5: [mu-2*std, mu-std)
#   4: [mu-std, mu+std)    3: [mu+std, mu+2*std)     2: [mu+2*std, mu+3*std)
#   1: mu+3*std and above
# The lower cut points are mu - k*std. The earlier -mu - k*std form is only
# correct when mu is 0 and otherwise shifts the lower bands by 2*mu.
mu, std = norm.fit(data_all['growth_rate'])
growth_rate = data_all['growth_rate']
conditions = [
    (growth_rate < mu - 3*std),
    (growth_rate >= mu - 3*std) & (growth_rate < mu - 2*std),
    (growth_rate >= mu - 2*std) & (growth_rate < mu - std),
    (growth_rate >= mu - std) & (growth_rate < mu + std),
    (growth_rate >= mu + std) & (growth_rate < mu + 2*std),
    (growth_rate >= mu + 2*std) & (growth_rate < mu + 3*std),
    (growth_rate >= mu + 3*std),
]

# Lowest band gets 7, highest band gets 1
values = [7, 6, 5, 4, 3, 2, 1]

# Rows matching no condition get np.select's default of 0
data_all['growth_rate_normgroup'] = np.select(conditions, values)

In [None]:
# Group growth_rate into seven equal-width bands between its minimum and maximum.
# The bounds are named rate_lo / rate_hi so the builtins min / max are not
# rebound in the notebook namespace.
growth_rate = data_all['growth_rate']
rate_lo, rate_hi = growth_rate.min(), growth_rate.max()
rate_span = rate_hi - rate_lo

# Six interior cut points at 1/7 ... 6/7 of the range
edges = [rate_lo + k / 7 * rate_span for k in range(1, 7)]

conditions = (
    [growth_rate < edges[0]]
    + [(growth_rate >= left) & (growth_rate < right) for left, right in zip(edges, edges[1:])]
    + [growth_rate >= edges[-1]]
)

# Lowest band gets 7, highest band gets 1
values = [7, 6, 5, 4, 3, 2, 1]

data_all['growth_rate_avggrp'] = np.select(conditions, values)

In [None]:
# Save dataset to CSV file for AutoGluon
# data_all.to_csv(r'data_AutoGluon.csv', index = False)

In [None]:
# List the columns present in data_all at this point
data_all.columns