# Analysing Political Executives with AI

In [1]:
# Install the packages for data anlysis in Python
# Install the packages for data analysis and visualisation
# !pip install seaborn pandas matplotlib numpy IPython openpyxl

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Alternative to set svg for newer versions
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Showing all the details of results
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Built a total dataset

- Merging data from other files
- Read the data from Archigos excel document/dta data


In [3]:
# Read the original dta(Stata) data download from Archigos website.
# load the data
datafile = 'Archigos_4.1_stata14.dta'
data = pd.read_stata(datafile)

# Startdate is better than eindate, enddate is better than eoutdate
# Change the data type of startdate and enddate to datatime
data['startdate'] = data['startdate'].astype('datetime64[ns]')
data['enddate'] = data['enddate'].astype('datetime64[ns]')
data['yrbegin'] = pd.DatetimeIndex(data['startdate']).year
data['yrend'] = pd.DatetimeIndex(data['enddate']).year

# Import the data about countries grouped by three categories: Autocracy, Anocracy, Democracy
data_deau = pd.read_excel('DeorAu_group.xlsx')

# load another excel file mpd2020 with gdppc and pop
data_mpd2020 = pd.read_excel('mpd2020.xlsx', 'Full data')

# Combine data_deau with data_leader with key id=countries
# Rename the Country-id to idacr in data_deau
data_deau = data_deau.rename({'Country-id': 'idacr'}, axis='columns')
data_leader_deau = pd.merge(data, data_deau, how ='left', on ='idacr')

# Rename the countrycode to idacr in data_mpd2020
# Rename the year in gdppc to yrend to merge data
data_mpd2020 = data_mpd2020.rename({'countrycode': 'idacr', 'year': 'yrend'}, axis='columns')

# Change the float to int
data_leader_deau['yrend'] = data_leader_deau['yrend'].fillna(0.0).astype(int)
data_leader_deau['yrbegin'] = data_leader_deau['yrbegin'].fillna(0.0).astype(int)

# Combine data_leaders_deau with data_mpd2020 with key idacr and year
data_leader_deau_gdp = pd.merge(data_leader_deau, data_mpd2020, how ='left', on =['idacr', 'yrend'])

# Rename the gdppc to end_gdppc in data_leaders_deau_gdp
data_leader_deau_gdp = data_leader_deau_gdp.rename({'gdppc': 'end_gdppc'}, axis='columns')

data_gdp = pd.read_excel('data_gdp.xlsx')
data_gdp = data_gdp.rename({'year': 'yrbegin'}, axis='columns')
data_all = pd.merge(data_leader_deau_gdp, data_gdp, how ='left', on =['idacr', 'yrbegin'])

# Rename the gdppc to begin_gdppc in data_all
data_all = data_all.rename({'gdppc': 'begin_gdppc'}, axis='columns')

# Calculate the leaders' age, when they first into power
# See the characteristics of yrborn, there is something wrong in the data. 
# Because I change the data type of yrborn, from int to datetime.

# age= yrbegin- yrborn
data_all['age'] = data_all['yrbegin'] - data_all['yrborn']

# See the distribution of age
# data_all.groupby(['age']).mean()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


## Calculate their tenure
 
- Calculate the tenure
- Check the data about our data about tenure 
- Delete tenure < 1
- Group the data by leaders and sum their tenure together ？？

In [4]:
# tenure= enddate or eoutdate - startdate  or  eindate (Unit: years)
# if enddate is null, it has two possibilities: 
# 1. leader is still in office, use the date now; 2. not available.  

import datetime

data_all['startdate'] = pd.to_datetime(data_all['startdate']).dt.date
data_all['enddate'] = pd.to_datetime(data_all['enddate']).dt.date

data_all['enddate'] = np.where(data_all['enddate'].isnull(),
                                datetime.datetime.now().date(),
                                data_all['enddate'])

data_all['tenure'] = (data_all['enddate'] - data_all['startdate'])/np.timedelta64(1, 'Y')

# data_all.groupby(['tenure']).mean()

data_all.drop(data_all[data_all['tenure'] < 1].index , inplace=True)

# Add tenure to the excel grouped by leaders
# Some leaders may have more than one term
# Tenure: merge the duplicates and sum the tenure.
# data_all=data_all.groupby(['leader']).sum()

# Check if there are any abonormal data_all.
# data_all.groupby(['tenure']).mean()

In [5]:

# calulate the growth rate
data_all['growth_rate'] = (data_all['end_gdppc'] / data_all['begin_gdppc'] - 1) / data_all['tenure']

# data_all.describe()

data_all.dropna(subset = ['growth_rate'], inplace=True)

In [6]:
data_all.dtypes

obsid                                        object
leadid                                       object
ccode                                         int16
idacr                                        object
leader                                       object
startdate                                    object
eindate                              datetime64[ns]
enddate                                      object
eoutdate                             datetime64[ns]
entry                                        object
exit                                         object
exitcode                                     object
prevtimesinoffice                              int8
posttenurefate                               object
gender                                       object
yrborn                                        int16
yrdied                                        int16
borndate                                     object
ebirthdate                           datetime64[ns]
deathdate   

## Data cleaning

### Delete the irrelevant data with irrelevant columns

In [7]:
# 'country_x','country_y'
data_all.drop(columns=['obsid',
                   'leadid',
                   'idacr',
                   'startdate',
                   'eindate',
                   'enddate',
                   'eoutdate',
                   'borndate',
                   'ebirthdate',
                   'deathdate',
                   'edeathdate',
                   'dbpediauri',
                  'numexitcode',
                  'numposttenurefate',
                  'ftcur',
                  'Country',
                  'Polity datasets IV number[11][12]',
                  'Polity datasets IV category'], inplace=True)

# Deal with family ties and gender 
data_all['fties_range'] = np.where(data_all['fties'].isnull(), 0.5, 1)
data_all['gender'] = np.where(data_all['gender']=='M', 0.5, 1)


data_all.drop(columns=['fties',
                       'Democracy',
                       'Autocracy'], inplace=True)
                       
# Delete the columns that more than 30% is null 
# Delete the column >= 30% is null, we can see the remaining column names.
null_sum=data_all.isnull().sum()
data_all.columns[null_sum<len(data_all)*0.3] # columns will keep

# Drop the columns that at least 30% values are null
data_all.drop(columns=data_all.columns[null_sum > len(data_all)*0.3], inplace=True)

# Check if there are any abnormal values, we need to pay attention to the negative numbers. 
# Some leaders are still alive, so the yrdied is negative number, etc.
# print(data.describe())

Index(['ccode', 'leader', 'entry', 'exit', 'exitcode', 'prevtimesinoffice',
       'posttenurefate', 'gender', 'yrborn', 'yrdied', 'numentry', 'numexit',
       'yrbegin', 'yrend', 'country', 'end_gdppc', 'pop_x', 'begin_gdppc',
       'pop_y', 'age', 'tenure', 'growth_rate', 'fties_range'],
      dtype='object')

In [8]:
data_all.dropna(subset = ['leader'], inplace=True)

In [None]:
# First we see the characteristics of entry, there are four different kinds of types.
# data.groupby(['entry']).mean()

# Group the leaders by entry, see the compare between each group.
# ax= sns.histplot(data['entry'])

# Second we see the characteristics of exit, there are eight different kinds of types.
# data.groupby(['exit']).mean()

# Group the leaders by exit, see the compare between each group.
# sns.countplot(y='exit',data=data)

In [9]:
# Delete if yrbegin > yrend, age < 0
# print(data_all[data_all['yrbegin'] > data_all['yrend']]) #age < 0
data_all.drop(data_all[data_all['yrbegin'] > data_all['yrend']].index, inplace=True)

(data_all['yrbegin'] < 0).count() #yrbegin > 0

# Delete the data_all yrbegin < 0, yrend < 0, yrborn<0, yrdied<0
data_all.drop(data_all[data_all['yrborn']<0].index , inplace=True) 
data_all.drop(data_all[data_all['yrdied']<0].index, inplace=True)
data_all.drop(data_all[data_all['yrbegin']<0].index , inplace=True) 
data_all.drop(data_all[data_all['yrend']<0].index, inplace=True)
# print(data_all.shape)

# Find if yrbegin > yrdied, lifespan < 0 
# print(data_all[data_all['yrbegin'] > data_all['yrdied']]) # begin to come into power after the leader is died.

# Delete the data_all yrbegin > yrdied
data_all.drop(data_all[data_all['yrbegin'] > data_all['yrdied']].index, inplace=True) 

922

### Duplicated leader with more than one age, we pick up the first??

In [None]:
# # Find duplicates and sum the duplicates.
# print(data.leader.duplicated().sum())

# # Sum the non-duplicates.
# print((~data.leader.duplicated()).sum())

# # The first occurrence gets kept, and all others get identified as duplicates
# data.leader=data.leader.drop_duplicates(keep='first')

# # Save the groupby leader dataframe
# data_leader = data.groupby(['leader'])
# data.to_excel(r'data_all_leaders_group.xlsx', index = False)

# print(data_leader.dtypes)

# data_leader.describe()

### Draw a figure about age

In [None]:
# # See the distribution of age
# import seaborn as sns
# ax = sns.histplot(data.age, kde=True)

# import seaborn as sns
# sns.histplot(data.age,bins=[1, 30, 40, 50, 60, 70, 80, 90] ,color = 'orange', shrink=.9)

### Group the tenure into five possible values

- 1 for single term - equal to or less than 4 years; 
- 2 for two terms - between 5 and 8 years;
- 3 for three terms -  between 9 and 12 years;
- 4 for four terms - between 13 and 16 years;
- 5 for five or more terms)</font> 

In [None]:
# Divied tenure group to five groups
# tenure_groups= pd.cut(data['tenure'], bins=[1, 4, 8, 12, 16, 20, np.inf])
# data.groupby(tenure_groups).mean()

# See the distribution of tenure
# ax = sns.histplot(data.tenure,kde=True, shrink = 0.8, bins=20) 

                 #, element="step", fill=False)

In [None]:
# See the distribution of tenure
# sns.histplot(data.tenure,bins=[0, 4, 8, 12, 16, 20, np.inf], color = 'pink', shrink=.9)

In [None]:
# data.dtypes

# Visualize the differences about leaders' gender for the whole data set
# ax= sns.histplot(data.gender, shrink = 0.8, bins=10, color='grey') 

In [None]:
# # The correlations of all the variables used in this study sample: leaders' age, leaders' tenure
# corr= data.corr()
# corr

# # Visualize the corelation between each column
# fig = plt.figure(figsize =(8,8))
# plt.matshow(corr, cmap='RdBu', fignum = fig.number)
# plt.xticks(range(len(corr.columns)), corr.columns, rotation = 'vertical');
# plt.yticks(range(len(corr.columns)), corr.columns);

In [None]:
# The relationship between age and tenure
# data.plot(kind = 'scatter', x = 'age', y = 'tenure', figsize = (6,6))

# Save the py file to excel
# data.to_excel('data_all_leaders_group.xlsx', index = False)
# Tranfert the .py file to excel

In [None]:
# import seaborn as sns
# ax= sns.histplot(data_all['growth_rate'])

In [None]:
# import numpy as np
# data['fties_range'] = np.where(data['fties'].isnull(), 0.5, 1)
# data['gender'] = np.where(data['gender']=='M', 0.5, 1)
#data['logarithm_gdppc'] = np.log(data['gdppc'])
#data['normal_tenure'] = (data['tenure'] - data['tenure'].mean()) / data['tenure'].std()#normalize tenure with z-score

In [None]:
# corr= data.corr()

# #visualize the corelation between each column
# fig = plt.figure(figsize =(8,8))
# #plt.matshow(corr, cmap='RdBu', fignum = fig.number)
# plt.xticks(range(len(corr.columns)), corr.columns, rotation = 'vertical');
# sns.heatmap(corr, cmap='RdBu')
# plt.yticks(range(len(corr.columns)), corr.columns)
# fig.savefig('heatmap.png',dpi=600)

### Normalization

In [11]:
#ccode int
min, max = data_all['ccode'].min(), data_all['ccode'].max()
data_all['ccode'] = (data_all['ccode']-min)/(max-min)

#entry, regular 1...
data_all['entry'] = data_all['entry'].replace({'Regular': 1,'Irregular': 2, 'Foreign Imposition': 3})
data_all['entry']=data_all['entry'].astype(float, errors='raise')
min, max = data_all['entry'].min(), data_all['entry'].max()
data_all['entry']= (data_all['entry']-min)/(max-min)

#exit, regular 1...
data_all['exit'] = data_all['exit'].replace({'Regular': 1, 'Irregular': 2, 'Foreign': 3,'Natural Death': 4, 
                                    'Retired Due to Ill Health': 5, 'Suicide': 6}) 
data_all['exit']=data_all['exit'].astype(float, errors='raise')
min, max = data_all['exit'].min(), data_all['exit'].max()
data_all['exit']= (data_all['exit']-min)/(max-min)

#exitcode, regular 1...
data_all['exitcode'] = data_all['exitcode'].replace({'Assassination by Unsupported Individual': 1,'Irregular, Other': 2,
                                             'Popular Protest, with Foreign Support': 3, 
                                             'Popular Protest, without Foreign Support': 4,
                                             'Regular': 5, 'Removed by Military, with Foreign Support': 6,
                                             'Removed by Military, without Foreign Support': 7, 
                                             'Removed by Other Government Actors, with Foreign Support': 8,
                                             'Removed by Other Government Actors, without Foreign Support': 9,
                                             'Removed by Rebels, with Foreign Support': 10,
                                             'Removed by Rebels, without Foreign Support': 11,
                                             'Removed in Military Power Struggle Short of Coup': 12,
                                             'Removed through Threat of Foreign Force': 13, 'Unknown':14}) 
data_all['exitcode']=data_all['exitcode'].astype(float, errors='raise')
min, max = data_all['exitcode'].min(), data_all['exitcode'].max()
data_all['exitcode']= (data_all['exitcode']-min)/(max-min)

In [None]:
#posttenurefate
data_all['posttenurefate'] = data_all['posttenurefate'].replace({'Death': 1, 'Exile': 2, 'Imprisonment': 3, 
                                     'Missing: Natural Death within Six Months of Losing Office': 4,
                                     'Missing: No Information Found': 5, 'OK': 6, 'Suicide': 7}) 
data_all['posttenurefate']=data_all['posttenurefate'].astype(float, errors='raise')
min, max = data_all['posttenurefate'].min(), data_all['posttenurefate'].max()
data_all['posttenurefate']= (data_all['posttenurefate']-min)/(max-min)

#prevtimesinoffice
min, max = data_all['prevtimesinoffice'].min(), data_all['prevtimesinoffice'].max()
data_all['prevtimesinoffice']= (data_all['prevtimesinoffice']-min)/(max-min)

#yrborn
min, max = data_all['yrborn'].min(), data_all['yrborn'].max()
data_all['yrborn']= (data_all['yrborn']-min)/(max-min)

#yrdied
min, max = data_all['yrdied'].min(), data_all['yrdied'].max()
data_all['yrdied']= (data_all['yrdied']-min)/(max-min)

#numentry
min, max = data_all['numentry'].min(), data_all['numentry'].max()
data_all['numentry']= (data_all['numentry']-min)/(max-min)

#numexit
min, max = data_all['numexit'].min(), data_all['numexit'].max()
data_all['numexit']= (data_all['numexit']-min)/(max-min)

#delete numexitcode, numposttenurefate
data_all.drop([  
'numexitcode',
'numposttenurefate',
    'fties',
    'ftcur',
    'Polity datasets IV category'
], axis=1, inplace=True)

#yrbegin
min, max = data_all['yrbegin'].min(), data_all['yrbegin'].max()
data_all['yrbegin']= (data_all['yrbegin']-min)/(max-min)

#yrend
min, max = data_all['yrend'].min(), data_all['yrend'].max()
data_all['yrend']= (data_all['yrend']-min)/(max-min)

#age
min, max = data_all['age'].min(), data_all['age'].max()
data_all['age']= (data_all['age']-min)/(max-min)

#tenure
min, max = data_all['tenure'].min(), data_all['tenure'].max()
data_all['tenure']= (data_all['tenure']-min)/(max-min)

#end_gdppc
min, max = data_all['end_gdppc'].min(), data_all['end_gdppc'].max()
data_all['end_gdppc']= (data_all['end_gdppc']-min)/(max-min)

#pop_x
min, max = data_all['pop_x'].min(), data_all['pop_x'].max()
data_all['pop_x']= (data_all['pop_x']-min)/(max-min)

#begin_gdppc
min, max = data_all['begin_gdppc'].min(), data_all['begin_gdppc'].max()
data_all['begin_gdppc']= (data_all['begin_gdppc']-min)/(max-min)

#pop_y
min, max = data_all['pop_y'].min(), data_all['pop_y'].max()
data_all['pop_y']= (data_all['pop_y']-min)/(max-min)

data_all.dropna(subset = ['growth_rate'], inplace=True)
data_all.shape

# Look at the distribution of the data_all put into the model
ax = sns.histplot(data_all['growth_rate'])

mean = data_all['growth_rate'].mean()
std = data_all['growth_rate'].std()
print(mean)
print(std)

In [None]:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

# Generate some data for this
# demonstration.
dta = data['growth_rate']

# Fit a normal distribution to
# the data:
# mean and standard deviation
mu, std = norm.fit(dta)

# Plot the histogram.
plt.hist(dta, bins=100, density=True, alpha=0.6)

# Plot the PDF.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)

plt.plot(x, p, 'k', linewidth=2)
# title = "Fit Values: {:.2f} and {:.2f}".format(mu, std)
# plt.title(title)

plt.show()


In [None]:
data['growth_rate'] = data['growth_rate']*10

ax = sns.histplot(data.growth_rate,bins=[-0.4, -0.05, 0, 0.05, 0.15, 0.5], color ='pink', shrink=.9)

In [None]:
x = data['growth_rate']
y = np.histogram(data['growth_rate'])
print(x)
print(y)

plt.hist(data['growth_rate'], bins=100, density=True, alpha=0.5, histtype='stepfilled' )

In [None]:
# group growth_rate into seven groups according to the central limit theorem
mu, std = norm.fit(data['growth_rate'])
conditions = [
    (data['growth_rate'] < -mu-3*std),
    (data['growth_rate'] >= -mu-3*std) & (data['growth_rate'] < -mu-2*std),
    (data['growth_rate'] >= -mu-2*std) & (data['growth_rate'] < -mu-std),
    (data['growth_rate'] >= -mu-std) & (data['growth_rate'] < mu+std),
    (data['growth_rate'] >= mu+std) & (data['growth_rate'] < mu+2*std),
    (data['growth_rate'] >= mu+2*std) & (data['growth_rate'] < mu+3*std),
    (data['growth_rate'] >= mu+3*std)
]

values = [7, 6, 5, 4, 3, 2, 1]

data['growth_rate_normgroup'] = np.select(conditions, values)

In [None]:
# group growth_rate into seven groups according to min and max
min, max = data['growth_rate'].min(), data['growth_rate'].max()
conditions = [
    (data['growth_rate'] < min+1/7*(max-min)),
    (data['growth_rate'] >= min+1/7*(max-min)) & (data['growth_rate'] < min+2/7*(max-min)),
    (data['growth_rate'] >= min+2/7*(max-min)) & (data['growth_rate'] < min+3/7*(max-min)),
    (data['growth_rate'] >= min+3/7*(max-min)) & (data['growth_rate'] < min+4/7*(max-min)),
    (data['growth_rate'] >= min+4/7*(max-min)) & (data['growth_rate'] < min+5/7*(max-min)),
    (data['growth_rate'] >= min+5/7*(max-min)) & (data['growth_rate'] < min+6/7*(max-min)),
    (data['growth_rate'] >= min+6/7*(max-min))
]
    
values = [7, 6, 5, 4, 3, 2, 1]

data['growth_rate_avggrp'] = np.select(conditions, values)

In [None]:
# group growth_rate in desc
sorted_data = data.sort_values("growth_rate", ascending=False)

In [None]:
print(sorted_data)

In [None]:
# Group leaders into seven groups each group with approximately same leaders, the rest is placed in the last group
groups = range(1,8)
data['growth_rate_avggrp'] = np.concatenate(
                [np.repeat(groups,int(len(data)/7)), 
                 np.repeat([7],int(len(data)%7))])

In [None]:
ax = sns.histplot(data['growth_rate_avggrp'])

In [None]:
# print(data['growth_rate_avggrp'])

In [None]:
data.describe()

In [None]:
ax = sns.histplot(data['age'])

In [None]:
ax = sns.histplot(data['tenure'])

In [None]:
ax = sns.histplot(data_com3['gender'])

In [None]:
ax = sns.histplot(data['normal_tenure'])

In [None]:
#drop growth_rate
data.drop(['growth_rate'], axis=1, inplace=True)

In [None]:
#delete growth_rate
data.drop([  
'growth_rate'
], axis=1, inplace=True)

In [None]:
#data.to_csv(r'/Users/zhaoshuai/Desktop/DA/Data/Data-2016/data_group_nd.csv', index = False)
data.to_csv(r'/Users/zhaoshuai/Desktop/DA/Data/Data-2016/data_norm_avggrp.csv', index = False)
# tranfert the .py file to excel/csv

In [None]:
#test automl model
#install autogluon
!pip install autogluon
!pip install "mxnet<2.0.0"

In [None]:
#train data
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split #splitting the dataset

In [None]:
import numpy as np

In [None]:
train_data = TabularDataset('/Users/zhaoshuai/Desktop/DA/Data/Data-2016/data_normgroup.csv')

In [None]:
train_data = TabularDataset('/Users/zhaoshuai/Desktop/DA/Data/Data-2016/data_norm_avggrp.csv')

In [None]:
train_data = TabularDataset('/Users/zhaoshuai/Desktop/DA/Data/Data-2016/data_allvar.csv')

In [None]:
type(train_data)

In [None]:
id, label = 'leader', 'growth_rate'
#data cleaning
#large_val_cols = ['age','tenure']
#for c in large_val_cols:
#    train_data[c] = np.log(train_data[c]+1)

In [None]:
#change label into growth_rate group
id, label = 'leader', 'growth_rate_arggrp'
#data cleaning
#large_val_cols = ['age','tenure']
#for c in large_val_cols:
#    train_data[c] = np.log(train_data[c]+1)

In [None]:
train_data.columns

In [None]:
train_data.head()

In [None]:
train_data.describe()

In [None]:
#split train data and test data
df_train,df_test=train_test_split(train_data, test_size=0.33, random_state=1)
df_train.shape, df_test.shape

In [None]:
predictor = TabularPredictor(label=label).fit(
    df_train.drop(columns=[id, 'leader']))

In [None]:
predictor.leaderboard(df_test.drop(columns=[id, 'leader']), silent=True)

In [None]:
predictor.feature_importance(df_test, subsample_size=None)

In [None]:
predictor.evaluate(df_test.drop(columns=[id, 'leader']))

In [None]:
#better model with GPU, data contain large amout of text, use multimodal to transform characteristics,
#multi-model fusion
#hyperparameters='multimodal',
#num_stack_levels=1, num_bag_folds=5

<font color='red' size=0>7.4 Group the data by Polity datasets IV category</font> 

In [None]:
#Group by the three types, and count the number of each group of the three
df1.groupby(['Polity datasets IV category']).count()

<font color='red' size=0>7.5 Draw a figure to see the difference between each group in Polity datasets</font> 

In [None]:
#Use seaborn package to visualize the difference between the three types
import seaborn as sns
ax= sns.histplot(df1['Polity datasets IV category'])

<font color='red' size=0>7.6 Group the data by countries</font> 

In [None]:
df1.groupby(['Country-id']).mean()

In [None]:
print(df1)

<font color='blue' size=2> Deal with the dataset with the name df2(Countries vs gdppc)</font> 

<font color='red' size= 2>8. Read the data about gdppc from excel with the name df2</font> 

In [None]:
#Import the data about countries with yearly Real GDP per capita in 2011$
df2 = pd.read_excel (r'D:/Zhao Shuai/phd/lingnan/literature/Data/mpd2020.xlsx', sheet_name='Full data')

<font color='red' size=0>8.1 Summary about df2</font> 

In [None]:
df2.dtypes

In [None]:
#See the overall head of df2
df2.head

In [None]:
#Shows the rows count and the types
df2.info()

In [None]:
df2.describe()

<font color='red' size=0>8.2 Choose the data after 1945</font>

In [None]:
df2.drop( df2[ df2['year'] < 1945 ].index , inplace=True) 
print(df2)

<font color='red' size=0>8.3 Delete the NaN and  Null value and abnormal data</font> 

In [None]:
#Check all the data
#Delete the NaN and  Null value and abnormal data
df2.describe(include = 'all')

In [None]:
#let’s see what is the proportion of NaN values using a visualization library called seaborn.
#In this heatmap, you can see in yellow (depending on the cmap you are using, but with mine it’s yellow) the NaN values in each column.
sns.heatmap(df2.isnull(), cmap='viridis')

In [None]:
#We will just keep the rows with a value in the gdppc column
df2 = df2.dropna(subset=['gdppc'])
print(df2)

<font color='red' size=0>8.4 Check the data</font> 

In [None]:
print(df2)

In [None]:
#Check whether gdppc has abnormal numbers or not
df2.groupby(['gdppc']).mean()

In [None]:
#Check whether country has abnormal numbers or not
df2.groupby(['country']).mean()

<font color='red' size=0>8.5 Delete the 0 in gdppc</font> 

In [None]:
#Deleter the 0 in gdppc
#df2.loc[(df2!='0.0000').any(axis='colums')]
df2= df2[df2['gdppc'] != 0]
print(df2)

In [None]:
#Check if I delete the gdppc with 0
df2.groupby('gdppc').mean()

<font color='red' size=0>8.6 Group by country</font> 

In [None]:
#Group by country, and save the dataframe with name df2_cty, gdppc is the average value in these years.
df2_cty = df2.groupby('country').mean()
print(df2_cty)

In [None]:
#Check the type for each column
df2_cty.dtypes

In [None]:
#Because I have more than 100 countries, so I choose the first 30 to check
df2_cty.head(30)

In [None]:
#Visualize the first 30 countries by gdppc
df2_cty.head(30).plot(kind='barh', y= 'gdppc')
#plt.savefig('medals.svg')

<font color='red' size=0>8.7 Draw a boxplot to show the gdppc for different countries</font>

In [None]:
#Show the gdppc of each country

#plt.savefig('medals.svg')

In [None]:
#Data visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
display.set_matplotlib_formats('svg')

<font color='red' size=0>8.8 Draw a line to show the changes of U.S. gdppc over time</font>

In [None]:
#Choose a single country, for example, choose the subset of United States
df2_usa = df2[df2.country == 'United States']
print(df2_usa)

In [None]:
#The change in per capita GDP of a country over time
plt.figure(figsize=(10,5))
sns.relplot(x="year", y="gdppc", kind="line", data=df2_usa)

In [None]:
# Create figure and plot space, what's wrong?
fig, ax = plt.subplots(figsize=(10, 6))

# Add x-axis and y-axis
ax.plot(df2['year'],
        df2['gdppc'],
        df2['country'] == 'United States', 
        color='blue',)

# Set title and labels for axes
ax.set(xlabel="Year",
       ylabel="Real GDP per capita in 2011$",
       title="Gdppc in different year")

plt.show()

<font color='red' size=0>8.9 Choose the top 20 countries with more data in gdppc</font>

In [None]:
df2['country'].value_counts()[0:20]

<font color='red' size=0>8.10 Compare the gdppc between China and United States</font>

In [None]:
#Choose China and United States in country
compare=df2['country'].isin(['China','United States'])
#sns.displot(pd.DataFrame({'gdppc': df2[compare]['year'],'country': df2[compare]['country']}),x= 'gdppc', hue='country', kind='kde');

#sns.set(style='darkgrid',)
 
# Draw line plot of size and total_bill with parameters and hue "day"
sns.lineplot(x = "year", y = "gdppc", data = df2[compare])
 
plt.title("Line", fontsize = 15)
plt.xlabel("Year", fontsize = 15)
plt.ylabel("gdppc", fontsize = 15)
plt.show()

In [None]:
#Compare their gdppc for max, min, mean, etc
ax = sns.boxplot (x = 'country', y = 'gdppc', data = df2[compare], fliersize=0)
ax.set_ylim([0, 45000]);

<font color='blue'  size=2>Analyze data across diferent py tables</font>

<font color='red'  size=2>See the relationship between national leaders' gender, tenure and gdppc,TBC</font>

In [None]:
#ax = plt.subplots (figsize = (6,6))
#columns = ['']
#Covariance to be continued

<font color='red'  size=2>I wanted to use a map to show the tenure and gender of the leaders of various countries, but it didn't happen.TBC</font> 

In [None]:
import folium

In [None]:
#unction to convert to alpah2 country codes and continents
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(col)
    except:
        cn_a2_code = 'Unknown' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Unknown' 
    return (cn_a2_code, cn_continent)
get_continent(dta2016.idacr)

<font color='red'  size=2>Get the lantitude and longitude for each country,TBC</font>

In [None]:
#installation
!pip install pycountry-convert
#function to convert to alpah2 country codes and continents
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(df2)
    except:
        cn_a2_code = 'Unknown' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Unknown' 
    return (cn_a2_code, cn_continent)


In [None]:
print(df2)

In [None]:
!pip install geopy

In [None]:
#function to get longitude and latitude data from country name
from geopy.geocoders import Nominatim
geolocator = Nominatim()
def geolocate(country):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return np.nan
    df2["Longitude"] = longitude
df2["Latitude"] = latitude

In [None]:
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# declare an empty list to store
# latitude and longitude of values 
# of cuntry column
longitude = []
latitude = []
   
# function to find the coordinate
# of a given city 
def findGeocode(country):
       
    # try and catch is used to overcome
    # the exception thrown by geolocator
    # using geocodertimedout  
    try:
          
        # Specify the user_agent as your
        # app name it should not be none
        geolocator = Nominatim(user_agent="your_app_name")
          
        return geolocator.geocode(country)
      
    except GeocoderTimedOut:
          
        return findGeocode(country)    
  
# each value from country column
# will be fetched and sent to
# function find_geocode   
for i in (df2["country"]):
      
    if findGeocode(i) != None:
           
        loc = findGeocode(i)
          
        # coordinates returned from 
        # function is stored into
        # two separate list
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
       
    # if coordinate for a city not
    # found, insert "NaN" indicating 
    # missing value 
    else:
        latitude.append(np.nan)
        longitude.append(np.nan)
       
df2["Longitude"] = longitude
df2["Latitude"] = latitude

df2

In [None]:
!pip install geolocator
import geolocator
loc = geolocator.geocode(df2.country, exactly_one=False)

In [None]:
!pip install autogluon