# Assignment 2
Gender peace agreement

# Setup

In [1]:
# Necessary import
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pycountry
from pycountry_convert import country_name_to_country_alpha3

In [2]:
def read_file(file_name : str):
    """Read a data file into a DataFrame, parsing the 'Dat' column as dates.

    Works for both Excel and CSV input; the format is chosen from the file
    extension.

    Parameters
    ----------
    file_name : str
        Path of the file to load. ``.xlsx`` / ``.xls`` (case-insensitive) are
        read with ``pd.read_excel``; any other extension is assumed to be CSV.

    Returns
    -------
    pandas.DataFrame
        The loaded table. For CSV input, missing values in numeric columns
        are replaced by 0.
    """
    # Lower-case the extension so that e.g. "DATA.XLSX" is still read as Excel.
    extension = os.path.splitext(file_name)[1].lower()

    if extension in (".xlsx", ".xls"):
        df = pd.read_excel(file_name, parse_dates=['Dat'])
    else:  # Assuming "csv" for every other extension.
        df = pd.read_csv(file_name, engine="python", parse_dates=['Dat'])
        # fillna returns a new frame, so the result has to be kept (calling it
        # without assignment is a no-op). Only numeric columns are filled, so
        # the parsed 'Dat' column keeps its datetime dtype.
        numeric_cols = df.select_dtypes(include="number").columns
        df = df.fillna({col: 0 for col in numeric_cols})

    return df

The PA-X data files are provided in xlsx format. pandas reads xlsx files more slowly than csv files, so for a large dataset it is a good idea to convert xlsx to csv before loading it. In this case the data files are small, so reading xlsx directly makes no noticeable difference in performance. Using xlsx also helps to avoid the presence of NaN in our dataframe.

In [3]:
# Load the PA-X gender (WGG) data first, then the entire PA-X data,
# using the same reader for both files.
gender_data, full_data = map(
    read_file,
    ('pax_wgg_all_agreements_data.xlsx', 'pax_all_agreements_data.xlsx'),
)

# Preview the first rows to make sure the data was read properly.
gender_data.head()

Unnamed: 0,Con,Contp,PP,PPName,Reg,AgtId,Ver,Agt,Dat,Status,...,WggRehab,WggEdu,WggHea,WggRepro,WggDevOth,WggImpl,WggImplRole,WggImplSign,WggImplOth,WggOth
0,Afghanistan,Government/territory,2,Afghanistan: 2000s Post-intervention process,Europe and Eurasia,2232,3,Resolution of Intra Afghan Peace Conference in...,2019-07-08,Multiparty signed/agreed,...,0,0,0,0,0,0,0,0,0,0
1,Afghanistan,Government,2,Afghanistan: 2000s Post-intervention process,Europe and Eurasia,1739,2,Agreement between the Islamic Republic of Afgh...,2016-09-22,Multiparty signed/agreed,...,0,0,0,0,0,0,0,0,0,0
2,Afghanistan,Government,2,Afghanistan: 2000s Post-intervention process,Europe and Eurasia,1923,2,Agreement between the two campaign teams regar...,2014-09-21,Multiparty signed/agreed,...,0,0,0,0,0,0,0,0,0,0
3,Afghanistan,Government,2,Afghanistan: 2000s Post-intervention process,Europe and Eurasia,864,1,Tokyo Declaration Partnership for Self-Relianc...,2012-07-08,Multiparty signed/agreed,...,1,1,1,0,0,0,0,0,0,0
4,Afghanistan,Government,2,Afghanistan: 2000s Post-intervention process,Europe and Eurasia,848,1,Conclusions of the Conference on Afghanistan a...,2011-12-05,Multiparty signed/agreed,...,0,1,0,0,0,0,0,0,0,0


In [6]:
# Count the WGG agreements for each distinct value of the 'Con' column.
grouped_df = gender_data.groupby('Con').size().reset_index(name='counts')

# Country names and their agreement counts, as parallel lists.
country_name = list(grouped_df['Con'])
country_count = list(grouped_df['counts'])

ISO_names = []
ISO_count = []
# 'Con' values that cannot be mapped to an ISO alpha-3 code; these are left
# off the map, but kept here so the omission is visible instead of silent.
skipped_names = []

for name, count in zip(country_name, country_count):
    # If there is a '/' in the value, several countries are listed and only
    # the first one is used. This keeps the code simple; countries sharing an
    # agreement are next to each other, so the effect on the plotted
    # distribution is minimal.
    first_country = str(name).split('/')[0].strip()
    try:
        iso_code = country_name_to_country_alpha3(first_country)
    except KeyError:
        # Some values in 'Con' are not recognised as a country name, so no
        # ISO code exists for them.
        skipped_names.append(name)
        continue
    ISO_names.append(iso_code)
    ISO_count.append(count)

if skipped_names:
    print(f"Left off the map ({len(skipped_names)} 'Con' values without an ISO code): {skipped_names}")

world_col_names = ['country', 'count']
world_df = pd.DataFrame({'country': ISO_names, 'count': ISO_count}, columns=world_col_names)
# Several 'Con' values can resolve to the same ISO code (e.g. 'A' and 'A/B'),
# so their counts are merged into one row per country.
world_df = world_df.groupby('country').sum().reset_index()

fig = px.scatter_geo(world_df, locations="country", size="count")
fig.update_layout(title_text="Distribution of WGG agreements on world map")
fig.show()

In [35]:
def year_count(df):
    """Count how many rows of ``df`` fall in each calendar year of 'Dat'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Dat' column whose values expose a ``.year`` attribute
        (e.g. parsed timestamps).

    Returns
    -------
    tuple[list, set]
        ``(counts, years)``: ``years`` is the set of distinct years and
        ``counts`` lists the number of rows per year, in the iteration order
        of ``years`` (so ``zip(years, counts)`` pairs them correctly).
        The set is not sorted.
    """
    # Local import keeps the function self-contained within its cell.
    from collections import Counter

    all_years = list(df['Dat'].apply(lambda x: x.year))
    year_set = set(all_years)
    # One pass over the data instead of one list.count() scan per year.
    per_year = Counter(all_years)

    counts = [per_year[year] for year in year_set]
    return counts, year_set
    

In [39]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
import time

wgg_count, wgg_set = year_count(gender_data)
full_count, full_set = year_count(full_data)

# Each count list is ordered like its own year set, and the two datasets need
# not cover the same years. Map year -> count for both, then align them on one
# shared, sorted x-axis so every bar sits under the year it belongs to.
wgg_by_year = dict(zip(wgg_set, wgg_count))
full_by_year = dict(zip(full_set, full_count))
all_years = sorted(set(wgg_by_year) | set(full_by_year))

c = (
    Bar()
    .add_xaxis(all_years)
    # A year missing from one dataset is plotted as 0 for that series.
    .add_yaxis("WGG count", [wgg_by_year.get(year, 0) for year in all_years])
    .add_yaxis("PA-X count", [full_by_year.get(year, 0) for year in all_years])
    .set_global_opts(
        title_opts=opts.TitleOpts(title=""),
        datazoom_opts=opts.DataZoomOpts(),
    )
    # render() writes the chart to an HTML file; c holds its return value.
    .render("bar_datazoom_slider.html")
)