In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro

In [6]:
# Load the SPI index table from the project's data directory (path is relative to this notebook).
spi_data = pd.read_csv(filepath_or_buffer="../data/SPI/SPI_index.csv")

In [7]:
# Distinct values of the "date" column, in order of first appearance.
spi_data.loc[:, "date"].unique()

array([2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009,
       2008, 2007, 2006, 2005, 2004])

## Normality test for SPI Index

In [8]:
# Shapiro-Wilk normality test on each country's non-missing SPI.INDEX values.
# A country is recorded as (name, p-value, number of observations) when its
# p-value is at or below the significance threshold.
threshold = 0.05
shapiro_failed_countries = []
for country, spi_series in spi_data.groupby(by="country")["SPI.INDEX"]:
    observed = spi_series.dropna()
    # Countries with fewer than three observations are left untested
    # (presumably because scipy's shapiro needs at least three points).
    if len(observed) >= 3:
        p_value = shapiro(observed)[1]
        if p_value <= threshold:
            shapiro_failed_countries.append((country, p_value, len(observed)))

In [9]:
# Report how many countries were flagged by the Shapiro-Wilk test.
# NOTE: the denominator is every distinct country in spi_data, including
# countries that were skipped by the test for having fewer than 3 observations.
summary_template = "{} countries out of {} countries failed the Shapiro-Wilk test for normality of the SPI data"
print(summary_template.format(len(shapiro_failed_countries), spi_data["country"].nunique()))

21 countries out of 218 countries failed the Shapiro-Wilk test for normality of the SPI data


## Statistical Performance Indicator Index grouped by country

In [10]:
spi_mean_std = dict()
for name, group in spi_data.groupby(by="country"):
    spi_mean_std[name] = (round(group["SPI.INDEX"].mean(), 2), round(group["SPI.INDEX"].std(), 2), 
                         group["income"].iloc[0], group["iso3c"].iloc[0])

In [11]:
# Flatten the per-country summary dict into one row per country.
summary_columns = ["country", "spi_mean", "spi_std", "income", "iso3c"]
summary_rows = [
    [country, stats[0], stats[1], stats[2], stats[3]]
    for country, stats in spi_mean_std.items()
]
spi_mean_std_df = pd.DataFrame(data=summary_rows, columns=summary_columns)

# x = np.arange(len(spi_mean_std))
# plt.figure(figsize=(20,20))
# plt.errorbar(x, mean_data, yerr=std_data, fmt=" ")

## Statistical Performance Indicator Index grouped by income

In [12]:
# Per-income-group summary: (mean SPI, std SPI), both rounded to 2 decimals,
# computed over every row of spi_data in that income group.
spi_mean_std_income_group = {
    income_level: (
        round(rows["SPI.INDEX"].mean(), 2),
        round(rows["SPI.INDEX"].std(), 2),
    )
    for income_level, rows in spi_data.groupby(by="income")
}

In [13]:
# Show the (mean, std) tuple stored for each income group.
spi_mean_std_income_group

{'High income': (76.29, 13.54),
 'Low income': (46.21, 11.88),
 'Lower middle income': (54.71, 12.88),
 'Upper middle income': (59.44, 16.87)}

## Categorising countries into 3 categories based on income group and SPI mean.

### Categories:
#### 1. High Income countries
#### 2. Countries with spi_mean greater than or equal to the median spi_mean across low and middle income countries.
#### 3. Countries with spi_mean lower than the median spi_mean across low and middle income countries (countries with no SPI data also fall into this category).

In [14]:
def get_spi_category(row, threshold_for_cat, missing_category=3):
    """Assign a country row to one of the three SPI categories.

    Parameters
    ----------
    row : object with ``income`` and ``spi_mean`` attributes
        Typically one row of the per-country summary frame, as passed by
        ``DataFrame.apply(..., axis=1)``.
    threshold_for_cat : float
        Cut-off on ``spi_mean`` that separates category 2 from category 3.
    missing_category : optional
        Value returned for a non-high-income row whose ``spi_mean`` is missing
        (NaN). Defaults to 3, which is what a plain ``>=`` comparison against
        NaN produced before this case was handled explicitly.

    Returns
    -------
    1 for "High income" rows (regardless of ``spi_mean``), 2 when ``spi_mean``
    is at or above the threshold, 3 when it is below, and ``missing_category``
    when ``spi_mean`` is missing.
    """
    if row.income == "High income":
        return 1

    # NaN compares False against any threshold, so without this guard a country
    # with no SPI data would silently be labelled "below the median".
    if pd.isna(row.spi_mean):
        return missing_category

    return 2 if row.spi_mean >= threshold_for_cat else 3

In [15]:
# The cut-off is the median spi_mean over every country that is not "High income".
non_high_income = spi_mean_std_df["income"] != "High income"
threshold_spi = spi_mean_std_df.loc[non_high_income, "spi_mean"].median()

# Label each country row with its category (1, 2 or 3) using that cut-off.
spi_mean_std_df["spi_category"] = spi_mean_std_df.apply(
    lambda row: get_spi_category(row, threshold_spi), axis=1
)

### Method 1: Use the average value of SPI for each country (over the period for which SPI is provided).

In [72]:
# spi_mean_std_df.to_csv("./data/SPI/SPI_index_mean.csv", index=False)

### Method 2: Compute the distribution of SPI for each country and then use it to impute values for years before 2004 and after 2019, since SPI is only available for 2004–2019.

In [16]:
# Display the per-country summary frame, including the spi_category column.
# (The previous expression indexed a non-existent "spi_ca" column and raised KeyError.)
spi_mean_std_df

Unnamed: 0,country,spi_mean,spi_std,income,iso3c,spi_category
0,Afghanistan,44.85,6.12,Low income,AFG,3
1,Albania,71.14,6.43,Upper middle income,ALB,2
2,Algeria,48.79,6.27,Lower middle income,DZA,3
3,American Samoa,,,Upper middle income,ASM,3
4,Andorra,,,High income,AND,1
...,...,...,...,...,...,...
213,Virgin Islands (U.S.),,,High income,VIR,1
214,West Bank and Gaza,67.16,5.47,Lower middle income,PSE,2
215,"Yemen, Rep.",39.09,2.40,Low income,YEM,3
216,Zambia,57.93,2.36,Lower middle income,ZMB,2


In [25]:
# For each income group, print how many countries fall into each SPI category.
# Grouping by the plain string (not a one-element list) keeps `income` a scalar
# label; a list key makes newer pandas yield 1-tuples such as ('High income',).
for income, group in spi_mean_std_df.groupby(by="income"):
    print(income)
    for category in range(1, 4):
        print(category, len(group[group["spi_category"] == category]))

    print()

High income
1 83
2 0
3 0

Low income
1 0
2 5
3 24

Lower middle income
1 0
2 26
3 24

Upper middle income
1 0
2 31
3 25



In [26]:
# Names of all countries assigned to category 1, as a plain Python list.
is_category_one = spi_mean_std_df["spi_category"] == 1
spi_mean_std_df.loc[is_category_one, "country"].unique().tolist()

['Andorra',
 'Antigua and Barbuda',
 'Aruba',
 'Australia',
 'Austria',
 'Bahamas, The',
 'Bahrain',
 'Barbados',
 'Belgium',
 'Bermuda',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Canada',
 'Cayman Islands',
 'Channel Islands',
 'Chile',
 'Croatia',
 'Curacao',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Estonia',
 'Faroe Islands',
 'Finland',
 'France',
 'French Polynesia',
 'Germany',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Guam',
 'Hong Kong SAR, China',
 'Hungary',
 'Iceland',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Japan',
 'Korea, Rep.',
 'Kuwait',
 'Latvia',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macao SAR, China',
 'Malta',
 'Mauritius',
 'Monaco',
 'Nauru',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Palau',
 'Panama',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Romania',
 'San Marino',
 'Saudi Arabia',
 'Seychelles',
 'Singapore',
 'Sint Maarten (Dutch part)',
 'Slovak Republic',
 