# Test file for Ukrainian Asylym Countries
---
Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../../data/data_final.csv')

In [3]:
df.head()

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,GDP_annual_change,...,GNI growth (annual %),"International tourism, expenditures (current US$)","International tourism, receipts (current US$)",Military expenditure (current USD),Population growth (annual %),Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Strength of legal rights index (0=weak to 12=strong),"Unemployment, total (% of total labor force) (modeled ILO estimate)",Net official flows from UN agencies: Total
0,2016,El Salvador,41.0,0.0,0.0,0.0,9800.0,41.0,9841.0,2.545926,...,1.962239,451000000.0,1161000000.0,247600000.0,0.489164,9.9,41.0,9.0,4.42,6561136.0
1,2018,El Salvador,44.0,16.0,71501.0,0.0,4700.0,60.0,76261.0,2.432288,...,2.420301,490000000.0,1370000000.0,294610000.0,0.509273,8.6,44.0,9.0,4.01,6498476.0
2,2016,Mexico,6178.0,2636.0,0.0,13.0,0.0,8814.0,8827.0,2.630532,...,2.526165,12823000000.0,20619000000.0,5336876000.0,1.20326,5.8,6178.0,10.0,3.86,4939827.0
3,2016,Colombia,241.0,368.0,7410816.0,11.0,0.0,609.0,7411436.0,2.087383,...,2.422212,4891000000.0,5631000000.0,8675981000.0,1.36765,6.8,241.0,11.0,8.69,12837930.0
4,2018,Colombia,294.0,2851.0,7816473.0,11.0,0.0,3145.0,7819629.0,2.564324,...,1.748331,5531000000.0,6655000000.0,10134720000.0,1.524236,7.9,294.0,11.0,9.11,27781000.0


In [4]:
cols_to_mean = ['Adjusted savings: net national savings (current US$)',
       'Adjusted savings: particulate emission damage (current US$)',
       'Air transport, passengers carried',
       'Current health expenditure (% of GDP)',
       'Current health expenditure per capita (current US$)',
       'Death rate, crude (per 1,000 people)',
       'Ease of doing business score (0 = lowest performance to 100 = best performance)',
       'GNI growth (annual %)',
       'International tourism, receipts (current US$)',
       'Military expenditure (current USD)', 'Population growth (annual %)',
       'Strength of legal rights index (0=weak to 12=strong)',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)']

cols_to_median = [
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Domestic general government health expenditure per capita (current US$)',
       'Domestic private health expenditure per capita (current US$)',
       'Fixed broadband subscriptions (per 100 people)',
       'Fixed telephone subscriptions (per 100 people)',
       'International tourism, expenditures (current US$)',
       'Prevalence of undernourishment (% of population)',
       'Refugee population by country or territory of asylum',
       'Net official flows from UN agencies: Total']

## Read in Ukraine Refugee Numbers Data
---
Data obtained from UNHCR ([*source*](https://data2.unhcr.org/en/situations/ukraine)). Using the original data frame for countries with years 2016-2019, we imputed data for 2022. Looking at distributions of the data, median was imputed for data that was skewed and mean was imputed for data that was approximately normally distributed.

In [5]:
dfu = pd.read_csv('../../data/Ukraine_Refugee_Numbers.csv')

In [6]:
preddf = pd.DataFrame()

In [7]:
preddf['Year'] = dfu['Year']
preddf['Country'] = dfu['Country']
preddf['Refugees under UNHCR\'s mandate']=dfu['Refugees under UNHCR\'s mandate']
preddf['Asylum-seekers']=dfu['Asylum-seekers']
preddf['IDPs of concern to UNHCR']=dfu['IDPs of concern to UNHCR']
preddf['Stateless persons']=dfu['Stateless persons']
preddf['Others of concern']=dfu['Others of concern']
preddf['Ref and Asyl']=dfu['Ref and Asyl']
preddf['SUM REFUGEE']=dfu['SUM REFUGEE']

## Merging Data from original data frame by country and Imputing Mean or Median

In [8]:
for col in cols_to_mean:
    
    preddf = preddf.merge( df.groupby('Country')[[col]].mean(), how='left', on='Country')
    
    
preddf.head()

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,Adjusted savings: net national savings (current US$),...,Current health expenditure (% of GDP),Current health expenditure per capita (current US$),"Death rate, crude (per 1,000 people)",Ease of doing business score (0 = lowest performance to 100 = best performance),GNI growth (annual %),"International tourism, receipts (current US$)",Military expenditure (current USD),Population growth (annual %),Strength of legal rights index (0=weak to 12=strong),"Unemployment, total (% of total labor force) (modeled ILO estimate)"
0,2022,Poland,2083854,0,0,0,0,2083854,2083854,44161680000.0,...,6.467334,928.956207,10.625,77.213015,4.37274,14282000000.0,10715420000.0,-0.013783,7.0,4.545
1,2022,Romania,535461,0,0,0,0,535461,535461,6591602000.0,...,5.364822,614.267609,13.375,72.934697,5.052264,3536500000.0,3809572000.0,-0.566494,9.0,4.7325
2,2022,Republic of Moldova,365197,0,0,0,0,365197,365197,,...,,,,,,,,,,
3,2022,Hungary,312120,0,0,0,0,312120,312120,15378010000.0,...,6.661884,1009.101074,13.3,72.680193,4.420212,8943000000.0,1648573000.0,-0.183254,9.0,4.1
4,2022,Slovakia,250036,0,0,0,0,250036,250036,4725066000.0,...,6.886357,1251.663879,9.825,75.30321,2.573039,3126000000.0,1287655000.0,0.139484,7.0,7.5225


In [9]:
for col in cols_to_median:
    
    preddf = preddf.merge( df.groupby('Country')[[col]].median(), how='left', on='Country')
    
    
preddf

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,Adjusted savings: net national savings (current US$),...,"Unemployment, total (% of total labor force) (modeled ILO estimate)","Adolescent fertility rate (births per 1,000 women ages 15-19)",Domestic general government health expenditure per capita (current US$),Domestic private health expenditure per capita (current US$),Fixed broadband subscriptions (per 100 people),Fixed telephone subscriptions (per 100 people),"International tourism, expenditures (current US$)",Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Net official flows from UN agencies: Total
0,2022,Poland,2083854,0,0,0,0,2083854,2083854,44161680000.0,...,4.545,10.3536,660.872889,282.860393,20.325415,18.771959,9790500000.0,2.5,12360.0,0.0
1,2022,Romania,535461,0,0,0,0,535461,535461,6591602000.0,...,4.7325,35.873,491.920694,129.212942,25.143476,19.277894,5532000000.0,2.5,3885.5,0.0
2,2022,Republic of Moldova,365197,0,0,0,0,365197,365197,,...,,,,,,,,,,
3,2022,Hungary,312120,0,0,0,0,312120,312120,15378010000.0,...,4.1,24.1273,697.982931,319.514078,31.055118,31.856395,3144000000.0,2.5,5710.5,0.0
4,2022,Slovakia,250036,0,0,0,0,250036,250036,4725066000.0,...,7.5225,25.8096,984.86627,259.688334,26.719789,13.591184,2705000000.0,4.3,951.5,0.0
5,2022,Russian Federation,231764,0,0,0,0,231764,231764,222750000000.0,...,5.03,20.1593,346.5118,247.828844,21.686631,21.307699,37187500000.0,2.5,101701.5,0.0
6,2022,Belarus,3765,0,0,0,0,3765,3765,5978325000.0,...,5.1025,14.1293,239.094725,102.188127,33.671118,47.551795,1126000000.0,2.5,2190.0,3236841.0


### Drop Row

Republic of Moldova does not have data, so we will remove that row from the dataset for testing.

In [10]:
preddf.drop(index = 2, inplace = True)

In [11]:
preddf.reset_index()

Unnamed: 0,index,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,...,"Unemployment, total (% of total labor force) (modeled ILO estimate)","Adolescent fertility rate (births per 1,000 women ages 15-19)",Domestic general government health expenditure per capita (current US$),Domestic private health expenditure per capita (current US$),Fixed broadband subscriptions (per 100 people),Fixed telephone subscriptions (per 100 people),"International tourism, expenditures (current US$)",Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Net official flows from UN agencies: Total
0,0,2022,Poland,2083854,0,0,0,0,2083854,2083854,...,4.545,10.3536,660.872889,282.860393,20.325415,18.771959,9790500000.0,2.5,12360.0,0.0
1,1,2022,Romania,535461,0,0,0,0,535461,535461,...,4.7325,35.873,491.920694,129.212942,25.143476,19.277894,5532000000.0,2.5,3885.5,0.0
2,3,2022,Hungary,312120,0,0,0,0,312120,312120,...,4.1,24.1273,697.982931,319.514078,31.055118,31.856395,3144000000.0,2.5,5710.5,0.0
3,4,2022,Slovakia,250036,0,0,0,0,250036,250036,...,7.5225,25.8096,984.86627,259.688334,26.719789,13.591184,2705000000.0,4.3,951.5,0.0
4,5,2022,Russian Federation,231764,0,0,0,0,231764,231764,...,5.03,20.1593,346.5118,247.828844,21.686631,21.307699,37187500000.0,2.5,101701.5,0.0
5,6,2022,Belarus,3765,0,0,0,0,3765,3765,...,5.1025,14.1293,239.094725,102.188127,33.671118,47.551795,1126000000.0,2.5,2190.0,3236841.0


## Feature Engineering

In [13]:
preddf['Intl tourism expenditures recipts and asylum seekers'] = preddf['International tourism, expenditures (current US$)']*preddf['International tourism, receipts (current US$)']* preddf['Asylum-seekers']
preddf['Intl tourism expenditures receipts and military'] = preddf['International tourism, expenditures (current US$)']*preddf['International tourism, receipts (current US$)']*preddf['Military expenditure (current USD)']
preddf['net inflows UN and refugee pop'] = preddf['Net official flows from UN agencies: Total']*preddf['Refugee population by country or territory of asylum']
preddf['SUM REFUGEE and undernourishment'] = preddf['SUM REFUGEE'] * preddf['Prevalence of undernourishment (% of population)']

In [14]:
preddf.to_csv('../../data/to_predict.csv')