In [402]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Asylum Seeker dataset for EDA, Classification, Regression and Cluster Analysis

In [403]:
asylum_seekers = pd.read_csv('data/asylum_seekers.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [404]:
asylum_seekers.head()

Unnamed: 0,Year,Country / territory of asylum/residence,Origin,RSD procedure type / level,Tota pending start-year,of which UNHCR-assisted(start-year),Applied during year,decisions_recognized,decisions_other,Rejected,Otherwise closed,Total decisions,Total pending end-year,of which UNHCR-assisted(end-year)
0,2000,Zimbabwe,Afghanistan,G / FI,0,0,5,5,0,0,0,5.0,0,0
1,2000,South Africa,Afghanistan,G / FI,8,1,0,0,0,0,0,,8,0
2,2000,Uzbekistan,Afghanistan,U / FI,265,265,2156,747,0,112,327,1186.0,1235,1235
3,2000,United States of America,Afghanistan,G / EO,196,0,225,151,0,31,68,250.0,171,0
4,2000,United States of America,Afghanistan,G / IN,193,0,218,182,0,51,40,273.0,150,0


# Preprocessing

### 1. Renaming Columns

In [382]:
asylum_seekers.columns = ['year', 
          'country_of_residence', 
          'country_of_origin', 
          'rsd_type', 
          'total_pending_at_year_start', 
          'total_pending_year_start_UNHCR_assisted', 
          'total_applied_during_year',
          'decisions_recognized',
          'decisions_other', 
          'total_rejected',
          'decisions_closed',
          'total_decisions',
          'total_pending_at_year_end', 
          'total_pending_year_end_UNHCR_assisted']

In [383]:
asylum_seekers = asylum_seekers.drop(['decisions_other', 'total_decisions'], axis = 1)

In [384]:
asylum_seekers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129720 entries, 0 to 129719
Data columns (total 12 columns):
 #   Column                                   Non-Null Count   Dtype 
---  ------                                   --------------   ----- 
 0   year                                     129720 non-null  int64 
 1   country_of_residence                     129720 non-null  object
 2   country_of_origin                        129720 non-null  object
 3   rsd_type                                 129719 non-null  object
 4   total_pending_at_year_start              124578 non-null  object
 5   total_pending_year_start_UNHCR_assisted  118962 non-null  object
 6   total_applied_during_year                126222 non-null  object
 7   decisions_recognized                     119886 non-null  object
 8   total_rejected                           121615 non-null  object
 9   decisions_closed                         120850 non-null  object
 10  total_pending_at_year_end                125

### 2. Dealing with null values

In [385]:
# Replace all '*' with ''
asylum_seekers = asylum_seekers.replace(['*'], [0]) 

In [386]:
asylum_seekers.isna().sum()

year                                           0
country_of_residence                           0
country_of_origin                              0
rsd_type                                       1
total_pending_at_year_start                 5142
total_pending_year_start_UNHCR_assisted    10758
total_applied_during_year                   3498
decisions_recognized                        9834
total_rejected                              8105
decisions_closed                            8870
total_pending_at_year_end                   3888
total_pending_year_end_UNHCR_assisted       7837
dtype: int64

In [387]:
mode_rsd = asylum_seekers['rsd_type'].mode()
mode_rsd

0    G / FI
dtype: object

In [388]:
# rsd : refugee status determination type
asylum_seekers['rsd_type'] = asylum_seekers['rsd_type'].fillna(value = 'G / FI')

In [389]:
asylum_seekers['rsd_type'].unique()

array(['G / FI', 'U / FI', 'G / EO', 'G / IN', 'U / AR', 'G / AR',
       'G / JR', 'G / FA', 'G / TR', 'G / BL', 'G / RA', 'G / NA',
       'G / CA', 'U / FA', 'J / FI', 'G / fi', 'U / JR', 'G / ar',
       'J / AR', 'U / RA', 'J / FA', 'G / SP', 'J / RA', 'G / TP',
       'G / TA', 'U / NA'], dtype=object)

In [390]:
asylum_seekers['rsd_type']

0         G / FI
1         G / FI
2         U / FI
3         G / EO
4         G / IN
           ...  
129715    G / IN
129716    G / EO
129717    G / AR
129718    G / FI
129719    G / FI
Name: rsd_type, Length: 129720, dtype: object

In [391]:
def fill_na_0(column_name):
    asylum_seekers[column_name] = asylum_seekers[column_name].fillna(value = 0)

In [392]:
fill_na_0('total_pending_at_year_start')
fill_na_0('total_pending_year_start_UNHCR_assisted')
fill_na_0('total_pending_at_year_end')
fill_na_0('total_pending_year_end_UNHCR_assisted')
fill_na_0('total_applied_during_year')
fill_na_0('decisions_recognized')
fill_na_0('total_rejected')
fill_na_0('decisions_closed')

In [393]:
asylum_seekers.isna().sum()

year                                       0
country_of_residence                       0
country_of_origin                          0
rsd_type                                   0
total_pending_at_year_start                0
total_pending_year_start_UNHCR_assisted    0
total_applied_during_year                  0
decisions_recognized                       0
total_rejected                             0
decisions_closed                           0
total_pending_at_year_end                  0
total_pending_year_end_UNHCR_assisted      0
dtype: int64

### 3. Changing data type and column values

In [394]:
for col in ['total_pending_at_year_start', \
            'total_pending_year_start_UNHCR_assisted', \
            'total_applied_during_year', \
            'decisions_recognized', \
            'total_rejected', \
            'decisions_closed', \
            'total_pending_at_year_end', \
            'total_pending_year_end_UNHCR_assisted']:
    asylum_seekers[col] = pd.to_numeric(asylum_seekers[col], errors='coerce')
    asylum_seekers[col] = asylum_seekers[col].astype('float')

In [395]:
asylum_seekers['rsd_type'] = [[x[:1], x[4:]] for x in asylum_seekers['rsd_type']]

In [396]:
def change_country_names(column_name):
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['United States of America'],['USA'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Syrian Arab Rep.'],['Syria'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Serbia and Kosovo (S/RES/1244 (1999))'],['Serbia/Kosovo'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Venezuela (Bolivarian Republic of)'],['Venezuela'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Iran (Islamic Rep. of)'],['Iran'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['The former Yugoslav Republic of Macedonia', 'The former Yugoslav Rep. of Macedonia'],['Macedonia', 'Macedonia'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['United Arab Emirates'],['UAE'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Dem. Rep. of the Congo'],['Congo'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Bolivia (Plurinational State of)'],['Bolivia'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['United Kingdom', 'United Kingdom of Great Britain and Northern Ireland'],['UK', 'UK'])
    asylum_seekers[column_name] = asylum_seekers[column_name].replace(['Micronesia (Federated States of)'],['Micronesia'])

change_country_names('country_of_residence')
change_country_names('country_of_origin')

### 4. Combining features and creating feature

In [397]:
asylum_seekers['total_pending_year_start'] = asylum_seekers['total_pending_at_year_start'] + asylum_seekers['total_pending_year_start_UNHCR_assisted']

In [398]:
asylum_seekers['total_pending_year_end'] = asylum_seekers['total_pending_at_year_end'] + asylum_seekers['total_pending_year_end_UNHCR_assisted']

In [399]:
asylum_seekers = asylum_seekers.drop(['total_pending_at_year_start', 'total_pending_at_year_end',
                                      'total_pending_year_end_UNHCR_assisted', 'total_pending_year_start_UNHCR_assisted'],
                                    axis = 1)

In [400]:
def check_rejected(x):
    if x == 0:
        return 'N'
    else:
        return 'Y'

asylum_seekers['Rejected(Y/N)'] =  asylum_seekers['total_rejected'].apply(check_rejected)

In [401]:
asylum_seekers.head()

Unnamed: 0,year,country_of_residence,country_of_origin,rsd_type,total_applied_during_year,decisions_recognized,total_rejected,decisions_closed,total_pending_year_start,total_pending_year_end,Rejected(Y/N)
0,2000,Zimbabwe,Afghanistan,"[G, FI]",5.0,5.0,0.0,0.0,0.0,0.0,N
1,2000,South Africa,Afghanistan,"[G, FI]",0.0,0.0,0.0,0.0,9.0,8.0,N
2,2000,Uzbekistan,Afghanistan,"[U, FI]",2156.0,747.0,112.0,327.0,530.0,2470.0,Y
3,2000,USA,Afghanistan,"[G, EO]",225.0,151.0,31.0,68.0,196.0,171.0,Y
4,2000,USA,Afghanistan,"[G, IN]",218.0,182.0,51.0,40.0,193.0,150.0,Y


# Datasets for EDA

In [365]:
asylum_seekers_monthly = pd.read_csv('data/asylum_seekers_monthly.csv')
asylum_seekers_monthly.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Country / territory of asylum/residence,Origin,Year,Month,Value
0,Australia,Afghanistan,1999,January,8
1,Australia,Afghanistan,1999,February,10
2,Australia,Afghanistan,1999,March,25
3,Australia,Afghanistan,1999,April,25
4,Australia,Afghanistan,1999,May,7


In [366]:
asylum_seekers_monthly.columns = [ 'country_of_residence', 
          'country_of_origin', 
          'year_applied', 
          'month_applied', 
          'total_asylum_seekers']

In [367]:
asylum_seekers_monthly.isna().sum()

country_of_residence    0
country_of_origin       0
year_applied            0
month_applied           0
total_asylum_seekers    0
dtype: int64

In [371]:
def change_country_names_2(column_name):
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['USA (INS/DHS)'],['USA'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['United States of America'],['USA'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['USA (EOIR)'],['USA'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Syrian Arab Rep.'],['Syria'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Serbia and Kosovo: S/RES/1244 (1999)'],['Serbia/Kosovo'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Venezuela (Bolivarian Republic of)'],['Venezuela'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Iran (Islamic Rep. of)'],['Iran'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['The former Yugoslav Rep. of Macedonia'],['Macedonia'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['United Arab Emirates'],['UAE'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Dem. Rep. of the Congo'],['Congo'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Bolivia (Plurinational State of)'],['Bolivia'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['United Kingdom of Great Britain and Northern Ireland'],['UK'])
    asylum_seekers_monthly[column_name] = asylum_seekers_monthly[column_name].replace(['Micronesia (Federated States of)'],['Micronesia'])
change_country_names_2('country_of_residence')
change_country_names_2('country_of_origin')

In [376]:
asylum_seekers_monthly.tail()

Unnamed: 0,country_of_residence,country_of_origin,year_applied,month_applied,total_asylum_seekers
332184,USA,Zimbabwe,2016,December,28
332185,USA,Zimbabwe,2017,February,27
332186,USA,Zimbabwe,2017,March,42
332187,USA,Zimbabwe,2017,April,16
332188,USA,Zimbabwe,2017,May,12


In [10]:
demographics = pd.read_csv('data/demographics.csv')
demographics.head()

Unnamed: 0,Year,Country / territory of asylum/residence,Location Name,Female 0-4,Female 5-11,Female 5-17,Female 12-17,Female 18-59,Female 60+,F: Unknown,F: Total,Male 0-4,Male 5-11,Male 5-17,Male 12-17,Male 18-59,Male 60+,M: Unknown,M: Total
0,2001,Afghanistan,Kabul,0,,1.0,,1,0,0.0,2,0,,0.0,,2,0,0.0,2
1,2001,Afghanistan,Various,14335,,45451.0,,99880,19234,412004.0,590904,14716,,47522.0,,114965,13025,435492.0,625720
2,2001,Afghanistan,Herat,0,,0.0,,1,0,0.0,1,0,,0.0,,1,0,0.0,1
3,2001,Angola,Viana,484,,1687.0,,1282,43,0.0,3496,597,,1645.0,,787,34,0.0,3063
4,2001,Angola,Moxico,219,,734.0,,427,25,0.0,1405,226,,711.0,,139,15,0.0,1091
