In [1]:
#import required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt #visualisation 
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw asylum-applications data from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="asylum-applications.csv")

In [3]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,Year,Country of origin,Country of asylum,Country of origin (ISO),Country of asylum (ISO),Authority,Application type,Stage of procedure,Cases / Persons,applied
0,2018,Albania,Austria,ALB,AUT,G,N,FA,P,63
1,2018,Albania,Belgium,ALB,BEL,G,N,FI,P,505
2,2018,Albania,Belgium,ALB,BEL,G,R,RA,P,165
3,2018,Albania,Cyprus,ALB,CYP,G,N,FI,P,5
4,2018,Albania,Czechia,ALB,CZE,G,N,FI,P,5


In [4]:
# Preview the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,Year,Country of origin,Country of asylum,Country of origin (ISO),Country of asylum (ISO),Authority,Application type,Stage of procedure,Cases / Persons,applied
3493,2023,Ukraine,Sweden,UKR,SWE,G,A,AR,P,7
3494,2023,Ukraine,Sweden,UKR,SWE,G,N,FI,P,232
3495,2023,Ukraine,Sweden,UKR,SWE,G,R,RA,P,40
3496,2023,Ukraine,Switzerland,UKR,CHE,G,N,FI,P,17
3497,2023,Ukraine,Türkiye,UKR,TUR,G,N,FI,P,520


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(3498, 10)

In [6]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3498 entries, 0 to 3497
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Year                     3498 non-null   int64 
 1   Country of origin        3498 non-null   object
 2   Country of asylum        3498 non-null   object
 3   Country of origin (ISO)  3498 non-null   object
 4   Country of asylum (ISO)  3498 non-null   object
 5   Authority                3498 non-null   object
 6   Application type         3498 non-null   object
 7   Stage of procedure       3433 non-null   object
 8   Cases / Persons          3498 non-null   object
 9   applied                  3498 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 273.4+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Year,applied
count,3498.0,3498.0
mean,2020.489423,305.375071
std,1.71722,1963.590716
min,2018.0,5.0
25%,2019.0,6.0
50%,2021.0,22.0
75%,2022.0,117.0
max,2023.0,100758.0


In [8]:
# NOTE(review): repeats the earlier df.info() call; df has not been modified in between,
# so the output is identical. Candidate for removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3498 entries, 0 to 3497
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Year                     3498 non-null   int64 
 1   Country of origin        3498 non-null   object
 2   Country of asylum        3498 non-null   object
 3   Country of origin (ISO)  3498 non-null   object
 4   Country of asylum (ISO)  3498 non-null   object
 5   Authority                3498 non-null   object
 6   Application type         3498 non-null   object
 7   Stage of procedure       3433 non-null   object
 8   Cases / Persons          3498 non-null   object
 9   applied                  3498 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 273.4+ KB


In [9]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
is_repeat = df.duplicated()
duplicate_rows_df = df.loc[is_repeat]
# Prints the (rows, columns) shape of the duplicate subset, not a bare count.
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (0, 10)


In [10]:
# Keep the first occurrence of each fully-identical row and discard the rest.
df = df.drop_duplicates(keep="first")
# Show the first ten rows of the de-duplicated frame.
df.head(n=10)

Unnamed: 0,Year,Country of origin,Country of asylum,Country of origin (ISO),Country of asylum (ISO),Authority,Application type,Stage of procedure,Cases / Persons,applied
0,2018,Albania,Austria,ALB,AUT,G,N,FA,P,63
1,2018,Albania,Belgium,ALB,BEL,G,N,FI,P,505
2,2018,Albania,Belgium,ALB,BEL,G,R,RA,P,165
3,2018,Albania,Cyprus,ALB,CYP,G,N,FI,P,5
4,2018,Albania,Czechia,ALB,CZE,G,N,FI,P,5
5,2018,Albania,Denmark,ALB,DNK,G,N,FI,P,78
6,2018,Albania,Finland,ALB,FIN,G,N,FI,P,36
7,2018,Albania,Finland,ALB,FIN,G,R,RA,P,5
8,2018,Albania,France,ALB,FRA,G,A,AR,P,4262
9,2018,Albania,France,ALB,FRA,G,N,FI,P,8311


In [11]:
# Last five rows after de-duplication.
df.tail(n=5)

Unnamed: 0,Year,Country of origin,Country of asylum,Country of origin (ISO),Country of asylum (ISO),Authority,Application type,Stage of procedure,Cases / Persons,applied
3493,2023,Ukraine,Sweden,UKR,SWE,G,A,AR,P,7
3494,2023,Ukraine,Sweden,UKR,SWE,G,N,FI,P,232
3495,2023,Ukraine,Sweden,UKR,SWE,G,R,RA,P,40
3496,2023,Ukraine,Switzerland,UKR,CHE,G,N,FI,P,17
3497,2023,Ukraine,Türkiye,UKR,TUR,G,N,FI,P,520


# Dealing with missing data in the dataset

In [12]:
# Count missing values per column.
df.isna().sum()

Year                        0
Country of origin           0
Country of asylum           0
Country of origin (ISO)     0
Country of asylum (ISO)     0
Authority                   0
Application type            0
Stage of procedure         65
Cases / Persons             0
applied                     0
dtype: int64

In [13]:
# Extra tokens that should be parsed as missing values when reading the CSV.
# "NA", "n/a" and "nan" are already in pandas' default NA set; they are kept
# here so the full list of recognised markers is visible in one place.
missing_value_formats = ["n.a.", "?", "NA", "n/a", "na", "nan", "--"]

# Re-reading the file replaces `df`, which would silently discard the earlier
# drop_duplicates() step. Re-apply it here so the cleaned state is preserved.
df = pd.read_csv(
    "asylum-applications.csv", na_values=missing_value_formats
).drop_duplicates()

In [14]:
# Re-count missing values per column after re-reading with the custom NA markers.
df.isna().sum()

Year                        0
Country of origin           0
Country of asylum           0
Country of origin (ISO)     0
Country of asylum (ISO)     0
Authority                   0
Application type            0
Stage of procedure         65
Cases / Persons             0
applied                     0
dtype: int64

In [22]:
# Distinct values in the only column that has missing entries.
df.loc[:, "Stage of procedure"].unique()

array(['FA', 'FI', 'RA', 'AR', 'JR', nan, 'TA'], dtype=object)

In [23]:
# Remove any column whose values are all missing.
df = df.dropna(axis="columns", how="all")

In [24]:
# Fill missing values with a placeholder that matches each column's type.
# A blanket fillna(0) would write the integer 0 into text columns such as
# "Stage of procedure", leaving a mixed str/int column that breaks sorting,
# grouping and label encoding later on.
# NOTE(review): "Unknown" is an assumed label for a missing stage; confirm
# against the data dictionary.
non_numeric_cols = df.select_dtypes(exclude="number").columns
df = (
    df
    .fillna({col: "Unknown" for col in non_numeric_cols})  # text/categorical columns
    .fillna(0)                                              # remaining numeric columns
)

In [25]:
# Verify that no missing values remain after filling.
df.isna().sum()

Year                       0
Country of origin          0
Country of asylum          0
Country of origin (ISO)    0
Country of asylum (ISO)    0
Authority                  0
Application type           0
Stage of procedure         0
Cases / Persons            0
applied                    0
dtype: int64

# Some visuals of the initial dataset