# Unemployment Data Cleaning
---
Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_ue = pd.read_csv('../../data/Unemployment API_SL.UEM.TOTL.ZS_DS2_en_csv_v2_3731354/API_SL-Copy1.UEM.TOTL.ZS_DS2_en_csv_v2_3731354.csv')

In [3]:
df_ue.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2016,2017,2018,2019,2020
0,Aruba,ABW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,
1,Africa Eastern and Southern,AFE,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,6.610205,6.714955,6.731163,6.914353,7.563187
2,Afghanistan,AFG,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,11.158,11.18,11.152,11.217,11.71
3,Africa Western and Central,AFW,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,5.567017,6.019505,6.041092,6.063362,6.774914
4,Angola,AGO,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,7.412,7.408,7.421,7.421,8.333


In [4]:
df_ue['Indicator Code'].value_counts()

SL.UEM.TOTL.ZS    266
Name: Indicator Code, dtype: int64

In [5]:
df_ue.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)

## Set the index columns

In [6]:
df_ue.set_index(["Country Name","Country Code"], inplace=True)
df_ue.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,2016,2017,2018,2019,2020
Country Name,Country Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aruba,ABW,,,,,
Africa Eastern and Southern,AFE,6.610205,6.714955,6.731163,6.914353,7.563187
Afghanistan,AFG,11.158,11.18,11.152,11.217,11.71
Africa Western and Central,AFW,5.567017,6.019505,6.041092,6.063362,6.774914
Angola,AGO,7.412,7.408,7.421,7.421,8.333


## Use unstack() and stack() to pivot those that are not index columns, and create a new df

In [7]:
new_ue = df_ue.unstack().stack(level=0)
new_ue.head(10)

Unnamed: 0_level_0,Country Code,ABW,AFE,AFG,AFW,AGO,ALB,AND,ARB,ARE,ARG,...,VIR,VNM,VUT,WLD,WSM,XKX,YEM,ZAF,ZMB,ZWE
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,2016,,,11.158,,,,,,,,...,,,,,,,,,,
Afghanistan,2017,,,11.18,,,,,,,,...,,,,,,,,,,
Afghanistan,2018,,,11.152,,,,,,,,...,,,,,,,,,,
Afghanistan,2019,,,11.217,,,,,,,,...,,,,,,,,,,
Afghanistan,2020,,,11.71,,,,,,,,...,,,,,,,,,,
Africa Eastern and Southern,2016,,6.610205,,,,,,,,,...,,,,,,,,,,
Africa Eastern and Southern,2017,,6.714955,,,,,,,,,...,,,,,,,,,,
Africa Eastern and Southern,2018,,6.731163,,,,,,,,,...,,,,,,,,,,
Africa Eastern and Southern,2019,,6.914353,,,,,,,,,...,,,,,,,,,,
Africa Eastern and Southern,2020,,7.563187,,,,,,,,,...,,,,,,,,,,


## Reset the index

In [8]:
new_ue.reset_index(inplace=True)
new_ue.head()

Country Code,Country Name,level_1,ABW,AFE,AFG,AFW,AGO,ALB,AND,ARB,...,VIR,VNM,VUT,WLD,WSM,XKX,YEM,ZAF,ZMB,ZWE
0,Afghanistan,2016,,,11.158,,,,,,...,,,,,,,,,,
1,Afghanistan,2017,,,11.18,,,,,,...,,,,,,,,,,
2,Afghanistan,2018,,,11.152,,,,,,...,,,,,,,,,,
3,Afghanistan,2019,,,11.217,,,,,,...,,,,,,,,,,
4,Afghanistan,2020,,,11.71,,,,,,...,,,,,,,,,,


In [9]:
new_ue.shape

(1175, 268)

## Rename the Year column

In [10]:
new_ue.rename(columns={'level_1': 'Year'}, inplace=True)

In [11]:
new_ue.head()

Country Code,Country Name,Year,ABW,AFE,AFG,AFW,AGO,ALB,AND,ARB,...,VIR,VNM,VUT,WLD,WSM,XKX,YEM,ZAF,ZMB,ZWE
0,Afghanistan,2016,,,11.158,,,,,,...,,,,,,,,,,
1,Afghanistan,2017,,,11.18,,,,,,...,,,,,,,,,,
2,Afghanistan,2018,,,11.152,,,,,,...,,,,,,,,,,
3,Afghanistan,2019,,,11.217,,,,,,...,,,,,,,,,,
4,Afghanistan,2020,,,11.71,,,,,,...,,,,,,,,,,


## Fill the new null values

In [13]:
new_ue.fillna(0)

Country Code,Country Name,Year,ABW,AFE,AFG,AFW,AGO,ALB,AND,ARB,...,VIR,VNM,VUT,WLD,WSM,XKX,YEM,ZAF,ZMB,ZWE
0,Afghanistan,2016,0.0,0.0,11.158,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000
1,Afghanistan,2017,0.0,0.0,11.180,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000
2,Afghanistan,2018,0.0,0.0,11.152,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000
3,Afghanistan,2019,0.0,0.0,11.217,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000
4,Afghanistan,2020,0.0,0.0,11.710,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,Zimbabwe,2016,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.788
1171,Zimbabwe,2017,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.785
1172,Zimbabwe,2018,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.796
1173,Zimbabwe,2019,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.833


## Sum Across the row (those with the annual GDP data)

In [14]:
new_ue['Total_Unemployment']= new_ue.iloc[:, -265:-1].sum(axis=1)
new_ue.head()

Country Code,Country Name,Year,ABW,AFE,AFG,AFW,AGO,ALB,AND,ARB,...,VNM,VUT,WLD,WSM,XKX,YEM,ZAF,ZMB,ZWE,Total_Unemployment
0,Afghanistan,2016,,,11.158,,,,,,...,,,,,,,,,,11.158
1,Afghanistan,2017,,,11.18,,,,,,...,,,,,,,,,,11.18
2,Afghanistan,2018,,,11.152,,,,,,...,,,,,,,,,,11.152
3,Afghanistan,2019,,,11.217,,,,,,...,,,,,,,,,,11.217
4,Afghanistan,2020,,,11.71,,,,,,...,,,,,,,,,,11.71


## Drop the unneccessary columns

In [16]:
new_ue = new_ue[['Country Name', 'Year', 'Total_Unemployment']]
new_ue.head(10)

Country Code,Country Name,Year,Total_Unemployment
0,Afghanistan,2016,11.158
1,Afghanistan,2017,11.18
2,Afghanistan,2018,11.152
3,Afghanistan,2019,11.217
4,Afghanistan,2020,11.71
5,Africa Eastern and Southern,2016,6.610205
6,Africa Eastern and Southern,2017,6.714955
7,Africa Eastern and Southern,2018,6.731163
8,Africa Eastern and Southern,2019,6.914353
9,Africa Eastern and Southern,2020,7.563187


In [17]:
new_ue.rename(columns = {'Country Name': 'Country of asylum'}, inplace=True)
new_ue.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Country Code,Country of asylum,Year,Total_Unemployment
0,Afghanistan,2016,11.158
1,Afghanistan,2017,11.18
2,Afghanistan,2018,11.152
3,Afghanistan,2019,11.217
4,Afghanistan,2020,11.71


## Change the 'Year' from an object to int

In [18]:
new_ue.dtypes

Country Code
Country of asylum      object
Year                   object
Total_Unemployment    float64
dtype: object

In [19]:
new_ue['Year']=new_ue['Year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_ue['Year']=new_ue['Year'].astype(int)


In [20]:
new_ue.dtypes

Country Code
Country of asylum      object
Year                    int64
Total_Unemployment    float64
dtype: object

In [21]:
new_ue.shape

(1175, 3)

## Save file as cleaned unemployment

In [24]:
new_ue.to_csv('../../data/Unemployment API_SL.UEM.TOTL.ZS_DS2_en_csv_v2_3731354/cleaned_ue.csv', index=True)