## Cleaning demographics.csv dataset


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [9]:
demographics_df = pd.read_csv('../Stats Dataset Uncleaned/demographics.csv')
demographics_df.head()

Unnamed: 0,Year,Territory of asylum or residence,Location Name,Female 0-4,Female 5-11,Female 5-17,Female 12-17,Female 18-59,Female 60+,F: Unknown,F: Total,Male 0-4,Male 5-11,Male 5-17,Male 12-17,Male 18-59,Male 60+,M: Unknown,M: Total
0,2001,Afghanistan,Kabul,0,,1.0,,1,0,0.0,2,0,,0.0,,2,0,0.0,2
1,2001,Afghanistan,Various,14335,,45451.0,,99880,19234,412004.0,590904,14716,,47522.0,,114965,13025,435492.0,625720
2,2001,Afghanistan,Herat,0,,0.0,,1,0,0.0,1,0,,0.0,,1,0,0.0,1
3,2001,Angola,Viana,484,,1687.0,,1282,43,0.0,3496,597,,1645.0,,787,34,0.0,3063
4,2001,Angola,Moxico,219,,734.0,,427,25,0.0,1405,226,,711.0,,139,15,0.0,1091


In [10]:
demographics_df.isnull().sum()

Year                                    0
Territory of asylum or residence        0
Location Name                           1
Female 0-4                           1273
Female 5-11                          4429
Female 5-17                         15160
Female 12-17                         4407
Female 18-59                          662
Female 60+                           1456
F: Unknown                           5863
F: Total                                0
Male 0-4                             1281
Male 5-11                            4406
Male 5-17                           15160
Male 12-17                           4370
Male 18-59                            436
Male 60+                             1370
M: Unknown                           5856
M: Total                                0
dtype: int64

In [11]:
demographics_df.columns

Index(['Year', 'Territory of asylum or residence', 'Location Name',
       'Female 0-4', 'Female 5-11', 'Female 5-17', 'Female 12-17',
       'Female 18-59', 'Female 60+', 'F: Unknown', 'F: Total', 'Male 0-4',
       'Male 5-11', 'Male 5-17', 'Male 12-17', 'Male 18-59', 'Male 60+',
       'M: Unknown', 'M: Total'],
      dtype='object')

In [17]:
### Finding the percentage of missing values in the data

features_with_na = [features for features in demographics_df.columns if demographics_df[features].isnull().sum()>=1]
for feature in features_with_na:
    print(feature,np.round(demographics_df[feature].isnull().mean()*100,5)), "% missing values."

Location Name 0.00545
Female 0-4 6.93506
Female 5-11 24.12835
Female 5-17 82.5888
Female 12-17 24.0085
Female 18-59 3.60645
Female 60+ 7.93201
F: Unknown 31.94051
Male 0-4 6.97864
Male 5-11 24.00305
Male 5-17 82.5888
Male 12-17 23.80693
Male 18-59 2.37525
Male 60+ 7.4635
M: Unknown 31.90238


### Handling the missing values

In [23]:
# Droppedthe column where the location name is null

demographics_df = demographics_df.dropna(subset=['Location Name'])

In [24]:
demographics_df.isnull().sum()

Year                                    0
Territory of asylum or residence        0
Location Name                           0
Female 0-4                           1273
Female 5-11                          4429
Female 5-17                         15159
Female 12-17                         4407
Female 18-59                          662
Female 60+                           1456
F: Unknown                           5863
F: Total                                0
Male 0-4                             1281
Male 5-11                            4406
Male 5-17                           15159
Male 12-17                           4370
Male 18-59                            436
Male 60+                             1370
M: Unknown                           5856
M: Total                                0
dtype: int64

In [31]:
demographics_df.dtypes

Year                                  int64
Territory of asylum or residence     object
Location Name                        object
Female 0-4                           object
Female 5-11                          object
Female 5-17                         float64
Female 12-17                         object
Female 18-59                         object
Female 60+                           object
F: Unknown                          float64
F: Total                             object
Male 0-4                             object
Male 5-11                            object
Male 5-17                           float64
Male 12-17                           object
Male 18-59                           object
Male 60+                             object
M: Unknown                          float64
M: Total                             object
dtype: object

In [32]:
# Convert all the numeric features to float

columns_to_convert = demographics_df.columns.difference(['Year', 'Territory of asylum or residence', 'Location Name', 'Female 5-17', 'F: Unknown','Male 5-17', 'M: Unknown'])

In [33]:
columns_to_convert

Index(['F: Total', 'Female 0-4', 'Female 12-17', 'Female 18-59', 'Female 5-11',
       'Female 60+', 'M: Total', 'Male 0-4', 'Male 12-17', 'Male 18-59',
       'Male 5-11', 'Male 60+'],
      dtype='object')

In [34]:
for column in columns_to_convert:
    demographics_df[column] = pd.to_numeric(demographics_df[column], errors='coerce').fillna(0.0).astype(float)

In [35]:
demographics_df.dtypes

Year                                  int64
Territory of asylum or residence     object
Location Name                        object
Female 0-4                          float64
Female 5-11                         float64
Female 5-17                         float64
Female 12-17                        float64
Female 18-59                        float64
Female 60+                          float64
F: Unknown                          float64
F: Total                            float64
Male 0-4                            float64
Male 5-11                           float64
Male 5-17                           float64
Male 12-17                          float64
Male 18-59                          float64
Male 60+                            float64
M: Unknown                          float64
M: Total                            float64
dtype: object

In [36]:
demographics_df['Female 0-4'].fillna(demographics_df['Female 0-4'].median(), inplace=True)
demographics_df['Female 5-11'].fillna(demographics_df['Female 5-11'].median(), inplace=True)
demographics_df['Female 5-17'].fillna(demographics_df['Female 5-17'].median(), inplace=True)
demographics_df['Female 12-17'].fillna(demographics_df['Female 12-17'].median(), inplace=True)
demographics_df['Female 18-59'].fillna(demographics_df['Female 18-59'].median(), inplace=True)
demographics_df['Female 60+'].fillna(demographics_df['Female 60+'].median(), inplace=True)
demographics_df['F: Unknown'].fillna(demographics_df['F: Unknown'].median(), inplace=True)

demographics_df['Male 0-4'].fillna(demographics_df['Male 0-4'].median(), inplace=True)
demographics_df['Male 5-11'].fillna(demographics_df['Male 5-11'].median(), inplace=True)
demographics_df['Male 5-17'].fillna(demographics_df['Male 5-17'].median(), inplace=True)
demographics_df['Male 12-17'].fillna(demographics_df['Male 12-17'].median(), inplace=True)
demographics_df['Male 18-59'].fillna(demographics_df['Male 18-59'].median(), inplace=True)
demographics_df['Male 60+'].fillna(demographics_df['Male 60+'].median(), inplace=True)
demographics_df['M: Unknown'].fillna(demographics_df['M: Unknown'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  demographics_df['Female 0-4'].fillna(demographics_df['Female 0-4'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  demographics_df['Female 5-11'].fillna(demographics_df['Female 5-11'].median(), inplace=True)
The behavior will change in pandas 3.0. This inp

In [37]:
demographics_df.isnull().sum()

Year                                0
Territory of asylum or residence    0
Location Name                       0
Female 0-4                          0
Female 5-11                         0
Female 5-17                         0
Female 12-17                        0
Female 18-59                        0
Female 60+                          0
F: Unknown                          0
F: Total                            0
Male 0-4                            0
Male 5-11                           0
Male 5-17                           0
Male 12-17                          0
Male 18-59                          0
Male 60+                            0
M: Unknown                          0
M: Total                            0
dtype: int64