In [8]:
import pandas as pandas
import numpy as np

In [9]:
# Load the 'data' sheet of 'test.xlsx' with header=None, so no row is consumed
# as a header and the columns are labelled positionally.

df_test = pandas.read_excel('test.xlsx', sheet_name='data', header=None)

df_test

Unnamed: 0,0,1,2
0,Community,Community Name,Braybrook (Suburb)
1,Community,Region,Northern and Western Metropolitan
2,Geography,Map reference,4
3,Geography,Grid reference,A3
4,Geography,Location,10km WNW of Melbourne
...,...,...,...
221,Hospital,Distance to nearest public hospital with emerg...,10.161988
222,Hospital,Presentations to emergency departments due to ...,543.631989
223,Hospital,Presentations to emergency departments due to ...,20.647263
224,Hospital,Category 4 & 5 emergency department presentations,1683.966712


In [10]:
# Keep the first column of the test sheet as the reference column; it is the
# default `ref_col` that excel_loader assigns over column 0 of each sheet it loads.
ref_col = df_test.iloc[:, 0]

In [11]:
def excel_loader(xlsx_path, sheet_name='data', ref_col=ref_col):
    """Read one headerless Excel sheet into a three-column feature table.

    Args:
        xlsx_path: Path of the workbook to read.
        sheet_name: Name of the sheet to load; defaults to 'data'.
        ref_col: Values assigned over the first column of the loaded sheet.
            The default is the module-level `ref_col`, captured at the time
            this function is defined.

    Returns:
        The loaded DataFrame, with its first column replaced by `ref_col` and
        its columns named 'feature_kind', 'feature_name', 'feature_value'.
    """
    column_names = ['feature_kind', 'feature_name', 'feature_value']
    sheet = pandas.read_excel(xlsx_path, sheet_name=sheet_name, header=None)
    # Overwrite column 0 with the shared reference labels.
    sheet.iloc[:, 0] = ref_col
    sheet.columns = column_names
    return sheet



In [12]:
# Try the loader on a single workbook and display the resulting frame.
excel_loader('data/Ascot-Vale-Suburb - XLSX.xlsx')

Unnamed: 0,feature_kind,feature_name,feature_value
0,Community,Community Name,Ascot Vale (Suburb)
1,Community,Region,Northern and Western Metropolitan
2,Geography,Map reference,4
3,Geography,Grid reference,B3
4,Geography,Location,6km NW of Melbourne
...,...,...,...
221,Hospital,Distance to nearest public hospital with emerg...,4.993841
222,Hospital,Presentations to emergency departments due to ...,679.257076
223,Hospital,Presentations to emergency departments due to ...,20.502456
224,Hospital,Category 4 & 5 emergency department presentations,1864.918123


In [13]:
import os

# os.listdir returns entries in arbitrary order; sort the names so that the
# positional indices used by later cells point at the same workbook on every run.
dataset_list = ['data/' + f for f in sorted(os.listdir('data')) if f.endswith('.xlsx')]
# Load every workbook through excel_loader, in the same order as dataset_list.
df_all = [excel_loader(f) for f in dataset_list]
df_all

[    feature_kind                                       feature_name  \
 0      Community                                     Community Name   
 1      Community                                             Region   
 2      Geography                                      Map reference   
 3      Geography                                     Grid reference   
 4      Geography                                           Location   
 ..           ...                                                ...   
 221     Hospital  Distance to nearest public hospital with emerg...   
 222     Hospital  Presentations to emergency departments due to ...   
 223     Hospital  Presentations to emergency departments due to ...   
 224     Hospital  Category 4 & 5 emergency department presentations   
 225     Hospital  Category 4 & 5 emergency department presentati...   
 
                          feature_value  
 0                  Ascot Vale (Suburb)  
 1    Northern and Western Metropolitan  
 2      

In [15]:
# count the rows (shape[0]) of each dataset
df_all_count = [df.shape[0] for df in df_all]
# check that all the datasets have the same number of rows
all(x == df_all_count[0] for x in df_all_count)

True

In [29]:
# Flag each dataset that contains at least one missing cell.
missing_values = [frame.isnull().values.any() for frame in df_all]
# Collect the positions (within df_all) of the flagged datasets.
dfs_with_na = [pos for pos, has_na in enumerate(missing_values) if has_na]
dfs_with_na

[8, 9, 22, 26, 29, 30, 31, 32]

In [30]:
# Print the workbook path of every dataset that has missing values.
for i in dfs_with_na:
    print(dataset_list[i])

data/Malvern-Suburb - XLSX.xlsx
data/Melbourne-Airport-Suburb - XLSX.xlsx
data/Sorrento-Suburb - XLSX.xlsx
data/St-Andrews-Beach-Suburb - XLSX.xlsx
data/St-Kilda-West-Suburb - XLSX.xlsx
data/Toorak-Suburb - XLSX.xlsx
data/Tyabb-Suburb - XLSX.xlsx
data/Waterways-Suburb - XLSX.xlsx


In [31]:
# utility: return the rows of a dataset that contain at least one missing value

def get_missing_values(df):
    """Return the rows of `df` that contain at least one missing value.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame holding every row of `df` that has a null in any column,
        in the original row order and with the original index labels.
    """
    # Boolean row mask: True where any column of the row is null.
    has_missing = df.isnull().any(axis=1)
    # Select with the mask rather than feeding index labels to `iloc`: labels
    # are not positions, so the positional lookup is only correct for a
    # default 0..n-1 index and otherwise raises or returns the wrong rows.
    return df.loc[has_missing]

In [38]:
# For each dataset flagged in dfs_with_na, keep only its rows that contain a missing value.
affected_details = [get_missing_values(df_all[i]) for i in dfs_with_na]

In [41]:
# Number of rows with missing values in each affected dataset.
affected_rows_count=[df.shape[0] for df in affected_details]
affected_rows_count

[1, 52, 5, 8, 1, 1, 1, 5]

In [44]:
# Print the rows with missing values for every affected dataset except the one
# at position 1.
# NOTE(review): position 1 is presumably skipped because it has far more
# affected rows than the others — confirm.
for i in range(len(dfs_with_na)):
    if i == 1:
        continue
    print(affected_details[i])

    feature_kind                             feature_name feature_value
171    Diversity  Aboriginal or Torres Strait Islander, %           NaN
          feature_kind                             feature_name feature_value
120  Socio-demographic     % dwellings which are public housing           NaN
133  Socio-demographic      Male-headed lone parent families, %           NaN
171          Diversity  Aboriginal or Torres Strait Islander, %           NaN
179          Diversity              Poor English proficiency, %           NaN
209          Diversity               5th top language spoken, %           NaN
          feature_kind                             feature_name feature_value
120  Socio-demographic     % dwellings which are public housing           NaN
122  Socio-demographic       Dwellings with no motor vehicle, %           NaN
153  Socio-demographic              Aged 75+ and lives alone, %           NaN
171          Diversity  Aboriginal or Torres Strait Islander, %           Na

In [50]:
# figure out which feature category is missing across the affected datasets
missing_feature_categories = [df['feature_kind'].unique() for df in affected_details]
missing_feature_names = [df['feature_name'].unique() for df in affected_details]

missing_feature_names[:1]+missing_feature_categories[2:]

[array(['Aboriginal or Torres Strait Islander, %'], dtype=object),
 array(['Socio-demographic', 'Diversity'], dtype=object),
 array(['Socio-demographic', 'Diversity'], dtype=object),
 array(['Diversity'], dtype=object),
 array(['Socio-demographic'], dtype=object),
 array(['Socio-demographic'], dtype=object),
 array(['2007-2012 population change', 'Socio-demographic', 'Diversity'],
       dtype=object)]