In [4]:
# Make sure the autoreload extension is loaded, then select mode 2 so that
# edits to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import math
import sys
from pathlib import Path

import glob2
import numpy as np
import pandas as pd
from azureml.core import Dataset, Workspace
import matplotlib.pyplot as plt

# Put the parent of the current working directory on the import path so the
# project-local `data_utilities` module imported below can be found.
# `Path.cwd()` is used because `os` is never imported in this notebook.
sys.path.append(str(Path.cwd().parent))
from data_utilities import find_outlier_qrcodes, convert_age_from_days_to_years, extractqrcode, draw_sex_distribution, draw_age_distribution

### Read the CSV and count the total no. of children

In [5]:
# Root directory of the dataset; the label CSVs and the .pcd files used later
# in this notebook are all resolved relative to it.
DATASET_PATH = Path('/mnt/datasets/depthmap56k')

In [6]:
# Load labels/scans.csv and report its row count as the number of children.
scans_csv = DATASET_PATH / 'labels/scans.csv'
scans = pd.read_csv(scans_csv)
print("Total no. of children:", len(scans))
scans.head()

FileNotFoundError: [Errno 2] File b'/mnt/datasets/depthmap56k/labels/scans.csv' does not exist: b'/mnt/datasets/depthmap56k/labels/scans.csv'

### Getting the totals per gender

In [None]:
# Open a fresh figure, let the data_utilities helper draw the sex
# distribution of `scans` into it, then render the figure.
plt.figure()
draw_sex_distribution(scans)
plt.show()

In [None]:
# Add a 'Years' column by applying the days-to-years helper to every row.
age_in_years = scans.apply(convert_age_from_days_to_years, axis=1)
scans['Years'] = age_in_years
scans.head()

### Plotting a bar graph of the no. of children against age

In [None]:
# Open a fresh figure, let the data_utilities helper draw the age
# distribution of `scans` into it, then render the figure.
plt.figure()
draw_age_distribution(scans)
plt.show()

### Getting the age distribution based on gender

In [None]:
# Keep only the rows whose 'sex' is 'male' and count rows per 'Years' value.
is_male = scans['sex'] == 'male'
male = scans[is_male]
print('No. of male distribution: ')
print(male['Years'].value_counts())

In [None]:
# Keep only the rows whose 'sex' is 'female' and count rows per 'Years' value.
is_female = scans['sex'] == 'female'
female = scans[is_female]
print('No. of female distribution: ')
print(female['Years'].value_counts())

### Reading the CSV file and getting the data at the artifact level

In [None]:
# Load labels/artifacts.csv and preview its first rows.
artifacts_csv = DATASET_PATH / 'labels/artifacts.csv'
artifacts = pd.read_csv(artifacts_csv)
artifacts.head()

In [None]:
# Discard every artifacts row that contains at least one null value.
artifacts = artifacts.dropna(axis=0, how='any')

In [None]:
# Row count of the cleaned artifacts table, reported as the pointcloud total.
print("Total no. of pointclouds:", artifacts.shape[0])

In [None]:
# Distribution of the scan-type keys in the dataset.
# Key legend:
#   100 - standing front scan
#   101 - standing 360 scan
#   102 - standing back scan
#   200 - lying front scan
#   201 - lying side scan
#   202 - lying back scan
# The counts are computed once and reused for both the printout and the pie chart.
key_counts = artifacts['key'].value_counts()
print(key_counts)
_ = key_counts.plot(kind='pie')

### Plotting the height and weight distributions

In [None]:
# Add a 'qrcode' column by applying the extractqrcode helper to every row.
qrcode_per_artifact = artifacts.apply(extractqrcode, axis=1)
artifacts['qrcode'] = qrcode_per_artifact
artifacts.head()

In [None]:
# Keep the first artifact row for each qrcode and renumber the index from 0.
first_per_qrcode = artifacts.drop_duplicates(subset=['qrcode'])
getheight = first_per_qrcode.reset_index(drop=True)
getheight.head()

In [None]:
# Number of rows left after de-duplicating on qrcode.
print("Total no.unique scans:", getheight.shape[0])

In [None]:
# Line plot of the 'height' column, one point per de-duplicated scan row.
# The plotted column must be 'height' so that it matches the y-axis label.
heightax = getheight['height'].plot()
heightax.set_xlabel('scan')
heightax.set_ylabel('height')

In [None]:
# Line plot of the 'weight' column, one point per de-duplicated scan row.
weightax = getheight['weight'].plot()
weightax.set_xlabel('scan')
# The weight label belongs on the y-axis; calling set_xlabel again would
# overwrite the 'scan' label and leave the y-axis unlabelled.
weightax.set_ylabel('weight')

In [None]:
# Left-join height and weight onto scans via the shared 'qrcode' column, so
# every scans row is kept even when no measurement matches.
measurements = getheight[['height','weight','qrcode']]
distribution_data = scans.merge(measurements, on='qrcode', how='left')
distribution_data.head()

In [None]:
# Scatter plot with height on the x-axis and age on the y-axis.
height_vs_age = distribution_data.plot(kind='scatter', x='height', y='age', c='red')

In [None]:
# Scatter plot with weight on the x-axis and age on the y-axis.
weight_vs_age = distribution_data.plot(kind='scatter', x='weight', y='age', c='red')

In [None]:
# Scatter plot with height on the x-axis and weight on the y-axis.
weight_vs_height = distribution_data.plot(kind='scatter', x='height', y='weight', c='red')

## Analyze outliers: 1) age, 2) weight, 3) height

In [None]:
# Summary statistics of the merged table, used to eyeball implausible
# values before the outlier checks that follow.
distribution_data.describe()

In [None]:
# Preview the first rows of the merged table.
distribution_data.head()

### 1) Check age

##### Check age < 6 months or > 6 years

In [None]:
# Outliers with age below 365/2 days, i.e. younger than 6 months.
# NOTE(review): the condition is passed as a string; how it is evaluated is
# defined in data_utilities.find_outlier_qrcodes - confirm there.
find_outlier_qrcodes(distribution_data, 'age', '<365/2')

In [None]:
# Outliers with age above 365*6 days, i.e. older than 6 years.
find_outlier_qrcodes(distribution_data, 'age', '>365*6')

### 2) Check weight

In [None]:
# Outliers with weight below 5.0 (unit not visible here - presumably kg, TODO confirm).
find_outlier_qrcodes(distribution_data, 'weight', '<5.0')

In [None]:
# Outliers with weight above 30.0 (unit not visible here - presumably kg, TODO confirm).
find_outlier_qrcodes(distribution_data, 'weight', '>30.0')

### 3) Check height

In [None]:
# Outliers with height below 40.0 (unit not visible here - presumably cm, TODO confirm).
find_outlier_qrcodes(distribution_data, 'height', '<40.0')

In [None]:
# Outliers with height above 150.0 (unit not visible here - presumably cm, TODO confirm).
# Runs on the merged per-scan table, like every other outlier check in this
# section, rather than on the raw artifact rows.
find_outlier_qrcodes(distribution_data, 'height', '>150.0')

## Check on the amount of artifacts and scans from the dataset

In [None]:
# Takes approx 12 min
# Recursively collect the path of every .pcd file below <DATASET_PATH>/qrcode.
pcd_pattern = str(DATASET_PATH / 'qrcode/**/*.pcd')
dataset = glob2.glob(pcd_pattern)
len(dataset)

In [None]:
# Compare the artifact rows in the CSV with the .pcd files found on disk.
csv_pointclouds = len(artifacts)
disk_pointclouds = len(dataset)
print("Total no. of pointclouds in csv:", csv_pointclouds)
print("Total no. of pointclouds in dataset:", disk_pointclouds)

In [None]:
## extracting the qrcodes
# The qrcode is the first directory level below <DATASET_PATH>/qrcode.
# It is derived relative to that root instead of using a fixed index into
# path.split('/'): for the current DATASET_PATH, index 4 is the literal
# 'qrcode' directory, and any fixed index breaks when the dataset root moves.
qrcode_root = DATASET_PATH / 'qrcode'
qrcodes_data = [Path(path).relative_to(qrcode_root).parts[0] for path in dataset]
qrc_dataframe = pd.DataFrame(qrcodes_data, columns=['qrcodes'])

In [None]:
# Compare the distinct qrcodes found on disk with the rows in scans.csv.
scans_on_disk = qrc_dataframe['qrcodes'].nunique(dropna=False)
print("Total no. of scans in dataset:", scans_on_disk)
print("Total no. of scans in csv:", len(scans))