# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import time
from datasets import *

# Data loading

In [2]:
# Load the 2016 dogs dataset through the project loader (presumably provided
# by the `datasets` star import above — confirm). fixErrors=False presumably
# keeps the loader from correcting known errors, so the consistency checks
# further down see the data as recorded — TODO confirm against the loader.
data = load_df_dogs_2016(fixErrors=False)
# Last expression of the cell: displays the (rows, columns) tuple.
data.shape

(161, 31)

In [3]:
# Preview the first five rows to eyeball column names and value formats.
data.head(5)

Unnamed: 0,Folder,IP,IP Gravity,Vrig Tric,Birth date,First visit,Age,Therapy started,Dead,Date of death,...,Asx/Ao,E,E/A,FE %,FS %,EDVI,ESVI,Allo diast,Allo sist,Therapy to visit
0,S0601,0,0,0.0,810770000.0,1137110000.0,10.347945,1137110000.0,0.0,1137110000.0,...,2.495146,1.15,0.974576,70.0,37.0,78.686406,23.746696,1.584502,0.988779,0
1,C0621,0,0,0.0,734825000.0,1214340000.0,15.205479,1214340000.0,1.0,1243810000.0,...,1.609023,1.5,1.470588,83.0,51.0,130.584237,23.876482,1.942761,0.986143,0
2,B0918,0,0,0.0,820451000.0,1317770000.0,15.769863,1317770000.0,1.0,1327880000.0,...,2.504202,1.87,2.077778,91.0,62.0,180.348219,16.221999,2.213868,0.841053,0
3,R1009,0,0,0.0,909875000.0,1336950000.0,13.542466,1336950000.0,1.0,1344980000.0,...,3.071429,2.28,2.85,75.0,44.0,234.165258,40.151793,2.094778,1.163766,0
4,R1513,0,0,0.0,1092090000.0,1427840000.0,10.720548,1412110000.0,0.0,1485390000.0,...,2.536,0.75,0.862069,69.0,38.0,139.835879,43.88668,1.981413,1.230353,182


# Consistency checks

In [4]:
# Chronology check on the date columns: for every row we expect
# birth <= first visit <= therapy start <= death.
print("Verifying dates consistency\n")

born, seen = data["Birth date"], data["First visit"]
treated, died = data["Therapy started"], data["Date of death"]

# Column-wise comparisons; a NaN on either side makes a comparison False,
# exactly as it would in a scalar comparison.
chainOk = (born <= seen) & (seen <= treated) & (treated <= died)
birthTooLate = (born > seen) | (born > treated) | (born > died)
deathTooEarly = (died < born) | (died < seen) | (died < treated)
therapyBeforeVisit = seen > treated

# Plain ints so the counters behave like ordinary Python counters.
nErrors = int((~chainOk).sum())
nWrongBirth = int(birthTooLate.sum())
nWrongDeath = int(deathTooEarly.sum())
nTherapyIncons = int(therapyBeforeVisit.sum())

print("Total errors: %d\nInconsistent birth dates: %d\nInconsistent death dates: %d\nTherapy started before first visit: %d" % (nErrors, nWrongBirth, nWrongDeath, nTherapyIncons))

Verifying dates consistency

Total errors: 42
Inconsistent birth dates: 0
Inconsistent death dates: 0
Therapy started before first visit: 42


All detected inconsistencies concern therapies that started before the first visit. These may be therapies that were begun at other clinics, so they are not necessarily errors. However, the current survival time measure does not take this into account, and it may be useful to try computing one that does.

In [5]:
# Cross-check the "Survival time" column against the number of days between
# the first visit and the date of death.
print("Verifying consistency in survival time (calculated from first visit)\n")

SECONDS_PER_DAY = 3600*24


def _as_day(timestamp):
    """Format a timestamp (seconds since the epoch) as a local-time YYYY-MM-DD string."""
    return time.strftime('%Y-%m-%d', time.localtime(timestamp))


nErrors = 0
totDaysError = 0
for i, fvisit, death, survtime in zip(data.index, data["First visit"], data["Date of death"], data["Survival time"]):
    visitsurv = (death - fvisit) / SECONDS_PER_DAY
    # One day of slack on either side so rounding differences are not flagged.
    tooShort = visitsurv < survtime - 1
    tooLong = visitsurv > survtime + 1
    if tooShort or tooLong:
        print("Inconsistency in row %d.\nFirst visit: %s, Death: %s, ComputedSurvTime: %d, DataSurvTime: %d" % (i, _as_day(fvisit), _as_day(death), visitsurv, survtime))
        nErrors += 1
        totDaysError += (visitsurv - survtime)

# Signed mean difference over the flagged rows; 0 when nothing was flagged.
avgDaysApart = totDaysError / nErrors if nErrors != 0 else 0
print("\nTotal errors: %d\nAverage days apart: %d" % (nErrors, avgDaysApart))

Verifying consistency in survival time (calculated from first visit)

Inconsistency in row 11.
First visit: 2011-06-08, Death: 2013-06-27, ComputedSurvTime: 750, DataSurvTime: 1115

Total errors: 1
Average days apart: -365


Survival time values were written by hand. The recorded value differs from the computed one by about 365 days, so the detected error was probably the result of misreading the year of an entry.

In [6]:
# Sanity check: a row whose "MC" flag is set (treated by this check as a
# cardiac arrest) must also have its "Dead" flag set.
nErrors = 0
print("Verifying that Cardiac Arrest implies Death\n")

for mc, dead in zip(data["MC"], data["Dead"]):
    # NaN is truthy in Python, so a missing MC value would otherwise be
    # counted as a cardiac arrest. Missing data is not an inconsistency.
    if pd.isna(mc):
        continue
    # A missing "Dead" value is not counted either: `not nan` is False.
    if mc and not dead:
        nErrors += 1

print("Number of cardiac arrest inconsistencies: %d" % nErrors)

Verifying that Cardiac Arrest implies Death

Number of cardiac arrest inconsistencies: 0


In [7]:
# The "Therapy Category" value is expected to equal the sum of the four
# prescription columns.
print("Verifying consistency of the Therapy category in relation to prescriptions\n")

# Explicit left-to-right addition per row, so a missing value propagates as
# NaN and the row is reported as a mismatch.
prescriptionTotals = (
    furosemide + acei + pimobendan + spironolactone
    for furosemide, acei, pimobendan, spironolactone in zip(
        data["Furosemide"], data["Ache-i"], data["Pimobendan"], data["Spironolattone"]
    )
)

nErrors = sum(
    1
    for therapy, nPrescr in zip(data["Therapy Category"], prescriptionTotals)
    if not therapy == nPrescr
)

print("Number of Therapy Category inconsistencies: %d" % nErrors)

Verifying consistency of the Therapy category in relation to prescriptions

Number of Therapy Category inconsistencies: 0


In [8]:
# Compare the stored "Age" with the age implied by the birth date and the
# first-visit date.
print("Verifying consistency of Age\n")

SECONDS_PER_YEAR = 3600*24*365  # 365-day years; leap days are ignored

nErrors = 0
totYearsError = 0
for birth, fvisit, age in zip(data["Birth date"], data["First visit"], data["Age"]):
    realage = (fvisit - birth) / SECONDS_PER_YEAR
    # Half a year of slack either way so rounding differences are not flagged.
    tooLow = realage < age - 0.5
    tooHigh = realage > age + 0.5
    if not (tooLow or tooHigh):
        continue
    nErrors += 1
    totYearsError += (realage - age)

# Signed mean difference over the flagged rows; 0 when nothing was flagged.
if nErrors != 0:
    avgYearsApart = totYearsError / nErrors
else:
    avgYearsApart = 0
print("Total errors: %d\nAverage years apart: %f" % (nErrors, avgYearsApart))

Verifying consistency of Age

Total errors: 50
Average years apart: 0.877043
