In [1]:
# Remember to pip install pytest

import pandas as pd
from utils import read_cin_data
import pytest

collection_start = pd.to_datetime("01/04/2021", dayfirst=True)
collection_end = pd.to_datetime("31/03/2022", dayfirst=True)

(
    ChildCharacteristics,
    ChildIdentifiers,
    ChildProtectionPlans, # No data in here
    CINdetails,
    CINplanDates, # No data in here
    Header,
    Reviews, # No data in here
    Section47, # No data in here
) = read_cin_data()

We'll try rules as simply doing slices before we try doing them as functions, and before we write tests for them

# Rules to try:
- rule 4180: GenderCurrent (N00097) must be present and valid
- rule 4220: Ethnicity (N00177) must be present and a valid code
- rule 100: ReferenceDate (N00603) must be present and must equal 2022-03-31
- rule 1520: Each pupil UPN (N00001) must be unique across all pupils in the extract
- rule 8510: Each LAchildID (N00097) must be unique across all children within the same LA return. 
- rule 1540: If UPN (N00001) present Characters 5-12 of UPN must be numeric
- rule 8520: PersonBirthDate (N00066) must be on or before ReferenceDate (N00603) or null
- rule 8525Q: Either Date of Birth or Expected Date of Birth must be provided (but not both)
- rule 8600: Child referral date missing or after data collection period

# Harder but better practice (eg needing 2 or more tables):
- rule 8606: CINreferralDate (N00100) cannot be more than 280 days before PersonBirthDate (N00066) or ExpectedPersonBirthDate

# The following are great to do, but can only really be done with appropriate test_xxxx functions as there isn't sufficeint fake data to do them otherwise

In [9]:
# Lets do 4180 as an example.
# For the cin validator, we find failing rows and return the index locations of those rows as a list
# 1, 2, 0, 9

# ==
# >= <=
# < >
# !=
# |, &
# ~

df = ChildIdentifiers.copy()

allowed_genders = ["1", "2", "0", "9"]

condition = df['GenderCurrent'].isin(allowed_genders)

failing_rows = df[~condition].index

failing_indices = list(failing_rows)

failing_indices


[9, 21, 32, 45]

In [11]:
# rule 4220: Ethnicity (N00177) must be present and a valid code
eth_list = [
    "ABAN",
    "AIND",
    "AOTH",
    "APKN",
    "BAFR",
    "BCRB",
    "BOTH",
    "CHNE",
    "MOTH",
    "MWAS",
    "MWBA",
    "MWBC",
    "NOBT",
    "OOTH",
    "REFU",
    "WBRr",
    "WIRI",
    "WIRT",
    "WOTH",
    "WROM",
]

df = ChildCharacteristics.copy()

failing_indices = df[~(df['Ethnicity'].isin(eth_list))].index

failing_indices = list(failing_indices)

failing_indices

[24, 63, 78, 87, 89, 108, 221, 267, 268, 279, 308, 331]

In [14]:
# Either Date of Birth or Expected Date of Birth must be provided (but not both)
# .notna() checks there is not a na/ther eis a value
# .isna() checks there is a na/ there isn't a value
# | at least one is true
# & both are true

df = ChildIdentifiers.copy()

has_bd = df['PersonBirthDate'].notna()
has_ebd = df['ExpectedPersonBirthDate'].notna()

mega_condition = (has_bd | has_ebd) & ~(has_bd & has_ebd)

failing_row = df[~mega_condition]

failing_row




Unnamed: 0.1,Unnamed: 0,LAchildID,UPN,FormerUPN,UPNunknown,PersonBirthDate,ExpectedPersonBirthDate,GenderCurrent,PersonDeathDate
0,0,RND000215205141,A850728973744,,,NaT,NaT,1,NaT
1,1,RND000824303014,A141396438491,,,NaT,NaT,9,NaT
2,2,RND000750143123,A929946861554,,,NaT,NaT,1,NaT
3,3,RND000909164501,A612330267292,,,NaT,NaT,0,NaT
4,4,RND000382171815,A604459366806,,,NaT,NaT,2,NaT
...,...,...,...,...,...,...,...,...,...
327,327,RND000112711501,A465246916125,,,NaT,NaT,2,NaT
328,328,RND000513120794,A540014111973,,,NaT,NaT,2,NaT
329,329,RND000541643134,A549582689058,,,NaT,NaT,1,NaT
330,330,RND000404939452,A889492349196,,,NaT,NaT,2,NaT


Lets rewrite these as fucntions so we can pop them inside pytests and so it's more relevant to the validator code. We will do this in a test_XXXX.py file.