In [1]:
# Recap
import pandas as pd
import os # used to access files on local system

# The r prefix makes these raw string literals: backslashes are kept as-is rather than
# being treated as escape sequences.
ci_filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv"
cc_filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildCharacteristics.csv"

# read_csv is given each URL directly and returns one DataFrame per file
identifiers = pd.read_csv(ci_filepath)
characteristics = pd.read_csv(cc_filepath)

 

In [2]:
# Quick ways to inspect a DataFrame (uncomment to try):
#   identifiers.info()        - column names, non-null counts and dtypes
#   characteristics.info()
#   identifiers.head()        - top x rows, 5 by default

# Convert the text date columns into new datetime columns (originals are kept)
iso_format = "%Y-%m-%d"
strict_date_columns = {
    'PersonBirthDate': 'PersonBirthDate_dt',
    'ExpectedPersonBirthDate': 'ExpectedPersonBirthDate_dt',
}
for source_column, parsed_column in strict_date_columns.items():
    # No errors= argument here, so a value that does not match the format raises
    identifiers[parsed_column] = pd.to_datetime(identifiers[source_column], format=iso_format)

# errors='coerce' returns a null (NaT) for values that cannot be parsed instead of raising
identifiers['PersonDeathDate_dt'] = pd.to_datetime(
    identifiers['PersonDeathDate'],
    format=iso_format,
    errors='coerce',
)

identifiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  332 non-null    int64         
 1   LAchildID                   332 non-null    object        
 2   UPN                         332 non-null    object        
 3   FormerUPN                   0 non-null      float64       
 4   UPNunknown                  0 non-null      float64       
 5   PersonBirthDate             332 non-null    object        
 6   ExpectedPersonBirthDate     7 non-null      object        
 7   GenderCurrent               332 non-null    int64         
 8   PersonDeathDate             21 non-null     object        
 9   PersonBirthDate_dt          332 non-null    datetime64[ns]
 10  ExpectedPersonBirthDate_dt  7 non-null      datetime64[ns]
 11  PersonDeathDate_dt          20 non-null     datetime64[ns]

In [3]:
# Making a DataFrame from scratch - sometimes it makes sense to hardcode one, e.g. a lookup table

# Method 1 - dictionary of lists: each key is a column name, each list holds that column's values
child_columns = {
    'ChildId': ['id1', 'id2', 'id3', 'id4', 'id5'],
    'Age at first contact': [6, 12, 11, 1, 19],
    'SEX': ['M', 'm', 'F', '', 'F'],
    'Birthday': ['01/01/2002', '02/02/2003', '01/01/2000', '03/03/2023', '06/01/2012'],
    'CP Plan?': ['Y', 'n', 'N', 'No', 'yES'],
}
child_info = pd.DataFrame(child_columns)

child_info

# Method 2 - list of dictionaries: each dictionary is one row, keyed by column name
nhs_records = [
    {'ChildId': 'id1', 'NHS Number': '303'},
    {'ChildId': 'id2', 'NHS Number': '3u5029'},
    {'ChildId': 'id3', 'NHS Number': 'gqw3'},
    {'ChildId': 'id4', 'NHS Number': 'avsgvb'},
    {'ChildId': 'id5', 'NHS Number': 'varwvw'},
]
nhs_numbers = pd.DataFrame(nhs_records)

nhs_numbers


Unnamed: 0,ChildId,NHS Number
0,id1,303
1,id2,3u5029
2,id3,gqw3
3,id4,avsgvb
4,id5,varwvw


In [4]:
# Calculating ages
# Only relativedelta is needed from dateutil, so import just that one name
from dateutil.relativedelta import relativedelta

# The Birthday strings are day-first (dd/mm/yyyy), hence dayfirst=True
child_info['Birthday_dt'] = pd.to_datetime(child_info['Birthday'], dayfirst=True)

# Take "today" once, outside the lambda, so every row is measured against the same
# reference date and the value is not rebuilt for each row.
today = pd.to_datetime("today")

# Age has to be worked out value by value: .apply on a column passes each birthday
# (a single Timestamp, not a whole row) to the function; .years is the whole-year part
# of the difference.
child_info['Age'] = child_info['Birthday_dt'].apply(lambda birthday: relativedelta(today, birthday).years)

child_info

Unnamed: 0,ChildId,Age at first contact,SEX,Birthday,CP Plan?,Birthday_dt,Age
0,id1,6,M,01/01/2002,Y,2002-01-01,23
1,id2,12,m,02/02/2003,n,2003-02-02,22
2,id3,11,F,01/01/2000,N,2000-01-01,25
3,id4,1,,03/03/2023,No,2023-03-03,2
4,id5,19,F,06/01/2012,yES,2012-01-06,13


In [5]:
# Exercise 1 - add an Age column to the identifiers DataFrame
# Take "today" once so every row is measured against the same reference date
# (and it is not rebuilt for each row inside the lambda).
today = pd.to_datetime("today")
# Each birth date (a single Timestamp) is passed to the lambda; .years is the whole-year difference
identifiers['Age'] = identifiers['PersonBirthDate_dt'].apply(lambda birth_date: relativedelta(today, birth_date).years)
identifiers

Unnamed: 0.1,Unnamed: 0,LAchildID,UPN,FormerUPN,UPNunknown,PersonBirthDate,ExpectedPersonBirthDate,GenderCurrent,PersonDeathDate,PersonBirthDate_dt,ExpectedPersonBirthDate_dt,PersonDeathDate_dt,Age
0,0,RND000215205141,A850728973744,,,2019-12-06,,1,,2019-12-06,NaT,NaT,5
1,1,RND000824303014,A141396438491,,,2011-04-27,,9,,2011-04-27,NaT,NaT,14
2,2,RND000750143123,A929946861554,,,2017-06-06,2019-12-06,1,,2017-06-06,2019-12-06,NaT,8
3,3,RND000909164501,A612330267292,,,2014-10-03,,0,,2014-10-03,NaT,NaT,11
4,4,RND000382171815,A604459366806,,,2019-09-25,,2,,2019-09-25,NaT,NaT,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,327,RND000112711501,A465246916125,,,2010-07-07,,2,,2010-07-07,NaT,NaT,15
328,328,RND000513120794,A540014111973,,,2018-08-14,,2,,2018-08-14,NaT,NaT,7
329,329,RND000541643134,A549582689058,,,2021-12-09,,51,,2021-12-09,NaT,NaT,3
330,330,RND000404939452,A889492349196,,,2013-07-23,,2,,2013-07-23,NaT,NaT,12


In [6]:
# Reading Excel files (a workbook can contain more than one sheet)

# pandas does not ship with an Excel reading engine; the openpyxl package has to be
# installed separately (e.g. from the terminal) before .xlsx files can be read.
# NOTE(review): nothing in this cell calls openpyxl directly - the import appears to act
# only as a check that the package is installed.
import openpyxl
filepath = r"https://github.com/data-to-insight/ERN-sessions/raw/main/data/small%20excel.xlsx"


# By default read_excel loads only the first sheet; sheet_name=None loads every sheet
# and returns a dictionary of DataFrames keyed by sheet name.
df_dict = pd.read_excel(filepath, sheet_name=None) # None imports all sheets

In [7]:
# Joining tables - can do more complex joins than PBI (and quicker)
print(child_info.columns)
print(nhs_numbers.columns)

# Left join: keep every row of child_info and pull in the matching NHS Number by ChildId.
# This cell overwrites child_info with the merged result, so first drop any 'NHS Number'
# column left by an earlier run; otherwise re-running the cell would produce
# 'NHS Number_x' / 'NHS Number_y' duplicates. errors='ignore' makes the drop a no-op on
# the first run, when the column does not exist yet.
child_info = (
    child_info
    .drop(columns='NHS Number', errors='ignore')
    .merge(nhs_numbers, on='ChildId', how='left')  # key has the same name in both tables
)
child_info

Index(['ChildId', 'Age at first contact', 'SEX', 'Birthday', 'CP Plan?',
       'Birthday_dt', 'Age'],
      dtype='object')
Index(['ChildId', 'NHS Number'], dtype='object')


Unnamed: 0,ChildId,Age at first contact,SEX,Birthday,CP Plan?,Birthday_dt,Age,NHS Number
0,id1,6,M,01/01/2002,Y,2002-01-01,23,303
1,id2,12,m,02/02/2003,n,2003-02-02,22,3u5029
2,id3,11,F,01/01/2000,N,2000-01-01,25,gqw3
3,id4,1,,03/03/2023,No,2023-03-03,2,avsgvb
4,id5,19,F,06/01/2012,yES,2012-01-06,13,varwvw


In [8]:
# Exercise 2 - join identifiers
# Earlier attempts, kept for reference (uncomment one to try it):
#   print(characteristics.columns)
#   print(identifiers.columns)
#
#   child_info = pd.merge(characteristics, identifiers, left_on='LAchildID', right_on="LAchildID", how='left')
#   child_info
#
#   # simpler method to join
#   identifiers = identifiers.merge(characteristics, on=['LAchildID', 'Unnamed: 0'], how='left')
#
#   identifiers = identifiers.merge(characteristics, on='LAchildID', how='left', suffixes=['_ident', '_char'])
#   # to stop duplicating columns
#   identifiers = identifiers.merge(characteristics, on=['LAchildID', 'Unnamed: 0'], how='left')
#   identifiers

# Merging 3 tables - the sheets held in df_dict (df_dict.keys() lists the sheet names).
# Each left join keeps every row built up so far and adds the matching columns from the next sheet.
df = (
    df_dict['Child Data']
    .merge(df_dict['Child extras'], on='ChildID', how='left')
    .merge(df_dict['Sheet3'], on='Assessment Code', how='left')
)

df

Unnamed: 0,ChildID,Sex,DOB,EHCP issued,Assessment Code,Assessment Outcome
0,id1,M,2014-12-05,N,1,Y
1,id6,m,2016-01-06,n,5,y
2,id4,m,2013-11-09,Y,3,N
3,id3,F,2022-12-24,Y,4,n
4,id5,F,2020-01-15,,2,N
5,id7,f,2022-09-17,y,6,N


In [None]:
# Simple logical conditions
# Comparing a column with a value gives one True/False per row; that result is the
# condition to be tested.
under_5_cond = child_info['Age'].lt(5)
under_5_df = child_info.loc[under_5_cond]

# ~ negates the condition, keeping every row where "Age < 5" was False.
# Need to be careful using this: that is "5 or over" (a child aged exactly 5 is included),
# not strictly "over 5".
over_5_df = child_info.loc[~under_5_cond]
over_5_df

Unnamed: 0,ChildId,Age at first contact,SEX,Birthday,CP Plan?,Birthday_dt,Age,NHS Number
0,id1,6,M,01/01/2002,Y,2002-01-01,23,303
1,id2,12,m,02/02/2003,n,2003-02-02,22,3u5029
2,id3,11,F,01/01/2000,N,2000-01-01,25,gqw3
4,id5,19,F,06/01/2012,yES,2012-01-06,13,varwvw


In [20]:
# Multiple conditions: | is "or", & is "and"
# Aged 5 or over and under 20
over_5 = child_info['Age'].ge(5)
under_20 = child_info['Age'].lt(20)
# print(over_5)
# print(under_20)

# & combines the two conditions row by row: a row is kept only when both are True
condition = over_5 & under_20
between_5_20 = child_info.loc[condition]

between_5_20





Unnamed: 0,ChildId,Age at first contact,SEX,Birthday,CP Plan?,Birthday_dt,Age,NHS Number
4,id5,19,F,06/01/2012,yES,2012-01-06,13,varwvw
