In [6]:
import pandas as pd

In [7]:
filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/1980%202023%20average%20house%20prices.csv"

df = pd.read_csv(filepath)

# for excel files, pd.read_excel(filepath, sheet_name = ["sheet name 1", "sheet name 2"])
# dfs will be a dictionary where each key is the sheet name and each value is the df containing 
# the table for that sheet
# dfs = {"sheet 1": df1,
#        "sheet 2:df2"}

df

# if print(df), shows how it would look in the terminal of a normal .py file (rather than a ipynb notebook)
# can go to 'jupyter' tab of terminal and see variables stored. Can double click to view entire dataframe in another tab

# to read from local files, need to import your operating system using the line import os

Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types
0,United Kingdom,1980-01,10.11,19273,3.94,28.59
1,United Kingdom,1980-02,10.11,19273,3.94,28.59
2,United Kingdom,1980-03,10.11,19273,3.94,28.59
3,United Kingdom,1980-04,10.51,20044,4.00,24.15
4,United Kingdom,1980-05,10.51,20044,4.00,24.15
...,...,...,...,...,...,...
518,United Kingdom,2023-03,148.20,282548,-1.00,3.20
519,United Kingdom,2023-04,148.90,283871,0.50,2.50
520,United Kingdom,2023-05,149.50,285053,0.40,1.60
521,United Kingdom,2023-06,151.20,288281,1.10,1.90


In [8]:
# pd.to_datetime(df["Period"])
# remember case sensitive
# .to_datetime converts to date-time format
# want to write this to a column, not just view it
df["Period"] = pd.to_datetime(df["Period"], format="%Y-%m")
# uses American M-D-Y. Does its best to translate to dates but not perfect. 
# Can use dayfirst = True. But this time, year first so use format = 
# dates formatted as strf (https://strftime.org/)
df.info()
# can see that Period has been converted to datetime
df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 6 columns):
 #   Column                                          Non-Null Count  Dtype         
---  ------                                          --------------  -----         
 0   Name                                            523 non-null    object        
 1   Period                                          523 non-null    datetime64[ns]
 2   House price index All property types            523 non-null    float64       
 3   Average price All property types                523 non-null    int64         
 4   Percentage change (monthly) All property types  523 non-null    float64       
 5   Percentage change (yearly) All property types   523 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 24.6+ KB


Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types
0,United Kingdom,1980-01-01,10.11,19273,3.94,28.59
1,United Kingdom,1980-02-01,10.11,19273,3.94,28.59
2,United Kingdom,1980-03-01,10.11,19273,3.94,28.59
3,United Kingdom,1980-04-01,10.51,20044,4.00,24.15
4,United Kingdom,1980-05-01,10.51,20044,4.00,24.15
...,...,...,...,...,...,...
518,United Kingdom,2023-03-01,148.20,282548,-1.00,3.20
519,United Kingdom,2023-04-01,148.90,283871,0.50,2.50
520,United Kingdom,2023-05-01,149.50,285053,0.40,1.60
521,United Kingdom,2023-06-01,151.20,288281,1.10,1.90


In [14]:
import pandas as pd
child_identifiers_filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv"
child_identifiers = pd.read_csv(child_identifiers_filepath)
# child_identifiers
# child_identifiers.info()
# child_identifiers.head()
# child_identifiers.tail(10)

# could get rid of unnamed and empty rows, might want to change 
# birth date to date object, change gender from integer to object
child_identifiers["PersonBirthDate"] = pd.to_datetime(child_identifiers["PersonBirthDate"], format="%Y-%m-%d")
child_identifiers["ExpectedPersonBirthDate"] = pd.to_datetime(child_identifiers["ExpectedPersonBirthDate"], format="%Y-%m-%d")
child_identifiers["PersonDeathDate"] = pd.to_datetime(child_identifiers["PersonDeathDate"], format = "%Y-%m-%d", errors = "coerce")
# in the above one, had to add errors clause. Can either ask to ignore errors 
# (leave as not a date-time object) or coerce them (puts N/A in cell)
child_identifiers.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Unnamed: 0               332 non-null    int64         
 1   LAchildID                332 non-null    object        
 2   UPN                      332 non-null    object        
 3   FormerUPN                0 non-null      float64       
 4   UPNunknown               0 non-null      float64       
 5   PersonBirthDate          332 non-null    datetime64[ns]
 6   ExpectedPersonBirthDate  7 non-null      datetime64[ns]
 7   GenderCurrent            332 non-null    int64         
 8   PersonDeathDate          20 non-null     datetime64[ns]
dtypes: datetime64[ns](3), float64(2), int64(2), object(2)
memory usage: 23.5+ KB


In [None]:
house_prices_sum = df["Average price All property types"].sum()
print(f"The sum of all house prices across time is {house_prices_sum}.")
house_price_mean = df["Average price All property types"].mean()
house_price_median = df["Average price All property types"].median()
print(f"The mean house price since 1980 is: {int(house_price_mean)}. THe median house price since 1980 is {int(house_price_median)}.")

# int() changes data type from float to string
# .cumsum = cumulative sum
print(f"The lowest house price since 1980 was {df['Average price All property types'].min()}.")
print(f"The highest house price since 1980 was: {df['Average price All property types'].min()}")

#note different " and ' - because otherwise matches first " to second "

The sum of all house prices across time is 63058402.
The mean house price since 1980 is: 120570. THe median house price since 1980 is 97964.
The lowest house price since 1980 was 19273.
The highest house price since 1980 was: 19273


In [None]:
df["Age of data"] = pd.to_datetime("today") - df["Period"]
#to find age of data from today; recognises today in a string
# data type = timedelta. Good for calculations but we just want an integer for plotting etc.
# df["Age of data"] = df["Age of data"] / pd.Timedelta(1, "d") to get days
df["Age of data"] = df["Age of data"] / pd.Timedelta(365.25, "d")
# df["Age of data"] = df["Age of data"] / np.timedelta64(1, Y)
df["Age of data"] = df["Age of data"].astype("int")
df["Age of data"].info()


<class 'pandas.core.series.Series'>
RangeIndex: 523 entries, 0 to 522
Series name: Age of data
Non-Null Count  Dtype
--------------  -----
523 non-null    int64
dtypes: int64(1)
memory usage: 4.2 KB


In [None]:
# use child_identifiers['PersonBirthDate'] to calculate the age of every child in a new column

child_identifiers["PersonBirthDate"] = pd.to_datetime(child_identifiers["PersonBirthDate"], format="%Y-%m-%d")
child_identifiers["Child_Age"] = pd.to_datetime("today") - child_identifiers["PersonBirthDate"]
# if wanted a specific date, can do ("01/04/2024", format = "%d/%m/%Y")
child_identifiers["Child_Age"] = child_identifiers["Child_Age"] / pd.Timedelta(365.25, "d")

child_identifiers["Child_Age"] = child_identifiers["Child_Age"].astype("int")
child_identifiers


Unnamed: 0.1,Unnamed: 0,LAchildID,UPN,FormerUPN,UPNunknown,PersonBirthDate,ExpectedPersonBirthDate,GenderCurrent,PersonDeathDate,Child_Age
0,0,RND000215205141,A850728973744,,,2019-12-06,NaT,1,,4
1,1,RND000824303014,A141396438491,,,2011-04-27,NaT,9,,13
2,2,RND000750143123,A929946861554,,,2017-06-06,2019-12-06,1,,7
3,3,RND000909164501,A612330267292,,,2014-10-03,NaT,0,,9
4,4,RND000382171815,A604459366806,,,2019-09-25,NaT,2,,4
...,...,...,...,...,...,...,...,...,...,...
327,327,RND000112711501,A465246916125,,,2010-07-07,NaT,2,,13
328,328,RND000513120794,A540014111973,,,2018-08-14,NaT,2,,5
329,329,RND000541643134,A549582689058,,,2021-12-09,NaT,51,,2
330,330,RND000404939452,A889492349196,,,2013-07-23,NaT,2,,10


In [None]:
above_average_price = df["Average price All property types"] > df["Average price All property types"].mean()
# above_average_price
above_av_df = df[above_average_price]
above_av_df
#written logical condition, creates new dataframe where logical condition is "true"
# CIN validation tool makes extensive use of these kind of logical conditions

Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types,Age of data
274,United Kingdom,2002-11-01,63.83,121700,1.01,24.23,21
275,United Kingdom,2002-12-01,65.95,125747,3.33,28.36,21
276,United Kingdom,2003-01-01,65.47,124836,-0.72,27.88,21
277,United Kingdom,2003-02-01,64.78,123521,-1.05,22.10,21
278,United Kingdom,2003-03-01,66.16,126152,2.13,20.48,21
...,...,...,...,...,...,...,...
518,United Kingdom,2023-03-01,148.20,282548,-1.00,3.20,1
519,United Kingdom,2023-04-01,148.90,283871,0.50,2.50,1
520,United Kingdom,2023-05-01,149.50,285053,0.40,1.60,1
521,United Kingdom,2023-06-01,151.20,288281,1.10,1.90,1


In [16]:
# Merging dataframes
# list of dictionaries rather than dictionary of lists

df_1 = pd.DataFrame({'ChildId':['id1', 'id2', 'id3', 'id4', 'id5'],
                   'Age first contact':[6,12,11,1,19],
                   'Gender':['M','m', 'F', '', 'F' ],
                   'Birthday':['01/01/2002', '02/02/2003', pd.NA, '03/03/2023', '06/01/2012'],
                   'CP Plan?':['Y', 'n', 'N', 'No', 'yES'],})

nhs_numbers = pd.DataFrame([
                            {'ChildId':'id1',
                            'NHS Number': '303',},
                            {'ChildId':'id3',
                            'NHS Number': 'gqw3',},
                            {'ChildId':'id2',
                            'NHS Number': '3u5029',},
                            {'ChildId':'id4',
                            'NHS Number': 'avsgvb',},
                            {'ChildId':'id5',
                            'NHS Number': 'varwvw',},
                            ])

child_info = pd.merge(df_1, nhs_numbers, left_on='ChildId', right_on='ChildId', how='left')

child_info

Unnamed: 0,ChildId,Age first contact,Gender,Birthday,CP Plan?,NHS Number
0,id1,6,M,01/01/2002,Y,303
1,id2,12,m,02/02/2003,n,3u5029
2,id3,11,F,,N,gqw3
3,id4,1,,03/03/2023,No,avsgvb
4,id5,19,F,06/01/2012,yES,varwvw
