# UK Inflation (CPIH Annual Rate by Month)

In [2]:
import pandas as pd
# Location of the raw CPIH annual-rate workbook, relative to this notebook.
RAW_CPIH_PATH = '../../Datasets/01_raw_data_files/CPIannualrate.xls'

# Load the workbook as-is; all cleaning is done in the cells below.
inflation_df = pd.read_excel(RAW_CPIH_PATH)

## Data Cleaning:

In [3]:
# Visualise the raw data frame exactly as loaded, to see which rows hold
# metadata and how the period labels in 'Title' are written.
inflation_df

Unnamed: 0,Title,CPIH ANNUAL RATE 00: ALL ITEMS 2015=100
0,CDID,L55O
1,Source dataset ID,MM23
2,PreUnit,
3,Unit,%
4,Release date,19-07-2023
...,...,...
588,2023 FEB,9.2
589,2023 MAR,8.9
590,2023 APR,7.8
591,2023 MAY,7.9


In [4]:
# Brief overview of the dataset: row count, non-null counts and the dtype of
# each column (used below to decide which conversions are needed).
inflation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 2 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Title                                    593 non-null    object
 1   CPIH ANNUAL RATE 00: ALL ITEMS 2015=100  591 non-null    object
dtypes: object(2)
memory usage: 9.4+ KB


### Problems to be cleaned:
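* Rows 0-6 hold metadata about the series (e.g. CDID, unit, release date), not observations, so they need to be dropped
* Both columns have the `object` data type; they need to be converted to a date and a number
* The dataset contains yearly, quarterly and monthly data (we only want monthly)
* We are only focusing on January 2006 to December 2022
* Rename the column headings to be more concise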

In [5]:
# Drop rows 0 to 6 (the non-data rows at the top of the sheet) in a single call.
# DataFrame.drop returns a new frame, so the raw `inflation_df` is left untouched
# and no in-place mutation is needed. The remaining rows keep their original
# index labels (7, 8, ...).
N_METADATA_ROWS = 7
inflation_df2 = inflation_df.drop(index=list(range(N_METADATA_ROWS)))

In [6]:
# Check the drop worked: the first remaining rows should be data rows,
# still carrying their original index labels.
inflation_df2.head()

Unnamed: 0,Title,CPIH ANNUAL RATE 00: ALL ITEMS 2015=100
7,1989,5.7
8,1990,8.0
9,1991,7.5
10,1992,4.6
11,1993,2.6


In [7]:
# Remove all rows outside of 2006 Jan - 2022 Dec, keeping monthly rows only.
# Rows are selected by their 'Title' label ("YYYY MON", e.g. "2006 JAN") rather than
# by hard-coded positions, so the selection stays correct when a newer release of the
# workbook adds rows. Yearly ("1989") and other non-monthly labels do not match the
# pattern, so their extracted year is missing and they are excluded.
START_YEAR = 2006
END_YEAR = 2022

title_labels = inflation_df2['Title'].astype(str).str.strip()
title_parts = title_labels.str.extract(r'^(?P<year>\d{4}) (?P<month>[A-Z]{3})$')
label_year = pd.to_numeric(title_parts['year'])
inflation_df2 = inflation_df2[label_year.between(START_YEAR, END_YEAR)]

# Sanity check: every month of every year in the period must be present exactly once.
expected_rows = (END_YEAR - START_YEAR + 1) * 12
assert len(inflation_df2) == expected_rows, (
    f"expected {expected_rows} monthly rows, found {len(inflation_df2)}"
)

# Reset index accordingly and check correct time frame has been selected for the new dataframe
inflation_df2 = inflation_df2.reset_index(drop = True)
inflation_df2

Unnamed: 0,Title,CPIH ANNUAL RATE 00: ALL ITEMS 2015=100
0,2006 JAN,2.2
1,2006 FEB,2.2
2,2006 MAR,2
3,2006 APR,2.2
4,2006 MAY,2.4
...,...,...
199,2022 AUG,8.6
200,2022 SEP,8.8
201,2022 OCT,9.6
202,2022 NOV,9.3


In [8]:
# Give both columns short, descriptive names (old heading -> new heading).
column_names = {
    'Title': 'Date',
    'CPIH ANNUAL RATE 00: ALL ITEMS 2015=100': 'CPIH Annual Rate',
}
inflation_df2 = inflation_df2.rename(columns=column_names)

In [9]:
# Convert data in 'Date' column from an object to a date.
# The labels look like "2006 JAN", so the format is given explicitly ('%Y %b' =
# four-digit year, abbreviated month name) instead of leaving pandas to guess it
# element by element. Each label becomes the first day of its month.
inflation_df2['Date'] = pd.to_datetime(inflation_df2['Date'], format='%Y %b')

# Convert data in 'CPIH Annual Rate' column from an object to a float
inflation_df2['CPIH Annual Rate'] = pd.to_numeric(inflation_df2['CPIH Annual Rate'])

In [10]:
# Check data types are correct: 'Date' should now be a datetime column and
# 'CPIH Annual Rate' a float column.
inflation_df2.dtypes

Date                datetime64[ns]
CPIH Annual Rate           float64
dtype: object

In [11]:
# Visualise the clean data frame (renamed columns, converted dtypes,
# index reset to start at 0).
inflation_df2

Unnamed: 0,Date,CPIH Annual Rate
0,2006-01-01,2.2
1,2006-02-01,2.2
2,2006-03-01,2.0
3,2006-04-01,2.2
4,2006-05-01,2.4
...,...,...
199,2022-08-01,8.6
200,2022-09-01,8.8
201,2022-10-01,9.6
202,2022-11-01,9.3


### Descriptive Analysis:

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the cleaned frame.
inflation_df2.describe()

Unnamed: 0,CPIH Annual Rate
count,204.0
mean,2.493137
std,1.70278
min,0.2
25%,1.6
50%,2.3
75%,2.8
max,9.6


* The mean CPIH annual rate over the 204 months from January 2006 to December 2022 is 2.49%
* The minimum CPIH annual rate is 0.2%
* The maximum CPIH annual rate is 9.6%

In [None]:
# Save the cleaned dataframe as a .csv
# Left commented out so that running the notebook does not write a file;
# uncomment the line below to export.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default - consider passing index=False when enabling this.
# inflation_df2.to_csv('monthly_cpih.csv')