# Data Exploration notebook


In [1]:
import re
import os
import csv
import numpy as np
import pandas as pd
import plotly.express as px

## Correlations between inflation rate and national CPI

### Background Code

In [2]:
INFLATION_FILEPATH = "Processed Datasets/inflation.csv"
INTEREST_FILEPATH = "Processed Datasets/interest.csv"
PROCESSED_OUTPUT_DIR = 'Processed Datasets/'
PROCESSED_DATASET_SPLIT_STAGE_DIR = PROCESSED_OUTPUT_DIR + 'reformatted/'

In [3]:
inflation_data = pd.read_csv(INFLATION_FILEPATH, index_col=0)
nationwide_cpi_data = pd.read_csv(PROCESSED_DATASET_SPLIT_STAGE_DIR + "PHILIPPINES.csv", index_col=0)
interest_data = pd.read_csv(INTEREST_FILEPATH, index_col=0)

In [4]:
nationwide_cpi_data['Date'] = pd.to_datetime(nationwide_cpi_data['Date'], format='%Y-%m')
nationwide_cpi_data['Year'] = nationwide_cpi_data['Date'].dt.year
nationwide_cpi_data['Month'] = nationwide_cpi_data['Date'].dt.month
nationwide_cpi_data = nationwide_cpi_data[nationwide_cpi_data["Year"] > 2018]

print(nationwide_cpi_data)

         Date  Food_and_drinks  Alcohol_and_tobacco  \
12 2019-01-01            102.0                105.2   
13 2019-02-01            101.4                106.9   
14 2019-03-01            100.7                108.1   
15 2019-04-01            100.7                108.5   
16 2019-05-01            100.9                108.8   
..        ...              ...                  ...   
76 2024-05-01            128.7                176.8   
77 2024-06-01            129.5                177.0   
78 2024-07-01            130.4                177.0   
79 2024-08-01            130.4                177.2   
80 2024-09-01            129.8                177.5   

    Household_consumable_goods  Medication  Stationery  Year  Month  
12                       102.0       101.5       102.0  2019      1  
13                       102.2       101.6       102.1  2019      2  
14                       102.5       101.9       102.3  2019      3  
15                       102.7       102.1       102.6  201

In [5]:
interest_data = interest_data[interest_data["Year"] > 2018]
interest_data

Unnamed: 0,Date,Year,Month,Interest_rate
3,2019-01-01,2019,1,4.625000
4,2019-02-01,2019,2,4.666667
5,2019-03-01,2019,3,4.708333
6,2019-04-01,2019,4,4.750000
7,2019-05-01,2019,5,4.625000
...,...,...,...,...
67,2024-05-01,2024,5,6.000000
68,2024-06-01,2024,6,6.000000
69,2024-07-01,2024,7,6.000000
70,2024-08-01,2024,8,6.000000


In [6]:
inflation_data = inflation_data.sort_values(by=['Year', 'Month'], ascending=[True, True])
inflation_data

Unnamed: 0,Year,Month,Rate
0,2019,1,4.4
6,2019,2,3.8
12,2019,3,3.4
18,2019,4,3.2
24,2019,5,3.2
...,...,...,...
29,2024,5,3.9
35,2024,6,3.7
41,2024,7,4.4
47,2024,8,3.3


In [7]:
merged_data = pd.merge(nationwide_cpi_data, inflation_data, how='inner', left_on=['Year', 'Month'], right_on=['Year', 'Month'])
print(merged_data.isnull().sum())

Date                          0
Food_and_drinks               0
Alcohol_and_tobacco           0
Household_consumable_goods    0
Medication                    0
Stationery                    0
Year                          0
Month                         0
Rate                          0
dtype: int64


In [8]:
merged_data['Interest_rate'] = interest_data['Interest_rate']
print(merged_data.isnull().sum())

Date                          0
Food_and_drinks               0
Alcohol_and_tobacco           0
Household_consumable_goods    0
Medication                    0
Stationery                    0
Year                          0
Month                         0
Rate                          0
Interest_rate                 3
dtype: int64


In [18]:
merged_data

Unnamed: 0,Date,Food_and_drinks,Alcohol_and_tobacco,Household_consumable_goods,Medication,Stationery,Year,Month,Rate,Interest_rate
0,2019-01-01,102.0,105.2,102.0,101.5,102.0,2019,1,4.4,
1,2019-02-01,101.4,106.9,102.2,101.6,102.1,2019,2,3.8,
2,2019-03-01,100.7,108.1,102.5,101.9,102.3,2019,3,3.4,
3,2019-04-01,100.7,108.5,102.7,102.1,102.6,2019,4,3.2,4.625000
4,2019-05-01,100.9,108.8,102.9,102.4,103.0,2019,5,3.2,4.666667
...,...,...,...,...,...,...,...,...,...,...
64,2024-05-01,128.7,176.8,125.5,120.1,130.5,2024,5,3.9,6.000000
65,2024-06-01,129.5,177.0,125.6,120.3,130.8,2024,6,3.7,6.000000
66,2024-07-01,130.4,177.0,125.8,120.6,131.7,2024,7,4.4,6.000000
67,2024-08-01,130.4,177.2,126.0,120.7,133.3,2024,8,3.3,6.000000


In [10]:
df1 = merged_data[merged_data.isna().any(axis=1)]
df1

Unnamed: 0,Date,Food_and_drinks,Alcohol_and_tobacco,Household_consumable_goods,Medication,Stationery,Year,Month,Rate,Interest_rate
0,2019-01-01,102.0,105.2,102.0,101.5,102.0,2019,1,4.4,
1,2019-02-01,101.4,106.9,102.2,101.6,102.1,2019,2,3.8,
2,2019-03-01,100.7,108.1,102.5,101.9,102.3,2019,3,3.4,


In [None]:
merged_data.dropna(inplace=True)
merged_data

### Results

In [11]:
cpi_components = ['Food_and_drinks', 'Alcohol_and_tobacco', 'Household_consumable_goods', 'Medication', 'Stationery']

for component in cpi_components:
    fig = px.scatter(
        merged_data,
        x=component,
        y='Rate',
        title=f'Scatter Plot of {component} vs Inflation Rate | Correlation value: {np.corrcoef(merged_data["Rate"], merged_data[component])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
    fig.show()


### Correlation among different FMCG Products

In [12]:
cpi_components = ['Food_and_drinks', 'Alcohol_and_tobacco', 'Household_consumable_goods', 'Medication', 'Stationery']
FIRST_INDEX = 0
SECOND_INDEX = 1
fig = px.scatter(
        merged_data,
        x=cpi_components[FIRST_INDEX],
        y=cpi_components[SECOND_INDEX],
        title=f'Scatter Plot of {cpi_components[FIRST_INDEX]} vs {cpi_components[SECOND_INDEX]} | Correlation value: {np.corrcoef(merged_data[cpi_components[FIRST_INDEX]], merged_data[cpi_components[SECOND_INDEX]])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
fig.show()   

In [13]:
FIRST_INDEX = 2
SECOND_INDEX = 4
fig = px.scatter(
        merged_data,
        x=cpi_components[FIRST_INDEX],
        y=cpi_components[SECOND_INDEX],
        title=f'Scatter Plot of {cpi_components[FIRST_INDEX]} vs {cpi_components[SECOND_INDEX]} | Correlation value: {np.corrcoef(merged_data[cpi_components[FIRST_INDEX]], merged_data[cpi_components[SECOND_INDEX]])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
fig.show()   

In [14]:
FIRST_INDEX = 1
SECOND_INDEX = 3
fig = px.scatter(
        merged_data,
        x=cpi_components[FIRST_INDEX],
        y=cpi_components[SECOND_INDEX],
        title=f'Scatter Plot of {cpi_components[FIRST_INDEX]} vs {cpi_components[SECOND_INDEX]} | Correlation value: {np.corrcoef(merged_data[cpi_components[FIRST_INDEX]], merged_data[cpi_components[SECOND_INDEX]])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
fig.show()   

In [15]:
FIRST_INDEX = 3
SECOND_INDEX = 4
fig = px.scatter(
        merged_data,
        x=cpi_components[FIRST_INDEX],
        y=cpi_components[SECOND_INDEX],
        title=f'Scatter Plot of {cpi_components[FIRST_INDEX]} vs {cpi_components[SECOND_INDEX]} | Correlation value: {np.corrcoef(merged_data[cpi_components[FIRST_INDEX]], merged_data[cpi_components[SECOND_INDEX]])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
fig.show()   

### Correlation between interest rates and CPI

In [16]:
fig = px.scatter(
        merged_data,
        x=cpi_components[FIRST_INDEX],
        y="Interest_rate",
        title=f'Scatter Plot of {cpi_components[FIRST_INDEX]} vs {cpi_components[SECOND_INDEX]} | Correlation value: {np.corrcoef(merged_data[cpi_components[FIRST_INDEX]], merged_data[cpi_components[SECOND_INDEX]])[0][1]}',
        labels={component: component, 'Rate': 'Inflation Rate'},
        trendline='ols'  # Add a trendline
    )
fig.show()  