# 🧩 Variable preparation for EDA

In [1]:
# Import all necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from scipy import stats
import matplotlib.style as style
import missingno as msno

from scipy.stats import skew
from scipy.special import boxcox1p
# NOTE(review): `skew` is imported a second time here; only `boxcox_normmax` is new.
from scipy.stats import boxcox_normmax, skew
# Apply the 'fivethirtyeight' matplotlib style sheet to the plots in this notebook.
style.use('fivethirtyeight')

In [4]:
# Load the cleaned data set from disk and display it.
df = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv")
df

Unnamed: 0,Year,Month,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education
0,2008,Jan,8.60,94.26,2.17,1.72,75.534431,84.612846,83.863139,86.530038,83.129440,75.284466
1,2008,Feb,8.00,98.15,2.08,1.63,75.154185,85.231632,83.365891,88.355236,82.889814,75.457619
2,2008,Mar,7.80,103.73,2.06,1.61,78.242523,83.251516,86.082993,84.376304,81.893471,74.826847
3,2008,Apr,8.20,116.73,1.96,1.51,79.434268,84.348830,89.445332,88.600334,93.937865,73.664248
4,2008,May,9.70,126.57,2.05,1.60,80.783677,85.574027,90.540461,87.932833,97.553285,74.880442
...,...,...,...,...,...,...,...,...,...,...,...,...
193,2024,Feb,31.70,86.08,1.32,0.87,846.846533,543.893969,494.844568,570.034088,234.228948,469.899712
194,2024,Mar,33.20,88.80,1.23,0.78,877.472863,558.813016,502.912687,579.821453,234.694654,476.890292
195,2024,Apr,33.69,93.12,1.28,0.83,899.453386,571.960431,510.095879,592.282753,235.335975,483.719840
196,2024,May,33.95,84.01,1.25,0.80,920.005281,583.894158,517.391784,606.408076,235.850405,490.779792


# 🟡 Variable Classification

 To continue the analysis, I need to classify the variables (columns) in the data set into Predictor Variables and the Target Variable.

 1. Predictor Variables: The variables that might influence or have a relationship with the target variable. They're used to explore patterns, trends, and potential causes.

 2. Target Variable: The variable you're trying to understand or explain. In EDA, it's the main outcome or focus of the analysis.

##### 🎯 Target Variable:
 Since my goal is to understand the drivers of inflation, the target variable is:

 - `Inflation_Rate`

 This is the variable I'm exploring: how it changes, what influences it, and what patterns it follows.

##### 🔍 Predictor Variables:
 These are the variables I’ll analyze to see how they relate to Inflation_Rate.

 1. Temporal Variables (for trend & seasonality analysis):
 -- Year

 -- Month

 2. Economic Indicators:
 -- Crude Oil Price

 -- Production

 -- Crude Oil Export

 3. Consumer Price Indices (CPI):
 -- CPI_Food

 -- CPI_Energy

 -- CPI_Health

 -- CPI_Transport

 -- CPI_Communication

 -- CPI_Education

 These will help identify which sectors have the most influence on inflation.

# Types of Variables 

##### Here I identify the type of each predictor and target variable and classify it as a 'Numerical' or 'Categorical' variable. This helps decide which statistical approach is best suited to the variables at hand.

In [5]:
# Show each column's dtype and non-null count, to tell numerical columns from categorical ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               198 non-null    int64  
 1   Month              198 non-null    object 
 2   Inflation_Rate     198 non-null    float64
 3   Crude Oil Price    197 non-null    float64
 4   Production         197 non-null    float64
 5   Crude Oil Export   197 non-null    float64
 6   CPI_Food           198 non-null    float64
 7   CPI_Energy         198 non-null    float64
 8   CPI_Health         198 non-null    float64
 9   CPI_Transport      198 non-null    float64
 10  CPI_Communication  198 non-null    float64
 11  CPI_Education      198 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 18.7+ KB


##### Notice that most of my variables are numerical, both the Target and the Predictor variables. This isn't ideal, but it is sufficient for the insights we are looking to gain.

##### *I'm now going to combine the 'Year' and 'Month' variables into a single datetime column. This will be incredibly useful when I want to do time series plots and similar analyses.

In [7]:
# Combine Year and Month into a single datetime column; each value lands on
# the first day of its month.
# An explicit format is passed so pandas parses every row the same way instead
# of guessing the layout element by element (which triggers a warning).
# '%b' expects abbreviated month names such as 'Jan' and is locale-dependent;
# TODO confirm every Month value follows that three-letter English pattern.
df['Date'] = pd.to_datetime(
    df['Year'].astype(str) + '-' + df['Month'].astype(str),
    format='%Y-%b',
)
df

  df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str))


Unnamed: 0,Year,Month,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education,Date
0,2008,Jan,8.60,94.26,2.17,1.72,75.534431,84.612846,83.863139,86.530038,83.129440,75.284466,2008-01-01
1,2008,Feb,8.00,98.15,2.08,1.63,75.154185,85.231632,83.365891,88.355236,82.889814,75.457619,2008-02-01
2,2008,Mar,7.80,103.73,2.06,1.61,78.242523,83.251516,86.082993,84.376304,81.893471,74.826847,2008-03-01
3,2008,Apr,8.20,116.73,1.96,1.51,79.434268,84.348830,89.445332,88.600334,93.937865,73.664248,2008-04-01
4,2008,May,9.70,126.57,2.05,1.60,80.783677,85.574027,90.540461,87.932833,97.553285,74.880442,2008-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,2024,Feb,31.70,86.08,1.32,0.87,846.846533,543.893969,494.844568,570.034088,234.228948,469.899712,2024-02-01
194,2024,Mar,33.20,88.80,1.23,0.78,877.472863,558.813016,502.912687,579.821453,234.694654,476.890292,2024-03-01
195,2024,Apr,33.69,93.12,1.28,0.83,899.453386,571.960431,510.095879,592.282753,235.335975,483.719840,2024-04-01
196,2024,May,33.95,84.01,1.25,0.80,920.005281,583.894158,517.391784,606.408076,235.850405,490.779792,2024-05-01


In [None]:
# Check that the datetime conversion was done correctly by viewing the first
# 13 rows (one more than a full year of monthly records).
df.head(13)


Unnamed: 0,Year,Month,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education,Date
0,2008,Jan,8.6,94.26,2.17,1.72,75.534431,84.612846,83.863139,86.530038,83.12944,75.284466,2008-01-01
1,2008,Feb,8.0,98.15,2.08,1.63,75.154185,85.231632,83.365891,88.355236,82.889814,75.457619,2008-02-01
2,2008,Mar,7.8,103.73,2.06,1.61,78.242523,83.251516,86.082993,84.376304,81.893471,74.826847,2008-03-01
3,2008,Apr,8.2,116.73,1.96,1.51,79.434268,84.34883,89.445332,88.600334,93.937865,73.664248,2008-04-01
4,2008,May,9.7,126.57,2.05,1.6,80.783677,85.574027,90.540461,87.932833,97.553285,74.880442,2008-05-01
5,2008,Jun,12.0,138.74,2.02,1.57,84.498029,88.387443,91.623749,88.266583,96.468659,78.40122,2008-06-01
6,2008,Jul,14.0,137.74,2.13,1.68,86.709947,90.796584,93.476588,89.658949,97.284231,80.874835,2008-07-01
7,2008,Aug,12.4,115.84,2.11,1.66,87.716207,92.51681,92.422897,88.615978,97.292639,83.703001,2008-08-01
8,2008,Sep,13.0,103.82,2.17,1.72,88.453513,94.100903,92.316344,88.292657,97.734056,83.08872,2008-09-01
9,2008,Oct,14.7,75.31,2.26,1.81,88.017621,93.292356,93.251643,88.777639,97.288435,84.655343,2008-10-01


# Target Variable Overview

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the target variable.
df['Inflation_Rate'].describe()

count    198.000000
mean      14.068232
std        5.456106
min        7.700000
25%       10.625000
50%       12.735000
75%       16.040000
max       34.190000
Name: Inflation_Rate, dtype: float64

In [None]:
# Highest inflation rate recorded in the data set.
df['Inflation_Rate'].max()

34.19

In [None]:
# Lowest inflation rate recorded in the data set.
df['Inflation_Rate'].min()

7.7

In [13]:
# Print the target column's dtype and non-null count, then show how many
# of its values are missing.
inflation = df['Inflation_Rate']
inflation.info()
inflation.isna().sum()

<class 'pandas.core.series.Series'>
RangeIndex: 198 entries, 0 to 197
Series name: Inflation_Rate
Non-Null Count  Dtype  
--------------  -----  
198 non-null    float64
dtypes: float64(1)
memory usage: 1.7 KB


0

# Predictor Variables Overview

In [14]:
# Columns analysed as predictors of Inflation_Rate, grouped by theme.
predictors = [
    # Oil-sector indicators
    'Crude Oil Price',
    'Production',
    'Crude Oil Export',
    # Consumer price indices
    'CPI_Food',
    'CPI_Energy',
    'CPI_Health',
    'CPI_Transport',
    'CPI_Communication',
    'CPI_Education',
]

In [15]:
# Summary statistics for every predictor column, side by side.
df[predictors].describe()

Unnamed: 0,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education
count,197.0,197.0,197.0,198.0,198.0,198.0,198.0,198.0,198.0
mean,79.886599,1.921015,1.471015,276.715714,234.199688,208.020379,228.683823,144.088741,206.733061
std,26.58638,0.412341,0.412341,200.000754,122.478906,107.971671,130.467048,42.279734,108.070293
min,14.28,0.94,0.49,75.154185,83.251516,83.365891,84.376304,81.893471,73.664248
25%,58.46,1.65,1.2,129.971845,133.807834,124.53866,130.159867,109.843668,117.544356
50%,77.54,2.04,1.59,196.133375,206.319517,171.976085,186.00189,131.918626,170.853864
75%,106.0,2.21,1.76,357.349555,303.127288,258.450157,290.534903,169.037119,268.7825
max,138.74,2.88,2.43,943.458769,596.496022,524.747711,620.966251,236.061986,498.242239


In [16]:
# Count the missing values in each predictor column.
df[predictors].isna().sum()

Crude Oil Price      1
Production           1
Crude Oil Export     1
CPI_Food             0
CPI_Energy           0
CPI_Health           0
CPI_Transport        0
CPI_Communication    0
CPI_Education        0
dtype: int64

##### Noticing some null values in 'Crude Oil Price', 'Production', and 'Crude Oil Export'. They are handled below.

In [18]:
# Fill the missing predictor values by linear interpolation along the row order.
# NOTE(review): if a gap sits at the very end of a column, pandas fills it by
# carrying the last valid value forward rather than interpolating between two
# neighbours — confirm that is acceptable for these series.
filled_predictors = df[predictors].interpolate(method='linear')
df[predictors] = filled_predictors

In [19]:
# Re-count missing values per predictor to confirm the interpolation left none.
df[predictors].isna().sum()

Crude Oil Price      0
Production           0
Crude Oil Export     0
CPI_Food             0
CPI_Energy           0
CPI_Health           0
CPI_Transport        0
CPI_Communication    0
CPI_Education        0
dtype: int64

In [20]:
# Export the prepared dataframe so it can be used elsewhere.
# index=False leaves the row index out of the file. CSV stores the Date column
# as text, so it has to be parsed as a date again when the file is read back.
df.to_csv("analysis_ready_dataset.csv", index=False)