In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import statsmodels.api as sm
import db_config 

from sqlalchemy import create_engine
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Build the SQLAlchemy engine from the URL held in the db_config module.
# create_engine() only configures the engine; the actual database connection
# is opened lazily the first time a query runs through it.
# NOTE(review): DB_URL presumably carries the credentials - confirm db_config
# is kept out of version control.
engine = create_engine(db_config.DB_URL)

In [11]:
# Load the full cpi table into a DataFrame
query = "SELECT * FROM cpi;"
cpi = pd.read_sql(sql=query, con=engine)

# Plain-text preview of the first five rows
print(cpi.head(n=5))

# Drop the engine's pooled connections. The engine object itself stays usable:
# it builds a fresh pool and reconnects on the next query.
engine.dispose()

   year     month     cpi  food cpi  exchange rate\n (kes/usd)  \
0  2025  February  143.12     118.0                     128.70   
1  2025   January  142.68     116.1                     129.00   
2  2024  December  141.66     120.4                     128.85   
3  2024  November  140.81     120.7                     129.50   
4  2024   October  140.44     119.9                     128.50   

   interest rate (%)  
0              10.75  
1              11.25  
2              11.25  
3              12.00  
4              12.00  


In [12]:
# Load the full gdpgrowth table into a DataFrame
query = "SELECT * FROM gdpgrowth;"
gdpgrowth = pd.read_sql(sql=query, con=engine)

# Plain-text preview of the first five rows
print(gdpgrowth.head(n=5))

# Drop the engine's pooled connections. The engine object itself stays usable:
# it builds a fresh pool and reconnects on the next query.
engine.dispose()

   year quarter  gdp growth rate (%)  gdp value (kes billion)
0  2025      Q1                  NaN                      NaN
1  2025      Q2                  NaN                      NaN
2  2025      Q3                  NaN                      NaN
3  2025      Q4                  NaN                      NaN
4  2024      Q1                  5.3                  14000.0


In [13]:
# Load the full taxrates table into a DataFrame
query = "SELECT * FROM taxrates;"
taxrates = pd.read_sql(sql=query, con=engine)

# Plain-text preview of the first five rows
print(taxrates.head(n=5))

# Drop the engine's pooled connections. The engine object itself stays usable:
# it builds a fresh pool and reconnects on the next query.
engine.dispose()

   year quarter  vat rate (%) paye bracket (kes)  tax rate (%)
0  2025      Q1            16         0 – 24,000          10.0
1  2025      Q1            16    24,001 – 32,333          15.0
2  2025      Q1            16   32,334 – 500,000          25.0
3  2025      Q1            16  500,001 – 800,000          30.0
4  2025      Q1            16      Above 800,000          35.0


In [14]:
# Load the full unemployment table into a DataFrame
query = "SELECT * FROM unemployment;"
unemployment = pd.read_sql(sql=query, con=engine)

# Plain-text preview of the first five rows
print(unemployment.head(n=5))

# Drop the engine's pooled connections. The engine object itself stays usable:
# it builds a fresh pool and reconnects on the next query.
engine.dispose()

   year unemployment rate (%)
0  2025                  None
1  2024                  None
2  2023                 5.68%
3  2022                 5.81%
4  2021                 5.69%


The database has been successfully connected and the tables have been loaded.

**1. cpi**

In [15]:
# Summary statistics for numerical columns in cpi
# With default arguments describe() reports count, mean, std, min, quartiles
# and max for the numeric columns only, so the text column `month` is omitted.
cpi.describe()

Unnamed: 0,year,cpi,food cpi,exchange rate\n (kes/usd),interest rate (%)
count,122.0,122.0,122.0,122.0,122.0
mean,2019.590164,153.417828,108.52377,111.874918,9.389344
std,2.945101,31.533722,16.077175,14.948723,1.777107
min,2015.0,107.174,88.9,91.3,7.0
25%,2017.0,125.8675,95.9,101.725,8.25
50%,2020.0,148.275,101.85,104.325,9.25
75%,2022.0,183.4625,118.45,119.775,10.5
max,2025.0,205.9,156.8,160.0,13.0


In [18]:
# Frequency of each distinct value in the categorical column(s) of cpi,
# keyed by column name
categorical_summary = {
    column: cpi[column].value_counts() for column in ('month',)
}

# Heading and the dict of counts, one per line
print("\nCategorical Summary Statistics:", categorical_summary, sep="\n")


Categorical Summary Statistics:
{'month': month
February     11
January      11
December     10
November     10
October      10
September    10
August       10
July         10
June         10
May          10
April        10
March        10
Name: count, dtype: int64}


In [28]:
# Data types and non-null counts in cpi dataset
# info() prints its report directly and returns None, so the cell has no Out[] value.
# NOTE(review): the column name "exchange rate\n (kes/usd)" contains a literal
# newline (visible in the head() preview), which is why that row wraps in the report.
cpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   year                      122 non-null    int64  
 1   month                     122 non-null    object 
 2   cpi                       122 non-null    float64
 3   food cpi                  122 non-null    float64
 4   exchange rate
 (kes/usd)  122 non-null    float64
 5   interest rate (%)         122 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 5.8+ KB


**2. gdpgrowth**

In [19]:
# Summary statistics for numerical columns in gdpgrowth
# describe() skips NaN values, so `count` can be lower than the row count.
# NOTE(review): the recorded output shows count 40 of 44 for both GDP columns,
# presumably the four 2025 quarters that are NaN in the head() preview.
gdpgrowth.describe()

Unnamed: 0,year,gdp growth rate (%),gdp value (kes billion)
count,44.0,40.0,40.0
mean,2020.0,4.785,11483.75
std,3.198837,2.12501,1624.152146
min,2015.0,-5.7,8600.0
25%,2017.0,4.875,10212.5
50%,2020.0,5.3,11400.0
75%,2023.0,5.7,12937.5
max,2025.0,6.2,14000.0


In [20]:
# Frequency of each distinct value in the categorical column(s) of gdpgrowth,
# keyed by column name
categorical_summary = {
    column: gdpgrowth[column].value_counts() for column in ('quarter',)
}

# Heading and the dict of counts, one per line
print("\nCategorical Summary Statistics:", categorical_summary, sep="\n")


Categorical Summary Statistics:
{'quarter': quarter
Q1    11
Q2    11
Q3    11
Q4    11
Name: count, dtype: int64}


In [27]:
# Data types and non-null counts in gdpgrowth dataset
# A non-null count below the number of entries marks a column with missing
# values. info() prints the report itself and returns None.
gdpgrowth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     44 non-null     int64  
 1   quarter                  44 non-null     object 
 2   gdp growth rate (%)      40 non-null     float64
 3   gdp value (kes billion)  40 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.5+ KB


**3. taxrates**

In [21]:
# Summary statistics for numerical columns in taxrates
# Only year, vat rate (%) and tax rate (%) are numeric; the text columns
# quarter and paye bracket (kes) are skipped by describe() under its defaults.
taxrates.describe()

Unnamed: 0,year,vat rate (%),tax rate (%)
count,220.0,220.0,220.0
mean,2020.0,15.863636,24.681818
std,3.169489,0.505265,9.315065
min,2015.0,14.0,10.0
25%,2017.0,16.0,15.0
50%,2020.0,16.0,30.0
75%,2023.0,16.0,32.5
max,2025.0,16.0,35.0


In [22]:
# Frequency of each distinct value in the categorical columns of taxrates,
# keyed by column name (insertion order: quarter first, then PAYE bracket)
categorical_summary = {
    column: taxrates[column].value_counts()
    for column in ('quarter', 'paye bracket (kes)')
}

# Heading and the dict of counts, one per line
print("\nCategorical Summary Statistics:", categorical_summary, sep="\n")


Categorical Summary Statistics:
{'quarter': quarter
Q1    55
Q2    55
Q3    55
Q4    55
Name: count, dtype: int64, 'paye bracket (kes)': paye bracket (kes)
0 – 24,000           44
24,001 – 32,333      44
32,334 – 500,000     44
500,001 – 800,000    44
Above 800,000        44
Name: count, dtype: int64}


In [26]:
# Data types and non-null counts in taxrates dataset
# A non-null count below the number of entries marks a column with missing
# values. info() prints the report itself and returns None.
taxrates.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                220 non-null    int64  
 1   quarter             220 non-null    object 
 2   vat rate (%)        220 non-null    int64  
 3   paye bracket (kes)  220 non-null    object 
 4   tax rate (%)        220 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 8.7+ KB


**4. unemployment**

In [23]:
# Summary statistics for numerical columns in unemployment
# Only `year` is summarised here: `unemployment rate (%)` is loaded as strings
# such as "5.68%" (object dtype), so it must be converted to a numeric type
# before describe() can include it.
unemployment.describe()

Unnamed: 0,year
count,11.0
mean,2020.0
std,3.316625
min,2015.0
25%,2017.5
50%,2020.0
75%,2022.5
max,2025.0


In [24]:
# Frequency of each distinct value in the string-typed rate column of
# unemployment, keyed by column name (value_counts() leaves out missing entries)
categorical_summary = {
    column: unemployment[column].value_counts()
    for column in ('unemployment rate (%)',)
}

# Heading and the dict of counts, one per line
print("\nCategorical Summary Statistics:", categorical_summary, sep="\n")


Categorical Summary Statistics:
{'unemployment rate (%)': unemployment rate (%)
2.76%    2
5.68%    1
5.69%    1
5.81%    1
5.62%    1
5.01%    1
4.28%    1
3.54%    1
Name: count, dtype: int64}


In [25]:
# Data types and non-null counts in unemployment dataset
# info() prints the report itself and returns None.
# NOTE(review): the recorded report shows the rate column as object dtype with
# 9 of 11 non-null values, so it needs both parsing and missing-value handling.
unemployment.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   year                   11 non-null     int64 
 1   unemployment rate (%)  9 non-null      object
dtypes: int64(1), object(1)
memory usage: 308.0+ bytes


# Exploratory Data Analysis (EDA)