In [23]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.types import Date, Float
import os
from datetime import datetime


In [24]:
# Importing data from the International Monetary Fund, World Economic Outlook. We are looking to import the "Gross domestic product (GDP), Constant prices, Percent change" for 1980 Onward, this data is just in one row

imf_data = pd.read_csv('International Monetary Fund World Economic Outlook 1980 Onward.csv', header=None)

imf_data.head(5)

  imf_data = pd.read_csv('International Monetary Fund World Economic Outlook 1980 Onward.csv', header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,106,107,108,109,110,111,112,113,114,115
0,DATASET,SERIES_CODE,OBS_MEASURE,COUNTRY,INDICATOR,FREQUENCY,SCALE,DECIMALS_DISPLAYED,FUNCTIONAL_CAT,INT_ACC_ITEM,...,2021.0,2022.0,2023.0,2024.0,2025.0,2026.0,2027.0,2028.0,2029.0,2030.0
1,IMF.RES:WEO(9.0.0),LIE.LE.A,OBS_VALUE,"Liechtenstein, Principality of","Employed persons, Persons for countries / Inde...",Annual,Millions,Three,,,...,0.041,0.043,0.043,0.044,0.045,0.046,,,,
2,IMF.RES:WEO(9.0.0),CAN.LE.A,OBS_VALUE,Canada,"Employed persons, Persons for countries / Inde...",Annual,Millions,Three,,,...,18.973,19.748,20.341,20.723,21.009,21.08,,,,
3,IMF.RES:WEO(9.0.0),HND.NGDPPC.A,OBS_VALUE,Honduras,"Gross domestic product (GDP), Current prices, ...",Annual,Units,Three,,,...,66813.898,74749.009,80686.569,86300.122,95066.366,100495.041,106338.384,112815.742,119722.508,126957.554
4,IMF.RES:WEO(9.0.0),HND.LP.A,OBS_VALUE,Honduras,"Population, Persons for countries / Index for ...",Annual,Millions,Three,,,...,10.117,10.294,10.475,10.659,10.846,11.037,11.231,11.428,11.629,11.833


In [25]:
# Precleaning the data before we filter it for the row that we want We first need to make row 1 the headers

new_header = imf_data.iloc[0].values
imf_data.columns = new_header

# Now that we made row 1 as headers we need to drop row 1 and reset the index
imf_data = imf_data[1:]

imf_data = imf_data.reset_index(drop=True)

imf_data.head()

Unnamed: 0,DATASET,SERIES_CODE,OBS_MEASURE,COUNTRY,INDICATOR,FREQUENCY,SCALE,DECIMALS_DISPLAYED,FUNCTIONAL_CAT,INT_ACC_ITEM,...,2021.0,2022.0,2023.0,2024.0,2025.0,2026.0,2027.0,2028.0,2029.0,2030.0
0,IMF.RES:WEO(9.0.0),LIE.LE.A,OBS_VALUE,"Liechtenstein, Principality of","Employed persons, Persons for countries / Inde...",Annual,Millions,Three,,,...,0.041,0.043,0.043,0.044,0.045,0.046,,,,
1,IMF.RES:WEO(9.0.0),CAN.LE.A,OBS_VALUE,Canada,"Employed persons, Persons for countries / Inde...",Annual,Millions,Three,,,...,18.973,19.748,20.341,20.723,21.009,21.08,,,,
2,IMF.RES:WEO(9.0.0),HND.NGDPPC.A,OBS_VALUE,Honduras,"Gross domestic product (GDP), Current prices, ...",Annual,Units,Three,,,...,66813.898,74749.009,80686.569,86300.122,95066.366,100495.041,106338.384,112815.742,119722.508,126957.554
3,IMF.RES:WEO(9.0.0),HND.LP.A,OBS_VALUE,Honduras,"Population, Persons for countries / Index for ...",Annual,Millions,Three,,,...,10.117,10.294,10.475,10.659,10.846,11.037,11.231,11.428,11.629,11.833
4,IMF.RES:WEO(9.0.0),BHS.LP.A,OBS_VALUE,"Bahamas, The","Population, Persons for countries / Index for ...",Annual,Millions,Three,,,...,0.394,0.399,0.404,0.408,0.413,0.417,0.421,0.425,0.429,0.434


In [26]:
# Filter the data for the correct row, filtering by when COUNTRY = USA and Indicator = Gross domestic product (GDP), Constant prices, Percent change

usa_gdp_data = imf_data[(imf_data['COUNTRY'] == 'United States') & (imf_data['INDICATOR'] == 'Gross domestic product (GDP), Constant prices, Percent change')].copy()

usa_gdp_data.shape

# Great, we have extracted the row containing the data for USA, now we will transpose and keep only the relevant information

(1, 116)

In [27]:
usa_transposed = np.transpose(usa_gdp_data)

usa_transposed.head(100)

Unnamed: 0,240
DATASET,IMF.RES:WEO(9.0.0)
SERIES_CODE,USA.NGDP_RPCH.A
OBS_MEASURE,OBS_VALUE
COUNTRY,United States
INDICATOR,"Gross domestic product (GDP), Constant prices,..."
...,...
2010.0,2.695
2011.0,1.564
2012.0,2.289
2013.0,2.118


In [28]:
# Rename the column headers
print(usa_transposed.columns)

# Determining what the old columns are called

usa_transposed = usa_transposed.reset_index()


usa_transposed.columns = ['date', 'gdp_pct_change_annual']


usa_transposed.head(30)
usa_transposed.tail(5)

Index([240], dtype='int64')


Unnamed: 0,date,gdp_pct_change_annual
111,2026.0,2.102
112,2027.0,2.055
113,2028.0,2.093
114,2029.0,1.881
115,2030.0,1.753


In [29]:
# Filter the results to contain only the relevant information, then drop the ".0 after the year", then convert to standard date format

usa_filtered = usa_transposed.copy()

# Convert 'date' to numeric assign to temporary column that we will drop later
usa_filtered['temp_numeric'] = pd.to_numeric(usa_filtered['date'], errors='coerce')

# Filter: Keep rows where 'temp_numeric' is not NaN (valid years) and > 1980.0
usa_filtered = usa_filtered[(usa_filtered['temp_numeric'].notna()) & (usa_filtered['temp_numeric'] >=1980.0) & (usa_filtered['temp_numeric'] < 2026.0)]

# Clean 'date': Use the numeric version, drop decimal by converting to int (e.g., 1981.0 -> 1981)
usa_filtered['date'] = usa_filtered['temp_numeric'].astype(int)

# Drop the temporary column
usa_filtered = usa_filtered.drop('temp_numeric', axis=1)

# Ensure 'GDP % Change' is numeric (float) for analysis
usa_filtered['gdp_pct_change_annual'] = pd.to_numeric(usa_filtered['gdp_pct_change_annual'], errors='coerce')

# Reset index 
usa_filtered = usa_filtered.reset_index(drop=True)



print(usa_filtered)  
print(usa_filtered.shape)     # (50, 2)

    date  gdp_pct_change_annual
0   1980                 -0.257
1   1981                  2.537
2   1982                 -1.803
3   1983                  4.584
4   1984                  7.236
5   1985                  4.169
6   1986                  3.463
7   1987                  3.455
8   1988                  4.177
9   1989                  3.672
10  1990                  1.886
11  1991                 -0.109
12  1992                  3.522
13  1993                  2.752
14  1994                  4.029
15  1995                  2.685
16  1996                  3.773
17  1997                  4.447
18  1998                  4.483
19  1999                  4.788
20  2000                  4.078
21  2001                  0.956
22  2002                  1.700
23  2003                  2.796
24  2004                  3.848
25  2005                  3.483
26  2006                  2.785
27  2007                  2.004
28  2008                  0.114
29  2009                 -2.576
30  2010

In [30]:
usa_filtered['date'] = pd.to_datetime(usa_filtered['date'].astype(str) + '-01-01')

usa_filtered

Unnamed: 0,date,gdp_pct_change_annual
0,1980-01-01,-0.257
1,1981-01-01,2.537
2,1982-01-01,-1.803
3,1983-01-01,4.584
4,1984-01-01,7.236
5,1985-01-01,4.169
6,1986-01-01,3.463
7,1987-01-01,3.455
8,1988-01-01,4.177
9,1989-01-01,3.672


In [31]:
# Now looking at the cleaned and prepared dataframe, we realized we need quarterly data in order to later create our dependent variable for our analysis, we are going to use some data from FRED to supplement our annual, average GDP data

fred_gdp = pd.read_csv('FRED GDP Percent Change Quarterly 1947 Onward.csv')

fred_gdp = fred_gdp.iloc[:, :2]

# Rename columns for merge

fred_gdp.columns = ['date', 'gdp_pct_change_quarterly']

# Convert the date to our common format

fred_gdp['date'] = pd.to_datetime(fred_gdp['date'])

print(fred_gdp.head())
print(fred_gdp.shape)

        date  gdp_pct_change_quarterly
0 1948-01-01                       2.6
1 1948-04-01                       4.6
2 1948-07-01                       5.4
3 1948-10-01                       3.9
4 1949-01-01                       0.9
(310, 2)


In [32]:
#now we will merge our IMF and Fred data, keeping all the Fred data  


final_gdp = pd.merge(fred_gdp, usa_filtered, on='date', how='left')

final_gdp = final_gdp.fillna(0)

print(final_gdp.tail(30))

          date  gdp_pct_change_quarterly  gdp_pct_change_annual
280 2018-01-01                       3.3                  2.967
281 2018-04-01                       3.3                  0.000
282 2018-07-01                       3.1                  0.000
283 2018-10-01                       2.1                  0.000
284 2019-01-01                       1.9                  2.584
285 2019-04-01                       2.2                  0.000
286 2019-07-01                       2.8                  0.000
287 2019-10-01                       3.4                  0.000
288 2020-01-01                       1.4                 -2.081
289 2020-04-01                      -7.4                  0.000
290 2020-07-01                      -1.4                  0.000
291 2020-10-01                      -0.9                  0.000
292 2021-01-01                       1.8                  6.152
293 2021-04-01                      12.4                  0.000
294 2021-07-01                       5.2

In [33]:
final_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      310 non-null    datetime64[ns]
 1   gdp_pct_change_quarterly  310 non-null    float64       
 2   gdp_pct_change_annual     310 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 7.4 KB


In [34]:
#Create new table in database to hold the dataframe


host = "anly-615-project-anlyproject.g.aivencloud.com"
port = 23263
user = "avnadmin"
password = "AVNS_uZtAlXsQZVgdnkwXesP"
database = "defaultdb"

connection_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_string, connect_args={"ssl": {"ssl_mode": "REQUIRED"}}, echo=False)

with engine.connect() as conn:
    # Drop the table if it exists AKA overwrite an older one
    conn.execute(text("DROP TABLE IF EXISTS usa_gdp_data"))
    
    # Create the table with appropriate schema
    # 'date' as DATE (primary key)
    conn.execute(text("""
        CREATE TABLE usa_gdp_data (
            date DATE PRIMARY KEY,
            gdp_pct_change_quarterly DECIMAL(10,6),
            gdp_pct_change_annual DECIMAL(10,6)
        ) ENGINE=InnoDB
    """))
    
    conn.commit()

In [35]:
#final_gdp.to_csv("final_gdp1.csv", header=False) # Checking output to ensure formatting is correct

#looks good, I included the index which will be omitted when uploaded to the database

print(final_gdp.columns.tolist())
print(final_gdp.shape)
print(final_gdp.head())


['date', 'gdp_pct_change_quarterly', 'gdp_pct_change_annual']
(310, 3)
        date  gdp_pct_change_quarterly  gdp_pct_change_annual
0 1948-01-01                       2.6                    0.0
1 1948-04-01                       4.6                    0.0
2 1948-07-01                       5.4                    0.0
3 1948-10-01                       3.9                    0.0
4 1949-01-01                       0.9                    0.0


In [36]:
# Insert the DataFrame into the table
final_gdp.to_sql(
    'usa_gdp_data',
    con=engine,
    if_exists='append',  # 'replace' if recreating table
    index=False,
    dtype={
        'date': Date,
        'gdp_pct_change_quarterly': Float,
        'gdp_pct_change_annual': Float
    }
)
print("Table created and data inserted successfully.")

Table created and data inserted successfully.
