### Creating a virtual environment with all required libraries installed in it.
python -m venv .venv

#### Activating it
.venv/Scripts/Activate

# Cross-Country Analysis of Economic Growth Trends (1970–2024)

## Phase 1: Preprocessing & Feature Engineering

### Objectives:
- Clean GDP dataset
- Handle missing values
- Restructure dataset for time-series analysis
- Prepare features for growth rate computation

#### Note: before loading the data in Python, I deleted the first two rows of the source spreadsheet because they were not needed.

## Importing Libraries

In [15]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

## Loading the Dataset

In [23]:
# Load dataset
# The data folder can be overridden with the GDP_DATA_DIR environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original project folder is used, so existing runs behave as before.
DATA_DIR = Path(os.environ.get("GDP_DATA_DIR", r"C:\Projects\DAP\gdp-growth-analysis\data"))

# Kept as a plain string so any later cell that uses file_path sees the same type.
file_path = str(DATA_DIR / "Download-GDPcurrent-USD-countries.xlsx")

# Fail early with a hint about how to point the notebook at the workbook.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"GDP workbook not found at {file_path!r}; "
        "set the GDP_DATA_DIR environment variable to the folder that contains it."
    )

df_raw = pd.read_excel(file_path)

# Create working copy (df_raw stays untouched as the reference to the loaded data)
df = df_raw.copy()

df.head()

Unnamed: 0,CountryID,Country,IndicatorName,1970,1971,1972,1973,1974,1975,1976,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,4,Afghanistan,Final consumption expenditure,1691109000.0,1826664000.0,1544444000.0,1637778000.0,1984444000.0,2173333000.0,2248889000.0,...,19054340000.0,18415520000.0,18595870000.0,20827030000.0,21380900000.0,22863090000.0,16637700000.0,16933010000.0,20093090000.0,27087020000.0
1,4,Afghanistan,Household consumption expenditure (including N...,1577102000.0,1703519000.0,1440325000.0,1527366000.0,1850662000.0,2026817000.0,2097279000.0,...,14969060000.0,14295840000.0,14607870000.0,16883260000.0,17219760000.0,18445000000.0,13414130000.0,13836420000.0,16626690000.0,22128820000.0
2,4,Afghanistan,General government final consumption expenditure,114007100.0,123145600.0,104119600.0,110411700.0,133782400.0,146516500.0,151610100.0,...,4085273000.0,4119686000.0,3987993000.0,3943771000.0,4161138000.0,4418088000.0,3223572000.0,3096592000.0,3466403000.0,4958199000.0
3,4,Afghanistan,Gross capital formation,95555560.0,99999980.0,104444500.0,126666600.0,188888900.0,231111100.0,348888900.0,...,2711463000.0,2331712000.0,2640511000.0,2689519000.0,2434404000.0,2289180000.0,1968903000.0,2362752000.0,2490757000.0,2041052000.0
4,4,Afghanistan,Gross fixed capital formation (including Acqui...,95555560.0,99999980.0,104444500.0,126666600.0,188888900.0,231111100.0,348888900.0,...,2711463000.0,2331712000.0,2640511000.0,2689519000.0,2434404000.0,2289180000.0,1968903000.0,2362752000.0,2490757000.0,2041052000.0


## Data Overview & Structure Inspection

In [24]:
# Dimensions and column labels of the working frame
print(f"Shape of dataset: {df.shape}")
print("\nColumn Names:", df.columns, sep="\n")

# Per-column dtype and non-null summary; info() writes its report itself
print("\nDataset Info:")
df.info()

# Count of missing cells in each column
print("\nMissing Values per Column:", df.isna().sum(), sep="\n")

Shape of dataset: (3715, 58)

Column Names:
Index([    'CountryID',       'Country', 'IndicatorName',            1970,
                  1971,            1972,            1973,            1974,
                  1975,            1976,            1977,            1978,
                  1979,            1980,            1981,            1982,
                  1983,            1984,            1985,            1986,
                  1987,            1988,            1989,            1990,
                  1991,            1992,            1993,            1994,
                  1995,            1996,            1997,            1998,
                  1999,            2000,            2001,            2002,
                  2003,            2004,            2005,            2006,
                  2007,            2008,            2009,            2010,
                  2011,            2012,            2013,            2014,
                  2015,            2016,            2017

In [25]:
# List the distinct values of the IndicatorName column under a heading
print("Unique Indicators:", df["IndicatorName"].unique(), sep="\n")

Unique Indicators:
['Final consumption expenditure'
 'Household consumption expenditure (including Non-profit institutions serving households)'
 'General government final consumption expenditure'
 'Gross capital formation'
 'Gross fixed capital formation (including Acquisitions less disposals of valuables)'
 'Exports of goods and services' 'Imports of goods and services'
 'Gross Domestic Product (GDP)'
 'Agriculture, hunting, forestry, fishing (ISIC A-B)'
 'Mining, Manufacturing, Utilities (ISIC C-E)' 'Manufacturing (ISIC D)'
 'Construction (ISIC F)'
 'Wholesale, retail trade, restaurants and hotels (ISIC G-H)'
 'Transport, storage and communication (ISIC I)'
 'Other Activities (ISIC J-P)' 'Total Value Added'
 'Changes in inventories']


In [26]:
# Keep only the rows whose IndicatorName mentions "GDP" (case-insensitive).
# - regex=False: "GDP" is matched as a literal substring, not as a pattern.
# - na=False: a missing IndicatorName counts as "no match" instead of putting
#   NA into the mask, which boolean indexing would reject.
is_gdp_row = df["IndicatorName"].str.contains("GDP", case=False, na=False, regex=False)

# .copy() gives df_gdp its own data, so later column assignments on it neither
# trigger SettingWithCopyWarning nor depend on df.
df_gdp = df[is_gdp_row].copy()

df_gdp.head()

Unnamed: 0,CountryID,Country,IndicatorName,1970,1971,1972,1973,1974,1975,1976,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
7,4,Afghanistan,Gross Domestic Product (GDP),1748887000.0,1831109000.0,1595555000.0,1733333000.0,2155556000.0,2366667000.0,2555556000.0,...,18699570000.0,18224350000.0,19034300000.0,18856320000.0,19372360000.0,19983010000.0,15160920000.0,14174990000.0,16339760000.0,18283160000.0
24,8,Albania,Gross Domestic Product (GDP),2265577000.0,2330585000.0,2397526000.0,2466543000.0,2537016000.0,2610110000.0,2685746000.0,...,11470170000.0,11988670000.0,13258240000.0,15379510000.0,15585110000.0,15241460000.0,18031990000.0,19017250000.0,23491240000.0,27037470000.0
41,12,Algeria,Gross Domestic Product (GDP),5155121000.0,5363676000.0,7176428000.0,9229622000.0,13259760000.0,15555550000.0,17750030000.0,...,187493900000.0,180763800000.0,189880900000.0,194554500000.0,193459700000.0,164873400000.0,186231700000.0,225638500000.0,247626200000.0,266972300000.0
58,20,Andorra,Gross Domestic Product (GDP),99476790.0,113128500.0,143506000.0,190863800.0,236055500.0,278514300.0,287588400.0,...,2789880000.0,2896613000.0,3000160000.0,3218419000.0,3155152000.0,2890998000.0,3324642000.0,3380613000.0,3785063000.0,4039838000.0
75,24,Angola,Gross Domestic Product (GDP),3806982000.0,4007445000.0,4102155000.0,5016276000.0,5626961000.0,4147433000.0,3980907000.0,...,131661000000.0,114763700000.0,139834600000.0,114189200000.0,94671230000.0,66520650000.0,84375110000.0,142402900000.0,114335800000.0,117159400000.0


In [27]:
# Report (rows, columns) of the GDP-only frame
print(f"Shape after GDP filter: {df_gdp.shape}")

Shape after GDP filter: (220, 58)


In [30]:
# Show the dtype of every column in the GDP-only frame
df_gdp.dtypes

CountryID          int64
Country           object
IndicatorName     object
1970             float64
1971             float64
1972             float64
1973             float64
1974             float64
1975             float64
1976             float64
1977             float64
1978             float64
1979             float64
1980             float64
1981             float64
1982             float64
1983             float64
1984             float64
1985             float64
1986             float64
1987             float64
1988             float64
1989             float64
1990             float64
1991             float64
1992             float64
1993             float64
1994             float64
1995             float64
1996             float64
1997             float64
1998             float64
1999             float64
2000             float64
2001             float64
2002             float64
2003             float64
2004             float64
2005             float64
2006             float64


In [31]:
# verbose=True requests the full per-column summary (non-null count and dtype)
df_gdp.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 220 entries, 7 to 3706
Data columns (total 58 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CountryID      220 non-null    int64  
 1   Country        220 non-null    object 
 2   IndicatorName  220 non-null    object 
 3   1970           187 non-null    float64
 4   1971           187 non-null    float64
 5   1972           187 non-null    float64
 6   1973           187 non-null    float64
 7   1974           187 non-null    float64
 8   1975           187 non-null    float64
 9   1976           187 non-null    float64
 10  1977           187 non-null    float64
 11  1978           187 non-null    float64
 12  1979           187 non-null    float64
 13  1980           187 non-null    float64
 14  1981           187 non-null    float64
 15  1982           187 non-null    float64
 16  1983           187 non-null    float64
 17  1984           187 non-null    float64
 18  1985          

In [32]:
# Year columns are the ones whose label consists only of digits once converted
# to text; the identifier columns (CountryID, Country, IndicatorName) are skipped.
year_columns = [
    label
    for label in df_gdp.columns
    if str(label).isdigit()
]

# Summary statistics for each year column
df_gdp.loc[:, year_columns].describe()

Unnamed: 0,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,...,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0
mean,18297340000.0,20137640000.0,23171090000.0,28190600000.0,31932880000.0,35713910000.0,38587860000.0,43448570000.0,51474090000.0,59234820000.0,...,356258500000.0,362130000000.0,385349600000.0,409234300000.0,415145400000.0,404419800000.0,462023100000.0,480742300000.0,502177600000.0,522491200000.0
std,88660190000.0,96287080000.0,107579200000.0,123908600000.0,133675200000.0,147019900000.0,160623500000.0,179653300000.0,209804500000.0,234463500000.0,...,1555933000000.0,1597216000000.0,1685309000000.0,1806712000000.0,1868176000000.0,1868927000000.0,2132042000000.0,2254208000000.0,2356668000000.0,2460942000000.0
min,2585174.0,2747866.0,3028236.0,3780359.0,4013522.0,3847741.0,3797642.0,3546258.0,3853304.0,3998563.0,...,36193810.0,41306120.0,45288880.0,48168920.0,53789570.0,52469410.0,61457320.0,54187110.0,50548760.0,56436020.0
25%,219402100.0,244536500.0,285073500.0,361435100.0,409789200.0,492847300.0,530589800.0,581945400.0,669658500.0,759342300.0,...,6064012000.0,5973133000.0,5843874000.0,6326665000.0,6369348000.0,6217613000.0,7079508000.0,7059586000.0,7542800000.0,8480262000.0
50%,1067267000.0,1213650000.0,1300600000.0,1605731000.0,2127503000.0,2398801000.0,2680976000.0,2791325000.0,3098138000.0,3697940000.0,...,23375040000.0,23907880000.0,25466830000.0,26674620000.0,27596730000.0,25417550000.0,30930010000.0,31797520000.0,34860880000.0,36904060000.0
75%,6017784000.0,6933917000.0,8145223000.0,9409526000.0,12386240000.0,13152090000.0,14260800000.0,16027310000.0,18349570000.0,22380190000.0,...,177969900000.0,170142800000.0,187883500000.0,196697600000.0,184615300000.0,166425700000.0,197775700000.0,225531900000.0,244530300000.0,253348900000.0
max,1073303000000.0,1164850000000.0,1279110000000.0,1425376000000.0,1545243000000.0,1684904000000.0,1873412000000.0,2081826000000.0,2351599000000.0,2627333000000.0,...,18295020000000.0,18804910000000.0,19612100000000.0,20656520000000.0,21540000000000.0,21375300000000.0,23725600000000.0,26054600000000.0,27811500000000.0,29298000000000.0


In [33]:
# Number of missing GDP values in each year column
missing_summary = df_gdp.loc[:, year_columns].isnull().sum()

# Ten year columns with the most missing values, largest first
missing_summary.sort_values(ascending=False).iloc[:10]

1970    33
1971    33
1972    33
1973    33
1974    33
1975    33
1976    33
1977    33
1978    33
1979    33
dtype: int64

In [34]:
# Count Country values that repeat an earlier row
print(f"Duplicate countries: {df_gdp['Country'].duplicated().sum()}")

Duplicate countries: 0
