#### Import Libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

#### Load Data

In [7]:
data = pd.read_csv(r'Main Dataset.csv',skiprows=4)
data.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 69
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54922.0,55578.0,56320.0,57002.0,57619.0,58190.0,...,108727.0,108735.0,108908.0,109203.0,108587.0,107700.0,107310.0,107359.0,,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130072080.0,133534923.0,137171659.0,140945536.0,144904094.0,149033472.0,...,623369401.0,640058741.0,657801085.0,675950189.0,694446100.0,713090928.0,731821393.0,750503764.0,,
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,9035043.0,9214083.0,9404406.0,9604487.0,9814318.0,10036008.0,...,34700612.0,35688935.0,36743039.0,37856121.0,39068979.0,40000412.0,40578842.0,41454761.0,,
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,97630925.0,99706674.0,101854756.0,104089175.0,106388440.0,108772632.0,...,429454743.0,440882906.0,452195915.0,463365429.0,474569351.0,485920997.0,497387180.0,509398589.0,,
4,Angola,AGO,"Population, total",SP.POP.TOTL,5231654.0,5301583.0,5354310.0,5408320.0,5464187.0,5521981.0,...,29183070.0,30234839.0,31297155.0,32375632.0,33451132.0,34532429.0,35635029.0,36749906.0,,


##### Data Cleaning

In [8]:
print(data.isnull().sum())

Country Name        0
Country Code        0
Indicator Name      0
Indicator Code      0
1960                2
                 ... 
2021                1
2022                1
2023                1
2024              266
Unnamed: 69       266
Length: 70, dtype: int64


📊 Summary of Data:  

Country Name, Country Code, Indicator Name, Indicator Code: ✅ No missing values.

Years 1960 to 2023: Some missing values.

2024 and Unnamed: 69: Mostly missing (266 rows = probably all countries).

In [10]:
# Drop missing values
data = data.drop(columns=['Unnamed: 69'])

In [16]:
data['2024']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
261   NaN
262   NaN
263   NaN
264   NaN
265   NaN
Name: 2024, Length: 266, dtype: float64

In [17]:
data = data.drop(columns=['2024'])

In [18]:
# interpolate missing values across years 
# This will fill gaps using linear interpolation based on neighboring years (e.g., missing 1985 will be averaged between 1984 and 1986).

year_cols = [str(year) for year in range(1960, 2023)]
data[year_cols] = data[year_cols].interpolate( axis=1)


In [19]:
data.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54922.0,55578.0,56320.0,57002.0,57619.0,58190.0,...,106807.0,107906.0,108727.0,108735.0,108908.0,109203.0,108587.0,107700.0,107310.0,107359.0
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130072080.0,133534923.0,137171659.0,140945536.0,144904094.0,149033472.0,...,590968990.0,607123269.0,623369401.0,640058741.0,657801085.0,675950189.0,694446100.0,713090928.0,731821393.0,750503764.0
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,9035043.0,9214083.0,9404406.0,9604487.0,9814318.0,10036008.0,...,32792523.0,33831764.0,34700612.0,35688935.0,36743039.0,37856121.0,39068979.0,40000412.0,40578842.0,41454761.0
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,97630925.0,99706674.0,101854756.0,104089175.0,106388440.0,108772632.0,...,406992047.0,418127845.0,429454743.0,440882906.0,452195915.0,463365429.0,474569351.0,485920997.0,497387180.0,509398589.0
4,Angola,AGO,"Population, total",SP.POP.TOTL,5231654.0,5301583.0,5354310.0,5408320.0,5464187.0,5521981.0,...,27160769.0,28157798.0,29183070.0,30234839.0,31297155.0,32375632.0,33451132.0,34532429.0,35635029.0,36749906.0


In [22]:
# check remaining missing values
print(data[year_cols].isnull().sum())

1960    2
1961    2
1962    2
1963    2
1964    2
       ..
2018    1
2019    1
2020    1
2021    1
2022    1
Length: 63, dtype: int64


In [27]:
# Fill any remaining start/end gaps using forward/backward fill
data[year_cols] = data[year_cols].fillna(method='ffill', axis=1)
data[year_cols] = data[year_cols].fillna(method='bfill', axis=1)

  data[year_cols] = data[year_cols].fillna(method='ffill', axis=1)
  data[year_cols] = data[year_cols].fillna(method='bfill', axis=1)


In [29]:
# verify no missing values
print(data[year_cols].isnull().sum().sum())

63


In [32]:
# Find which rows (countries) have missing values

# Filter rows that still contain any NaNs in year columns
missing_rows = data[data[year_cols].isnull().any(axis=1)]
missing_rows

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
110,Not classified,INX,"Population, total",SP.POP.TOTL,,,,,,,...,,,,,,,,,,


In [33]:
# See which countries are affected
print(missing_rows[['Country Name', 'Country Code']])

       Country Name Country Code
110  Not classified          INX


In [None]:
data = data.drop(missing_rows.index)
data

In [38]:
# verify no missing values
print(data[year_cols].isnull().sum().sum())

0


##### Data Visualization