## Clean & Transform (Python/Pandas)

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime —
# confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/Colab Notebooks/child-mortality-gdp-per-capita.csv'

# Read the CSV file into a pandas DataFrame.
# index_col=0: the first column of the file appears to be a saved row index
# (it loads as 'Unnamed: 0' with values 0, 1, 2, ...). Reading it as the index
# keeps it out of the data columns, so it is not carried into the cleaned export.
df = pd.read_csv(file_path, index_col=0)

# Display the first few rows to confirm it loaded correctly
df.head()


Unnamed: 0,Entity,Code,Year,Child mortality rate,GDP per capita,900793-annotations,Population (historical),World regions according to OWID
0,Afghanistan,AFG,1957,37.13,1253.0,,8588340.0,
1,Afghanistan,AFG,1958,36.52,1298.0,,8723412.0,
2,Afghanistan,AFG,1959,35.95,1307.0,,8869270.0,
3,Afghanistan,AFG,1960,35.32,1326.0,,9035048.0,
4,Afghanistan,AFG,1961,34.76,1309.0,,9214082.0,


In [None]:
# 1. Handle Missing Values
# A row survives only when both critical variables (CMR and GDP) are present.
df = df[
    df[['Child mortality rate', 'GDP per capita']].notna().all(axis=1)
]

In [None]:
# 2. Fix the Column Names
# Swap the long source headers for short aliases that are quicker to type.
df = df.rename(
    {
        'Child mortality rate': 'CMR',
        'GDP per capita': 'GDP',
        'Population (historical)': 'Population',
        'World regions according to OWID': 'Region',
    },
    axis='columns',
)

print('Column names fixed')

Column names fixed


In [None]:
# 3. Handle the "Continent vs Country" issue
# Keep only rows whose Code is exactly three uppercase letters (ISO-style country codes).
# Rows with a missing Code ('Africa', 'Europe', etc.) are dropped via na=False.
# A plain notna() check is not enough: any aggregate that carries a longer,
# non-ISO code would slip through (presumably e.g. 'OWID_WRL' for World —
# verify with df['Code'].unique()).
is_country_code = df['Code'].str.fullmatch(r'[A-Z]{3}', na=False)

# .copy() detaches the result from df so later edits do not touch the source frame.
df_countries = df[is_country_code].copy()

In [None]:
# 4. Remove "impossible" values (non-positive or implausibly large readings)
# Bounds are exclusive: 0 < CMR < 500, and GDP must be strictly positive.
# NOTE(review): the sample rows show CMR values such as 37.13, which looks like a
# percentage rather than deaths per 1,000. If so, the 500 ceiling never triggers
# and 100 may be the intended upper bound — confirm the unit before tightening.
cmr_in_range = df_countries['CMR'].gt(0) & df_countries['CMR'].lt(500)
gdp_positive = df_countries['GDP'].gt(0)

# .copy() gives df_clean its own data (as is done for df_countries), so adding or
# modifying columns later does not raise SettingWithCopyWarning.
df_clean = df_countries[cmr_in_range & gdp_positive].copy()

In [None]:
# 5. Save the data
# Write the cleaned frame to the working directory; index=False leaves the
# DataFrame index out of the file.
df_clean.to_csv(
    path_or_buf='child_mortality_cleaned.csv',
    index=False,
)

# Report how many rows made it through every cleaning step.
print(f"Cleaning Done! Remaining rows: {len(df_clean)}")

Cleaning Done! Remaining rows: 12672
