In [1]:
# Imports
import pandas as pd

# Paths
INPUT_PATH = "../data/happiness_messy.csv"
OUTPUT_PATH = "../data/happiness_cleaned.csv"

# Load data
df = pd.read_csv(INPUT_PATH)

# Check total rows and columns
print("DataFrame shape:", df.shape)

# Quick look
df.head(10)

DataFrame shape: (10, 5)


Unnamed: 0,Country,Year,GDP_per_capita,Happiness_score,Life_expectancy
0,United Kingdom,2021,42300.0,7.22,81.2
1,UK,2021,42300.0,7.22,81.2
2,Greece,2021,18100.0,6.0,82.0
3,Greece,2021,18100.0,6.0,82.0
4,Spain,2021,,6.4,83.1
5,France,2021,39000.0,6.7,
6,Germany,2021,-45000.0,7.0,81.0
7,Italy,2021,35000.0,6.5,83.4
8,Portugal,2021,27500.0,,82.1
9,Sweden,2021,53000.0,7.36,82.8


In [2]:
# Standardize country names
df["Country"] = df["Country"].replace({
    "UK": "United Kingdom",
    "U.K.": "United Kingdom"
    # Add more replacements if needed
})


In [3]:
# Remove commas and convert to numeric
df["GDP_per_capita"] = (
    df["GDP_per_capita"]
    .astype(str)
    .str.replace(",", "")
)

df["GDP_per_capita"] = pd.to_numeric(df["GDP_per_capita"], errors="coerce")

# Fill missing GDP with median (judgment call)
median_gdp = df["GDP_per_capita"].median()
df["GDP_per_capita"] = df["GDP_per_capita"].fillna(median_gdp)

# Replace negative GDPs with median
df.loc[df["GDP_per_capita"] < 0, "GDP_per_capita"] = median_gdp



In [4]:
# Remove duplicate rows
original_rows = len(df)
df = df.drop_duplicates()
print(f"Removed {original_rows - len(df)} duplicate rows")


Removed 2 duplicate rows


In [5]:
# Drop rows where Happiness score or Life expectancy is missing
df = df.dropna(subset=["Happiness_score", "Life_expectancy"])

# Optional: check how many rows remain
print(f"Rows remaining after cleaning: {len(df)}")


Rows remaining after cleaning: 6


In [6]:
# Quick stats to verify data makes sense
print(df.describe())

# Optional: check unique countries
print("Number of countries:", df["Country"].nunique())
print("Countries in dataset:", df["Country"].unique())

df.head(10)


         Year  GDP_per_capita  Happiness_score  Life_expectancy
count     6.0        6.000000         6.000000         6.000000
mean   2021.0    36400.000000         6.746667        82.250000
std       0.0    11411.748332         0.529704         1.007472
min    2021.0    18100.000000         6.000000        81.000000
25%    2021.0    35000.000000         6.425000        81.400000
50%    2021.0    35000.000000         6.750000        82.400000
75%    2021.0    40475.000000         7.165000        83.025000
max    2021.0    53000.000000         7.360000        83.400000
Number of countries: 6
Countries in dataset: <StringArray>
['United Kingdom', 'Greece', 'Spain', 'Germany', 'Italy', 'Sweden']
Length: 6, dtype: str


Unnamed: 0,Country,Year,GDP_per_capita,Happiness_score,Life_expectancy
0,United Kingdom,2021,42300.0,7.22,81.2
2,Greece,2021,18100.0,6.0,82.0
4,Spain,2021,35000.0,6.4,83.1
6,Germany,2021,35000.0,7.0,81.0
7,Italy,2021,35000.0,6.5,83.4
9,Sweden,2021,53000.0,7.36,82.8
