In [15]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.io as pio
# Make "notebook" plotly's default renderer for figures displayed in this session.
pio.renderers.default = "notebook"

# 1. LOAD & CLEAN MENTAL HEALTH (PERCENT ONLY)

# low_memory=False makes pandas infer each column's dtype from the whole file
# in a single pass. The default chunked inference raises a DtypeWarning on this
# file because some columns mix numeric and text values.
mental_raw = pd.read_csv(
    "Mental health Depression disorder Data.csv",
    low_memory=False,
)

# Columns with disorder prevalence info
disorder_cols = [
    "Schizophrenia (%)",
    "Bipolar disorder (%)",
    "Eating disorders (%)",
    "Anxiety disorders (%)",
    "Drug use disorders (%)",
    "Depression (%)",
    "Alcohol use disorders (%)",
]

# Coerce those columns to numeric so we can filter; unparseable values become NaN
for c in disorder_cols:
    mental_raw[c] = pd.to_numeric(mental_raw[c], errors="coerce")

# Keep only rows where ALL of these look like actual percentages (<= 100).
# NaN compares as False, so rows with any missing or non-numeric prevalence
# value are dropped by this mask as well.
mask_pct = (mental_raw[disorder_cols] <= 100).all(axis=1)
mental_pct = mental_raw.loc[mask_pct].copy()

# Rename to clean names
mental = mental_pct.rename(columns={
    "Entity": "country",
    "Code": "country_code",
    "Year": "year",
    "Schizophrenia (%)": "schizophrenia_pct",
    "Bipolar disorder (%)": "bipolar_pct",
    "Eating disorders (%)": "eating_disorders_pct",
    "Anxiety disorders (%)": "anxiety_pct",
    "Drug use disorders (%)": "drug_use_pct",
    "Depression (%)": "depression_pct",
    "Alcohol use disorders (%)": "alcohol_use_pct",
})

# Year as a nullable integer (unparseable years become <NA> and are dropped below).
# The prevalence columns were already coerced to numeric above, before the
# copy/rename, so they need no second conversion here.
mental["year"] = pd.to_numeric(mental["year"], errors="coerce").astype("Int64")

# Drop rows with missing key info
mental = mental.dropna(subset=["country", "year"]).reset_index(drop=True)

print("Mental health shape:", mental.shape)
print(mental.head(), "\n")


# 2. LOAD & CLEAN WORLD HAPPINESS (2015–2019)

happy_files = ["2015.csv", "2016.csv", "2017.csv", "2018.csv", "2019.csv"]

# Canonical columns kept from every yearly file (only those actually present)
target_cols = [
    "country",
    "year",
    "happiness_score",
    "gdp_per_capita",
    "social_support",
    "healthy_life_expectancy",
    "freedom",
    "generosity",
    "perceptions_of_corruption",
]


def _canonical_happiness_column(column_name):
    """Map one raw column header to its canonical name.

    Matching is case-insensitive. The rules are checked in a fixed order and
    the first match wins.

    Parameters
    ----------
    column_name : str
        Header as it appears in the file (already stripped of whitespace).

    Returns
    -------
    str or None
        The canonical column name, or None when no rule matches.
    """
    lc = column_name.lower()

    if lc.startswith("country"):
        return "country"

    # Happiness score: handles "Happiness Score", "Happiness.Score", and "Score"
    if ("happiness" in lc and "score" in lc) or lc in ("happiness", "score"):
        return "happiness_score"

    # GDP per capita: handles old and new column names
    if "economy" in lc or "gdp per capita" in lc or "gdp.per.capita" in lc:
        return "gdp_per_capita"

    # Social support / Family
    if lc.startswith("family") or "social support" in lc:
        return "social_support"

    # Healthy life expectancy variations, space- or dot-separated
    if "life expectancy" in lc or "life.expectancy" in lc:
        return "healthy_life_expectancy"

    if lc.startswith("freedom"):
        return "freedom"

    if "generosity" in lc:
        return "generosity"

    # Any header mentioning corruption (including "Perceptions of corruption")
    if "corruption" in lc:
        return "perceptions_of_corruption"

    return None


happy_dfs = []
for f in happy_files:
    tmp = pd.read_csv(f)
    tmp.columns = [c.strip() for c in tmp.columns]

    # Infer year from the filename. A name without a 20xx year gives None,
    # and those rows are removed by the dropna on "year" after the concat.
    year_match = re.search(r"(20\d{2})", f)
    year = int(year_match.group(1)) if year_match else None
    tmp["year"] = year

    rename_map = {}
    for c in tmp.columns:
        canonical = _canonical_happiness_column(c)
        if canonical is not None:
            rename_map[c] = canonical

    tmp = tmp.rename(columns=rename_map)
    tmp = tmp[[c for c in target_cols if c in tmp.columns]]

    happy_dfs.append(tmp)

happiness = pd.concat(happy_dfs, ignore_index=True)
happiness = happiness.dropna(subset=["country", "year"]).reset_index(drop=True)
happiness["year"] = happiness["year"].astype(int)

print("Happiness shape:", happiness.shape)
print(happiness.head(), "\n")



# 3. STANDARDIZE COUNTRY NAMES & MERGE


def clean_country_name(x):
    """Normalize a country label into a lowercase key used for joining.

    Missing values are returned as NaN. Otherwise the value is converted to a
    string and trimmed, a trailing parenthesised qualifier is removed, "&" is
    replaced by "and", the text is lowercased, and known alternative spellings
    are mapped to one common name.
    """
    if pd.isna(x):
        return np.nan

    # Remove a trailing "(...)" part, e.g. "Bolivia (Plurinational State of)"
    without_qualifier = re.sub(r"\s*\(.*\)$", "", str(x).strip())
    key = without_qualifier.replace("&", "and").lower()

    # Alternative spellings -> the single name used on both sides of the merge
    aliases = {
        "united states of america": "united states",
        "united states": "united states",
        "russian federation": "russia",
        "czech republic": "czechia",
        "viet nam": "vietnam",
        "korea, republic of": "south korea",
        "korea, dem. people’s rep.": "north korea",
        "korea, dem. people’s rep": "north korea",
        "iran, islamic republic of": "iran",
        "egypt, arab rep.": "egypt",
    }

    if key in aliases:
        return aliases[key]
    return key

# Add the normalized join key to both frames and discard rows that have none
mental = (
    mental
    .assign(country_clean=mental["country"].apply(clean_country_name))
    .dropna(subset=["country_clean"])
)
happiness = (
    happiness
    .assign(country_clean=happiness["country"].apply(clean_country_name))
    .dropna(subset=["country_clean"])
)

# Inner join: keep only (country, year) pairs present in both sources.
# Both frames carry a "country" column, so the suffixes tell them apart.
merged = mental.merge(
    happiness,
    on=["country_clean", "year"],
    how="inner",
    suffixes=("_mental", "_happy"),
)

# Single display name: the mental-health spelling, else the happiness one
merged["country"] = merged["country_mental"].fillna(merged["country_happy"])

print("Merged shape:", merged.shape)
merged.head()

Mental health shape: (6468, 11)
   index      country country_code  year  schizophrenia_pct  bipolar_pct  \
0      0  Afghanistan          AFG  1990           0.160560     0.697779   
1      1  Afghanistan          AFG  1991           0.160312     0.697961   
2      2  Afghanistan          AFG  1992           0.160135     0.698107   
3      3  Afghanistan          AFG  1993           0.160037     0.698257   
4      4  Afghanistan          AFG  1994           0.160022     0.698469   

   eating_disorders_pct  anxiety_pct  drug_use_pct  depression_pct  \
0              0.101855     4.828830      1.677082        4.071831   
1              0.099313     4.829740      1.684746        4.079531   
2              0.096692     4.831108      1.694334        4.088358   
3              0.094336     4.830864      1.705320        4.096190   
4              0.092439     4.829423      1.716069        4.099582   

   alcohol_use_pct  
0         0.672404  
1         0.671768  
2         0.670644  
3     


Columns (5,6) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,index,country_mental,country_code,year,schizophrenia_pct,bipolar_pct,eating_disorders_pct,anxiety_pct,drug_use_pct,depression_pct,...,country_clean,country_happy,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,generosity,perceptions_of_corruption,country
0,25,Afghanistan,AFG,2015,0.165895,0.707275,0.104445,4.875201,2.543144,4.135419,...,afghanistan,Afghanistan,3.575,0.31982,0.30285,0.30335,0.23414,0.3651,0.09719,Afghanistan
1,26,Afghanistan,AFG,2016,0.166035,0.707686,0.105808,4.878875,2.513553,4.135694,...,afghanistan,Afghanistan,3.36,0.38227,0.11037,0.17344,0.1643,0.31268,0.07112,Afghanistan
2,27,Afghanistan,AFG,2017,0.166158,0.708089,0.107142,4.882481,2.473934,4.136347,...,afghanistan,Afghanistan,3.794,0.401477,0.581543,0.180747,0.10618,0.311871,0.061158,Afghanistan
3,53,Albania,ALB,2015,0.20063,0.704161,0.171095,3.388212,0.513158,2.204454,...,albania,Albania,4.959,0.87867,0.80434,0.81325,0.35733,0.14272,0.06413,Albania
4,54,Albania,ALB,2016,0.200845,0.704335,0.172627,3.386891,0.515036,2.206507,...,albania,Albania,4.655,0.9553,0.50163,0.73007,0.31866,0.1684,0.05301,Albania


In [2]:
# One frame per year. Each slice gets a fresh 0-based index so all three
# frames share the same index layout (previously only the 2015 slice did).
merge_2015 = merged[merged["year"] == 2015].reset_index(drop=True)
merge_2016 = merged[merged["year"] == 2016].reset_index(drop=True)
merge_2017 = merged[merged["year"] == 2017].reset_index(drop=True)

In [10]:
def _happiness_choropleth(year_frame, year):
    """Build the happiness-score world map for one year and save it as a PNG.

    Parameters
    ----------
    year_frame : pandas.DataFrame
        Rows for a single year; the "country_code" column is used as the
        ISO-3 location and "happiness_score" as the colour value.
    year : int
        Year shown in the figure title and used in the output file name.

    Returns
    -------
    The plotly figure, after it has been written to "WorldHappiness<year>.png".
    """
    fig = px.choropleth(
        year_frame,
        locations="country_code",
        color="happiness_score",
        color_continuous_scale="Viridis",
        title=f"Happiness Score by Country {year}",
        locationmode="ISO-3",
    )
    fig.write_image(f"WorldHappiness{year}.png", scale=3)
    return fig


# Same map for each year; only the input frame and the year label differ
fig15 = _happiness_choropleth(merge_2015, 2015)
fig16 = _happiness_choropleth(merge_2016, 2016)
fig17 = _happiness_choropleth(merge_2017, 2017)

In [29]:
# Long format: one row per (country, disorder) pair, so the three prevalence
# columns can share one x axis and be coloured by disorder.
long_mental = merge_2017[
    ["happiness_score", "anxiety_pct", "drug_use_pct", "depression_pct"]
].melt(id_vars="happiness_score", var_name="variable", value_name="value")

scatter = px.scatter(long_mental, x="value", y="happiness_score", color="variable",
           title="Mental Health vs Country Happiness Score in 2017")

# Legend labels keyed by source column. Renaming by trace name instead of by
# position keeps each label on the right series even if the trace order changes.
trace_labels = {
    "anxiety_pct": "People with Anxiety",
    "drug_use_pct": "People who Suffer with Drug Use",
    "depression_pct": "People with Depression",
}
scatter.for_each_trace(
    lambda trace: trace.update(name=trace_labels.get(trace.name, trace.name))
)
scatter.update_layout(
    legend_title_text='Mental Health Problem',
    xaxis_title="Percentage",
    yaxis_title="Happiness Score"
)
scatter.write_image("MentalHealthVHappinessScore2017.png", scale = 3)