# Feature Engineering

## Ini yang 3 indikator terpakai (GDP, HDI, Suicide Rate)

In [12]:
import pandas as pd

df_hdi = pd.read_excel("./External-Dataset/HDI.xlsx")
df_hdi.columns = df_hdi.columns.str.strip()
df_hdi = df_hdi[['Country', 'Human Development Index (HDI)']].dropna()
df_hdi = df_hdi.rename(columns={
    'Country': 'Country Name',
    'Human Development Index (HDI)': 'HDI_2023'
})
df_hdi['Country Name'] = df_hdi['Country Name'].str.strip()

df = pd.read_csv(
    "./Dataset/WDICSV.csv",
    engine="python",
    encoding="utf-8-sig",
    on_bad_lines="skip",     # lewati baris rusak
    quotechar='"',           # pastikan tanda kutip ganda dikenali
    sep=",",                 # pastikan delimiter tetap koma
)

wdi_names = set(df['Country Name'].unique())
hdi_names = set(df_hdi['Country Name'].unique())

missing_in_wdi = sorted(list(hdi_names - wdi_names))
print("Negara di HDI tapi tidak ditemukan di WDI:", len(missing_in_wdi))
print(missing_in_wdi[:20])  # tampilkan sebagian

missing_in_hdi = sorted(list(wdi_names - hdi_names))
print("\nNegara di WDI tapi tidak ditemukan di HDI:", len(missing_in_hdi))
print(missing_in_hdi[:20])

Negara di HDI tapi tidak ditemukan di WDI: 195
['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan']

Negara di WDI tapi tidak ditemukan di HDI: 0
[]


In [13]:
# ini mau samain aja nama nama negaranya dari 2 dataset berbeda tadi
rename_map = {
    'Bahamas': 'Bahamas, The',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Congo': 'Congo, Rep.',
    'Congo (Democratic Republic of the)': 'Congo, Dem. Rep.',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Egypt': 'Egypt, Arab Rep.',
    'Eswatini (Kingdom of)': 'Eswatini',
    'Gambia': 'Gambia, The',
    'Hong Kong, China (SAR)': 'Hong Kong SAR, China',
    'Iran (Islamic Republic of)': 'Iran, Islamic Rep.',
    "Korea (Democratic People's Rep. of)": 'Korea, Dem. People’s Rep.',
    'Korea (Republic of)': 'Korea, Rep.',
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao PDR',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    'Moldova (Republic of)': 'Moldova',
    'Palestine, State of': 'West Bank and Gaza',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines'
}

In [14]:
df_hdi['Country Name'] = df_hdi['Country Name'].replace(rename_map)

wdi_names = set(df['Country Name'].unique())
hdi_names = set(df_hdi['Country Name'].unique())

missing_in_wdi = sorted(list(hdi_names - wdi_names))
print("Masih belum cocok:", missing_in_wdi)

Masih belum cocok: ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia, The', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong SAR, China', 'Hungary', 'I

In [15]:
extra_map = {
    'Korea, Dem. People’s Rep.': "Korea, Dem. People's Rep.",
    'Slovakia': 'Slovak Republic',
    'Tanzania (United Republic of)': 'Tanzania',
    'Türkiye': 'Turkiye',
    'Venezuela (Bolivarian Republic of)': 'Venezuela, RB',
    'Yemen': 'Yemen, Rep.'
}

rename_map.update(extra_map)

df_hdi['Country Name'] = df_hdi['Country Name'].replace(rename_map)

wdi_names = set(df['Country Name'].unique())
hdi_names = set(df_hdi['Country Name'].unique())

missing_in_wdi = sorted(list(hdi_names - wdi_names))
print("Masih belum cocok:", missing_in_wdi)

Masih belum cocok: ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia, The', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong SAR, China', 'Hungary', 'I

In [16]:
# df_merge_2023 = pd.merge(df_2023, df_hdi, on="Country Name", how="inner").dropna()

# print(f"Data akhir siap: {df_merge_2023.shape[0]} negara")
# df_merge_2023.head()

In [17]:
print(df.columns[-10:])

if '2023' in df.columns:
    print(df['2023'].notna().sum())
else:
    print("Kolom 2023 tidak ada di file WDI.")

Index(['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
       '2024;'],
      dtype='object')
0


In [18]:
# Cek tahun terakhir yang punya data non-null untuk masing-masing indikator
indicators = ["NY.GDP.PCAP.KD", "SH.STA.SUIC.P5"]

for ind in indicators:
    subset = df[df["Indicator Code"] == ind]
    valid_counts = subset.loc[:, "1960":].notna().sum().sort_index(ascending=False)
    latest_year = valid_counts[valid_counts > 0].index[0]
    print(f"{ind} → tahun terakhir dengan data:", latest_year)

IndexError: index 0 is out of bounds for axis 0 with size 0

Jadi ternyata yang suicide tahun 2023nya ngak apunya data, jadi ngak kedetect sebelumnya. Sebenarnya ngak masalah ambil data tahun 2024 biat gdp, 2021 buat suicide karna harusnya dua hal ini dari tahun ke tahun ngak signifikan bedanya. Cuma ahrus transparan aja kasi tau ini bukan pure bandingin semua data 2023. 

In [None]:
# Ini dibanding tadi ambil tahun beda beda buat dianalisis, yang GDP sama suicide ambil rata-rata 5 tahun terakhir aja
indicators = {
    "NY.GDP.PCAP.KD": "GDP_per_capita",
    "SH.STA.SUIC.P5": "Suicide_rate"
}

years = ['2018', '2019', '2020', '2021', '2022']

df_sel = df[df["Indicator Code"].isin(indicators.keys())].copy()

df_sel["mean_5yr"] = df_sel[years].mean(axis=1, skipna=True)

df_mean = df_sel[["Country Name", "Indicator Code", "mean_5yr"]]

df_mean = df_mean.drop_duplicates(subset=["Country Name", "Indicator Code"])

df_pivot = df_mean.pivot(index="Country Name", columns="Indicator Code", values="mean_5yr").reset_index()
df_pivot = df_pivot.rename(columns=indicators)

df_merge_mean = pd.merge(df_pivot, df_hdi, on="Country Name", how="inner").dropna()

df_merge_mean.head()

Unnamed: 0,Country Name,GDP_per_capita,Suicide_rate,HDI_2023
0,Afghanistan,485.022544,3.545,0.496
1,Albania,4702.593151,3.43,0.81
2,Algeria,4550.913354,2.0025,0.763
4,Angola,2528.206553,7.5475,0.616
5,Antigua and Barbuda,17011.981288,0.5575,0.851


In [None]:
import numpy as np
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt

for col in ['HDI_2023', 'GDP_per_capita', 'Suicide_rate']:
    df_merge_mean[col] = pd.to_numeric(df_merge_mean[col], errors='coerce')

df_merge_mean = df_merge_mean.dropna(subset=['HDI_2023', 'GDP_per_capita', 'Suicide_rate'])

corr_matrix = df_merge_mean[['HDI_2023', 'GDP_per_capita', 'Suicide_rate']].corr(method='pearson')

r_hdi_suicide, p_hdi_suicide = pearsonr(df_merge_mean['HDI_2023'], df_merge_mean['Suicide_rate'])
r_gdp_suicide, p_gdp_suicide = pearsonr(df_merge_mean['GDP_per_capita'], df_merge_mean['Suicide_rate'])

print("📈 Korelasi Matrix:")
print(corr_matrix.round(3))
print(f"\nHDI ↔ Suicide Rate: r = {r_hdi_suicide:.3f}, p = {p_hdi_suicide:.4f}")
print(f"GDP ↔ Suicide Rate: r = {r_gdp_suicide:.3f}, p = {p_gdp_suicide:.4f}")

📈 Korelasi Matrix:
                HDI_2023  GDP_per_capita  Suicide_rate
HDI_2023           1.000           0.701         0.238
GDP_per_capita     0.701           1.000         0.193
Suicide_rate       0.238           0.193         1.000

HDI ↔ Suicide Rate: r = 0.238, p = 0.0013
GDP ↔ Suicide Rate: r = 0.193, p = 0.0096


# Yang di atas itu semua cleaning drop Nan, di bawah ini lebih sesuai etika ASA

Kami memilih untuk tidak menghapus seluruh data yang hilang, melainkan melakukan interpolasi dan penandaan missingness, untuk menjaga inklusivitas negara-negara berkembang yang datanya tidak lengkap, sesuai prinsip keadilan dalam statistik global (ASA Ethical Guideline 2.2).

In [21]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt

# ================================
# 1️⃣ LOAD & BERSIHKAN HDI
# ================================
df_hdi = pd.read_excel("./External-Dataset/HDI.xlsx")
df_hdi.columns = df_hdi.columns.str.strip()
df_hdi = df_hdi[['Country', 'Human Development Index (HDI)']]
df_hdi = df_hdi.rename(columns={
    'Country': 'Country Name',
    'Human Development Index (HDI)': 'HDI_2023'
})
df_hdi['Country Name'] = df_hdi['Country Name'].str.strip()

# ================================
# 2️⃣ LOAD & SAMAKAN NAMA NEGARA
# ================================
df = pd.read_csv(
    "./Dataset/WDICSV.csv",
    engine="python",
    encoding="utf-8-sig",
    on_bad_lines="skip",     # lewati baris rusak
    quotechar='"',           # pastikan tanda kutip ganda dikenali
    sep=",",                 # pastikan delimiter tetap koma
)

rename_map = {
    'Bahamas': 'Bahamas, The',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Congo': 'Congo, Rep.',
    'Congo (Democratic Republic of the)': 'Congo, Dem. Rep.',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Egypt': 'Egypt, Arab Rep.',
    'Eswatini (Kingdom of)': 'Eswatini',
    'Gambia': 'Gambia, The',
    'Hong Kong, China (SAR)': 'Hong Kong SAR, China',
    'Iran (Islamic Republic of)': 'Iran, Islamic Rep.',
    "Korea (Democratic People's Rep. of)": 'Korea, Dem. People’s Rep.',
    'Korea (Republic of)': 'Korea, Rep.',
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao PDR',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    'Moldova (Republic of)': 'Moldova',
    'Palestine, State of': 'West Bank and Gaza',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Korea, Dem. People’s Rep.': "Korea, Dem. People's Rep.",
    'Slovakia': 'Slovak Republic',
    'Tanzania (United Republic of)': 'Tanzania',
    'Türkiye': 'Turkiye',
    'Venezuela (Bolivarian Republic of)': 'Venezuela, RB',
    'Yemen': 'Yemen, Rep.'
}
df_hdi['Country Name'] = df_hdi['Country Name'].replace(rename_map)

# ================================
# 3️⃣ PILIH INDIKATOR & HITUNG RATA-RATA 5 TAHUN
# ================================
indicators = {
    "NY.GDP.PCAP.KD": "GDP_per_capita",
    "SH.STA.SUIC.P5": "Suicide_rate"
}
years = ['2018', '2019', '2020', '2021', '2022']

df_sel = df[df["Indicator Code"].isin(indicators.keys())].copy()

# Hitung mean dan coverage (berapa persen tahun yang punya data)
df_sel["mean_5yr"] = df_sel[years].mean(axis=1, skipna=True)
df_sel["data_coverage"] = df_sel[years].notna().sum(axis=1) / len(years)

df_mean = df_sel[["Country Name", "Indicator Code", "mean_5yr", "data_coverage"]].drop_duplicates()

df_pivot = df_mean.pivot(index="Country Name", columns="Indicator Code", values="mean_5yr").reset_index()
df_pivot = df_pivot.rename(columns=indicators)

df_coverage = df_mean.pivot(index="Country Name", columns="Indicator Code", values="data_coverage").reset_index()
df_coverage = df_coverage.rename(columns={code: indicators[code] + "_coverage" for code in indicators})

# ================================
# 4️⃣ GABUNGKAN SEMUA DATA
# ================================
df_merge_mean = (
    df_pivot
    .merge(df_hdi, on="Country Name", how="left")
    .merge(df_coverage, on="Country Name", how="left")
)

# Interpolasi nilai yang hilang untuk inklusivitas
df_merge_mean[['GDP_per_capita', 'Suicide_rate', 'HDI_2023']] = df_merge_mean[
    ['GDP_per_capita', 'Suicide_rate', 'HDI_2023']
].interpolate(method='linear', limit_direction='both')

# Tambahkan flag etis: data reliability
df_merge_mean["Low_data_quality_flag"] = np.where(
    (df_merge_mean["GDP_per_capita_coverage"] < 0.6) |
    (df_merge_mean["Suicide_rate_coverage"] < 0.6),
    "⚠️ Low data reliability",
    "✅ Sufficient data"
)

print("\n✅ Data akhir siap dipakai secara etis & transparan:")
print(df_merge_mean.head())

# ================================
# 5️⃣ ANALISIS KORELASI
# ================================
for col in ['HDI_2023', 'GDP_per_capita', 'Suicide_rate']:
    df_merge_mean[col] = pd.to_numeric(df_merge_mean[col], errors='coerce')

corr_matrix = df_merge_mean[['HDI_2023', 'GDP_per_capita', 'Suicide_rate']].corr(method='pearson')

r_hdi_suicide, p_hdi_suicide = pearsonr(df_merge_mean['HDI_2023'], df_merge_mean['Suicide_rate'])
r_gdp_suicide, p_gdp_suicide = pearsonr(df_merge_mean['GDP_per_capita'], df_merge_mean['Suicide_rate'])

print("\n📊 Korelasi Matrix:")
print(corr_matrix.round(3))
print(f"\nHDI ↔ Suicide Rate: r = {r_hdi_suicide:.3f}, p = {p_hdi_suicide:.4f}")
print(f"GDP ↔ Suicide Rate: r = {r_gdp_suicide:.3f}, p = {p_gdp_suicide:.4f}")

# ================================
# 6️⃣ CATATAN ETIS
# ================================
print("\n📘 Catatan Etis:")
print("Kami tidak menghapus seluruh observasi dengan data hilang.")
print("Sebaliknya, kami melakukan interpolasi dan memberi penanda 'Low data reliability'")
print("untuk negara dengan cakupan data <60%, agar analisis tetap inklusif sesuai prinsip ASA 2.2.")

KeyError: "['GDP_per_capita', 'Suicide_rate'] not in index"

Kami melakukan interpolasi terbatas hanya pada GDP per kapita, mengingat indikator ini memiliki pola pertumbuhan yang relatif stabil antar tahun. Untuk indikator sensitif seperti tingkat bunuh diri, kami mempertahankan missing value agar tidak mengaburkan realitas sosial. Kami juga menandai negara dengan cakupan data rendah, untuk memastikan transparansi dan keadilan dalam interpretasi hasil.

## Time Series Dataset in 2000s

Lihat apakah pertumbuhan ekonomi (GDP per capita) di suatu negara berjalan seiring atau berlawanan dengan tren bunuh diri.

In [None]:
indicators = {
    "NY.GDP.PCAP.KD": "GDP_per_capita",
    "SH.STA.SUIC.P5": "Suicide_rate"
}

years = [str(y) for y in range(2000, 2024)]

df_long = df[df["Indicator Code"].isin(indicators.keys())][["Country Name", "Indicator Code"] + years]

df_melt = df_long.melt(id_vars=["Country Name", "Indicator Code"], var_name="Year", value_name="Value")
df_pivot = df_melt.pivot_table(index=["Country Name", "Year"], columns="Indicator Code", values="Value").reset_index()
df_pivot = df_pivot.rename(columns=indicators)

df_pivot["Year"] = df_pivot["Year"].astype(int)
df_pivot = df_pivot.dropna(subset=["GDP_per_capita", "Suicide_rate"])

df_pivot.head()

Indicator Code,Country Name,Year,GDP_per_capita,Suicide_rate
0,Afghanistan,2000,308.31827,4.36
1,Afghanistan,2001,277.118051,4.38
2,Afghanistan,2002,338.139974,4.26
3,Afghanistan,2003,346.071627,4.24
4,Afghanistan,2004,338.637274,4.23


In [None]:
# Ini versi cleaning yang lebih etis dibanding di atas ya

indicators = {
    "NY.GDP.PCAP.KD": "GDP_per_capita",
    "SH.STA.SUIC.P5": "Suicide_rate"
}

years = [str(y) for y in range(2000, 2024)]

df_long = df[df["Indicator Code"].isin(indicators.keys())][["Country Name", "Indicator Code"] + years]

# Ubah ke format long
df_melt = df_long.melt(id_vars=["Country Name", "Indicator Code"], 
                       var_name="Year", value_name="Value")

# Pivot ke wide format
df_pivot = df_melt.pivot_table(index=["Country Name", "Year"], 
                               columns="Indicator Code", 
                               values="Value").reset_index()
df_pivot = df_pivot.rename(columns=indicators)
df_pivot["Year"] = df_pivot["Year"].astype(int)

# ======================================
# 🌱 Bagian Etis & Inklusif
# ======================================

# 1️⃣ Hitung proporsi data yang tersedia per negara
coverage = df_pivot.groupby("Country Name")[["GDP_per_capita", "Suicide_rate"]].apply(
    lambda x: x.notna().mean()
).reset_index()
coverage.columns = ["Country Name", "GDP_per_capita_coverage", "Suicide_rate_coverage"]

# 2️⃣ Interpolasi data hilang antar tahun (jaga kontinuitas, tidak manipulatif)
df_pivot = df_pivot.sort_values(["Country Name", "Year"])
df_pivot[["GDP_per_capita", "Suicide_rate"]] = (
    df_pivot.groupby("Country Name")[["GDP_per_capita", "Suicide_rate"]]
    .apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
    .reset_index(drop=True)
)

# 3️⃣ Tambahkan flag reliabilitas berdasarkan coverage historis
df_pivot = df_pivot.merge(coverage, on="Country Name", how="left")
df_pivot["Data_quality_flag"] = np.where(
    (df_pivot["GDP_per_capita_coverage"] < 0.6) | 
    (df_pivot["Suicide_rate_coverage"] < 0.6),
    "⚠️ Low data reliability",
    "✅ Sufficient data"
)

# 4️⃣ Dokumentasi etis
print("\n📘 Catatan Etis:")
print("- Tidak semua negara memiliki data lengkap antara 2000–2023.")
print("- Missing values diinterpolasi agar tren tetap terwakili.")
print("- Negara dengan <60% data valid ditandai sebagai 'Low data reliability'.")
print("- Pendekatan ini menjaga keadilan statistik dan menghindari bias terhadap negara berkembang.")

df_pivot.head()


📘 Catatan Etis:
- Tidak semua negara memiliki data lengkap antara 2000–2023.
- Missing values diinterpolasi agar tren tetap terwakili.
- Negara dengan <60% data valid ditandai sebagai 'Low data reliability'.
- Pendekatan ini menjaga keadilan statistik dan menghindari bias terhadap negara berkembang.


Unnamed: 0,Country Name,Year,GDP_per_capita,Suicide_rate,GDP_per_capita_coverage,Suicide_rate_coverage,Data_quality_flag
0,Afghanistan,2000,308.31827,4.36,1.0,0.916667,✅ Sufficient data
1,Afghanistan,2001,277.118051,4.38,1.0,0.916667,✅ Sufficient data
2,Afghanistan,2002,338.139974,4.26,1.0,0.916667,✅ Sufficient data
3,Afghanistan,2003,346.071627,4.24,1.0,0.916667,✅ Sufficient data
4,Afghanistan,2004,338.637274,4.23,1.0,0.916667,✅ Sufficient data


## Who Handles Growth Best?

Hitung seberapa besar suicide rate berubah terhadap kenaikan GDP per negara.

In [None]:
df = pd.read_csv("WDICSV.csv")

indicators = ['NY.GDP.PCAP.KD', 'SH.STA.SUIC.P5']
df_long = df[df["Indicator Code"].isin(indicators)]

years = [str(y) for y in range(2000, 2024)]
df_long = df_long.melt(
    id_vars=["Country Name", "Indicator Code"],
    value_vars=years,
    var_name="Year",
    value_name="Value"
)
df_long["Year"] = df_long["Year"].astype(int)
df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce")

df_long.head()

Unnamed: 0,Country Name,Indicator Code,Year,Value
0,Africa Eastern and Southern,NY.GDP.PCAP.KD,2000,1196.929766
1,Africa Eastern and Southern,SH.STA.SUIC.P5,2000,7.821675
2,Africa Western and Central,NY.GDP.PCAP.KD,2000,1184.360138
3,Africa Western and Central,SH.STA.SUIC.P5,2000,5.663385
4,Arab World,NY.GDP.PCAP.KD,2000,4915.070984


In [None]:
df_pivot = df_long.pivot_table(
    index=["Country Name", "Year"],
    columns="Indicator Code",
    values="Value"
).reset_index()

df_pivot = df_pivot.rename(columns={
    'NY.GDP.PCAP.KD': 'GDP_per_capita',
    'SH.STA.SUIC.P5': 'Suicide_rate'
})

df_pivot.head()

Indicator Code,Country Name,Year,GDP_per_capita,Suicide_rate
0,Afghanistan,2000,308.31827,4.36
1,Afghanistan,2001,277.118051,4.38
2,Afghanistan,2002,338.139974,4.26
3,Afghanistan,2003,346.071627,4.24
4,Afghanistan,2004,338.637274,4.23


In [None]:
df_pivot = df_pivot.sort_values(["Country Name", "Year"])
df_pivot["GDP_growth"] = df_pivot.groupby("Country Name")["GDP_per_capita"].pct_change() * 100
df_pivot["Suicide_change"] = df_pivot.groupby("Country Name")["Suicide_rate"].pct_change() * 100

  df_pivot["GDP_growth"] = df_pivot.groupby("Country Name")["GDP_per_capita"].pct_change() * 100
  df_pivot["Suicide_change"] = df_pivot.groupby("Country Name")["Suicide_rate"].pct_change() * 100


In [None]:
resilience_df = (
    df_pivot[df_pivot["Year"] >= 2000]
    .groupby("Country Name")[["GDP_growth", "Suicide_change"]]
    .mean()
    .reset_index()
)

In [None]:
resilience_df = pd.merge(resilience_df, df_hdi, on="Country Name", how="left")
resilience_df.head()

Unnamed: 0,Country Name,GDP_growth,Suicide_change,HDI_2023
0,Afghanistan,1.295866,-0.811508,0.496
1,Africa Eastern and Southern,0.742705,0.303258,
2,Africa Western and Central,1.961359,0.086982,
3,Albania,4.506631,-1.423423,0.81
4,Algeria,1.212007,-2.545643,0.763


In [None]:
# Ini versi lebih transparan dari yang atas ya

# === 1️⃣ Siapkan data dasar ===
df = pd.read_csv("WDICSV.csv")

indicators = ['NY.GDP.PCAP.KD', 'SH.STA.SUIC.P5']
df_long = df[df["Indicator Code"].isin(indicators)]

years = [str(y) for y in range(2000, 2024)]
df_long = df_long.melt(
    id_vars=["Country Name", "Indicator Code"],
    value_vars=years,
    var_name="Year",
    value_name="Value"
)
df_long["Year"] = df_long["Year"].astype(int)
df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce")

# === 2️⃣ Pivot ke wide format ===
df_pivot = df_long.pivot_table(
    index=["Country Name", "Year"],
    columns="Indicator Code",
    values="Value"
).reset_index()

df_pivot = df_pivot.rename(columns={
    'NY.GDP.PCAP.KD': 'GDP_per_capita',
    'SH.STA.SUIC.P5': 'Suicide_rate'
})

# === 3️⃣ Interpolasi untuk keadilan data (jangan drop negara berkembang) ===
df_pivot = df_pivot.sort_values(["Country Name", "Year"])
df_pivot[["GDP_per_capita", "Suicide_rate"]] = (
    df_pivot.groupby("Country Name")[["GDP_per_capita", "Suicide_rate"]]
    .apply(lambda g: g.interpolate(method='linear', limit_direction='both'))
    .reset_index(drop=True)
)

# === 4️⃣ Hitung pertumbuhan & perubahan ===
df_pivot["GDP_growth"] = df_pivot.groupby("Country Name")["GDP_per_capita"].pct_change() * 100
df_pivot["Suicide_change"] = df_pivot.groupby("Country Name")["Suicide_rate"].pct_change() * 100

# Batasi outlier ekstrem agar tidak menyesatkan
df_pivot["GDP_growth"] = df_pivot["GDP_growth"].clip(-100, 100)
df_pivot["Suicide_change"] = df_pivot["Suicide_change"].clip(-100, 100)

# === 5️⃣ Hitung coverage data ===
coverage = df_pivot.groupby("Country Name")[["GDP_per_capita", "Suicide_rate"]].apply(
    lambda x: x.notna().mean()
).reset_index()
coverage.columns = ["Country Name", "GDP_coverage", "Suicide_coverage"]

# === 6️⃣ Hitung resilience per negara ===
resilience_df = (
    df_pivot.groupby("Country Name")[["GDP_growth", "Suicide_change"]]
    .mean()
    .reset_index()
)

# === 7️⃣ Gabungkan dengan HDI dan coverage ===
resilience_df = (
    resilience_df
    .merge(df_hdi, on="Country Name", how="left")
    .merge(coverage, on="Country Name", how="left")
)

# === 8️⃣ Tambahkan flag reliabilitas ===
resilience_df["Data_quality_flag"] = np.where(
    (resilience_df["GDP_coverage"] < 0.6) | (resilience_df["Suicide_coverage"] < 0.6),
    "⚠️ Low data reliability",
    "✅ Reliable"
)

print("✅ Data Resilience dengan Pertimbangan Etis:")
print(resilience_df.head())

print("\n📘 Catatan Etis:")
print("- Missing data diinterpolasi agar tren tidak bias terhadap negara berkembang.")
print("- Negara dengan data coverage <60% ditandai sebagai 'Low data reliability'.")
print("- Pertumbuhan ekstrem dibatasi antara -100% s.d. +100% untuk mencegah outlier misleading.")

✅ Data Resilience dengan Pertimbangan Etis:
                  Country Name  GDP_growth  Suicide_change HDI_2023  \
0                  Afghanistan    1.295866       -0.811508    0.496   
1  Africa Eastern and Southern    0.742705        0.303258      NaN   
2   Africa Western and Central    1.961359        0.086982      NaN   
3                      Albania    4.506631       -1.423423     0.81   
4                      Algeria    1.212007       -2.545643    0.763   

   GDP_coverage  Suicide_coverage Data_quality_flag  
0           1.0               1.0        ✅ Reliable  
1           1.0               1.0        ✅ Reliable  
2           1.0               1.0        ✅ Reliable  
3           1.0               1.0        ✅ Reliable  
4           1.0               1.0        ✅ Reliable  

📘 Catatan Etis:
- Missing data diinterpolasi agar tren tidak bias terhadap negara berkembang.
- Negara dengan data coverage <60% ditandai sebagai 'Low data reliability'.
- Pertumbuhan ekstrem dibatasi ant

  df_pivot["GDP_growth"] = df_pivot.groupby("Country Name")["GDP_per_capita"].pct_change() * 100
  df_pivot["Suicide_change"] = df_pivot.groupby("Country Name")["Suicide_rate"].pct_change() * 100


# Feature Engineering

In [None]:
# ==========================================
# 🎯 FEATURE ENGINEERING FOR “THE PRICE OF PROGRESS”
# ==========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Asumsikan kamu sudah punya dataframe gabungan bernama df
# Columns example: ['Country', 'Year', 'HDI', 'Suicide_rate', 'GDP_per_capita', 'Gini', 'Happiness', ...]

# ===============================
# 1️⃣ Handle Missing Values (basic imputation)
# ===============================
# Impute dengan interpolation (time-based) dan fallback dengan median by country
df = df.sort_values(['Country', 'Year'])
df['HDI'] = df.groupby('Country')['HDI'].apply(lambda x: x.interpolate(method='linear').fillna(x.median()))
df['Suicide_rate'] = df.groupby('Country')['Suicide_rate'].apply(lambda x: x.interpolate(method='linear').fillna(x.median()))
df['GDP_per_capita'] = df.groupby('Country')['GDP_per_capita'].apply(lambda x: x.interpolate(method='linear').fillna(x.median()))
df['Gini'] = df.groupby('Country')['Gini'].apply(lambda x: x.interpolate(method='linear').fillna(x.median()))
df['Happiness'] = df.groupby('Country')['Happiness'].apply(lambda x: x.interpolate(method='linear').fillna(x.median()))

# ===============================
# 2️⃣ Nonlinear Features
# ===============================
df['HDI_sq'] = df['HDI'] ** 2
df['log_GDP'] = np.log1p(df['GDP_per_capita'])  # log(1+GDP) to handle 0
df['GDP_growth'] = df.groupby('Country')['GDP_per_capita'].pct_change() * 100  # percent growth

# ===============================
# 3️⃣ Composite Indices
# ===============================
# HDI disesuaikan dengan ketimpangan
df['Inequality_adj_HDI'] = df['HDI'] * (1 - df['Gini'] / 100)

# Selisih antara HDI dan Happiness (gap development vs happiness)
df['Happiness_gap'] = df['Happiness'] - df['HDI']

# Jika ada variabel unemployment atau urbanization:
if 'Unemployment' in df.columns:
    df['Unemployment_rate_sq'] = df['Unemployment'] ** 2

if 'Urbanization' in df.columns:
    df['Urbanization_log'] = np.log1p(df['Urbanization'])

# ===============================
# 4️⃣ Temporal & Lag Features
# ===============================
# Melihat efek waktu sebelumnya (lagged variables)
for col in ['HDI', 'GDP_per_capita', 'Suicide_rate']:
    df[f'{col}_lag1'] = df.groupby('Country')[col].shift(1)
    df[f'{col}_change'] = df[col] - df[f'{col}_lag1']

# ===============================
# 5️⃣ Standardization (Z-scores for analysis & PCA)
# ===============================
scaler = StandardScaler()
cols_to_scale = ['HDI', 'Suicide_rate', 'GDP_per_capita', 'Gini', 'Happiness',
                 'HDI_sq', 'log_GDP', 'GDP_growth', 'Inequality_adj_HDI', 'Happiness_gap']
scaled = pd.DataFrame(scaler.fit_transform(df[cols_to_scale]), columns=[f'{c}_z' for c in cols_to_scale])
df = pd.concat([df, scaled], axis=1)

# ===============================
# 6️⃣ Region & Income Group Encoding (if available)
# ===============================
# Jika kamu punya kolom 'Region' atau 'IncomeGroup'
if 'Region' in df.columns:
    region_dummies = pd.get_dummies(df['Region'], prefix='Region')
    df = pd.concat([df, region_dummies], axis=1)

if 'IncomeGroup' in df.columns:
    income_dummies = pd.get_dummies(df['IncomeGroup'], prefix='Income')
    df = pd.concat([df, income_dummies], axis=1)

# ===============================
# 7️⃣ Composite Mental Health Proxy (optional)
# ===============================
# Jika kamu punya variabel alkohol, unemployment, dll → buat indeks tekanan sosial
possible_stressors = [col for col in ['Unemployment', 'Alcohol_consumption', 'Gini'] if col in df.columns]
if len(possible_stressors) > 0:
    df['Social_stress_index'] = df[possible_stressors].mean(axis=1)

# ===============================
# 8️⃣ Final Check & Save
# ===============================
print("✅ Feature Engineering Completed!")
print("Columns now available:", df.columns.tolist())

# Save processed data
df.to_csv("data/processed/feature_engineered_dataset.csv", index=False)
