# **Data Preparation**

### **Load Dataset**

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor


# Read the experiment CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("suicide_experiment.csv")
df.head(n=5)  # preview the first five rows

Unnamed: 0,year,sex,age,suicides/100k pop,HDI for year,gdp_per_capita ($),generation
0,1987,male,15-24 years,6.71,0.793666,796.0,Generation X
1,1987,male,35-54 years,5.19,0.793666,796.0,Silent
2,1987,female,15-24 years,4.83,0.793666,796.0,Generation X
3,1987,male,75+ years,4.59,0.793666,796.0,G.I. Generation
4,1987,male,25-34 years,3.28,0.793666,796.0,Boomers


### **Drop Unnecessary Columns**

- Sex, age, and generation were removed because the study focuses on macro-level, country-wide wellbeing; year is also dropped because it is not used as a feature.

In [11]:
# DataFrame.drop returns a new frame, so `df` itself is left untouched and
# no explicit copy is required beforehand.
df_exp = df.drop(columns=["sex", "age", "generation", "year"])
df_exp.head()

Unnamed: 0,suicides/100k pop,HDI for year,gdp_per_capita ($)
0,6.71,0.793666,796.0
1,5.19,0.793666,796.0
2,4.83,0.793666,796.0
3,4.59,0.793666,796.0
4,3.28,0.793666,796.0


# **Experiment**
- The dataset was split into 80–20 and 70–30 training–testing sets.
- The algorithms used were Random Forest Regressor, Linear Regression, k-NN Regressor, SVR, and ANN (MLP Regressor).
- Each experiment is evaluated using RMSE and MAE.




### **Prepare features and target**

In [12]:
# Predictors: the two country-level indicators. Target: the suicide rate.
X = df_exp.loc[:, ["HDI for year", "gdp_per_capita ($)"]]
y = df_exp.loc[:, "suicides/100k pop"]

### **80-20 Split**
The dataset was split into 80% for training and 20% for testing.

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the data preview shows several rows sharing identical HDI/GDP
# values, so a purely random split can place rows with identical features in
# both partitions — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for split_name, split_data in {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}.items():
    print(f"{split_name} shape: {split_data.shape}")

X_train shape: (25404, 2)
X_test shape: (6352, 2)
y_train shape: (25404,)
y_test shape: (6352,)


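### **Feature scaling**
Scaling is applied only to the features (not the target). The scaler is fitted on the training set and then reused to transform the test set.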

In [15]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both partitions so the test set never influences it.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### **Model**

In [16]:
# Candidate regressors keyed by display name; insertion order determines the
# row index each model gets in the results table.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["KNN Regressor"] = KNeighborsRegressor(n_neighbors=5)
models["SVR"] = SVR(kernel="rbf")
models["MLP Regressor (ANN)"] = MLPRegressor(
    hidden_layer_sizes=(100,),  # a single hidden layer of 100 units
    max_iter=1000,
    random_state=42,
)


### **Evaluate**

In [17]:
# Fit every candidate on the scaled 80% training data and score it on the
# held-out 20% test data.
results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    results.append(dict(Model=name, RMSE=rmse, MAE=mae))

# One row per model, best (lowest RMSE) first; sorting returns a new frame
# rather than mutating in place.
results_df = pd.DataFrame(results).sort_values(by="RMSE")

print(results_df)

                     Model       RMSE        MAE
1  Random Forest Regressor  18.862399  10.465189
2            KNN Regressor  19.377295  10.757867
4      MLP Regressor (ANN)  22.755267  12.168725
0        Linear Regression  22.874671  12.459825
3                      SVR  23.960785  10.529024


### **70-30 Split**
The dataset was split into 70% for training and 30% for testing.

In [4]:
# Hold out 30% of the rows for testing; the `_b` suffix keeps these partitions
# separate from the 80-20 ones.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each partition.
for split_name, split_data in {
    "X_train": X_train_b,
    "X_test": X_test_b,
    "y_train": y_train_b,
    "y_test": y_test_b,
}.items():
    print(f"{split_name} shape: {split_data.shape}")

X_train shape: (22229, 2)
X_test shape: (9527, 2)
y_train shape: (22229,)
y_test shape: (9527,)


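### **Feature scaling**
Scaling is applied only to the features (not the target). The scaler is fitted on the 70% training set and then reused to transform the 30% test set.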

In [5]:
# Fit a fresh scaler on the 70% training features only, then apply it to both
# partitions.
# NOTE(review): `scaler`, `X_train_scaled` and `X_test_scaled` are the same
# names the 80-20 section assigns, so this cell overwrites them — re-run the
# 80-20 scaling cell before re-running the 80-20 evaluation.
scaler = StandardScaler().fit(X_train_b)

X_train_scaled = scaler.transform(X_train_b)
X_test_scaled = scaler.transform(X_test_b)

### **Model**

In [6]:
# Fresh, unfitted instances of the same five regressors (same hyper-parameters)
# for the 70-30 experiment; insertion order determines the row index each
# model gets in the results table.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["KNN Regressor"] = KNeighborsRegressor(n_neighbors=5)
models["SVR"] = SVR(kernel="rbf")
models["MLP Regressor (ANN)"] = MLPRegressor(
    hidden_layer_sizes=(100,),  # a single hidden layer of 100 units
    max_iter=1000,
    random_state=42,
)

### **Evaluate**

In [9]:
# Fit every candidate on the scaled 70% training data and score it on the
# held-out 30% test data.
results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train_b)
    y_pred = model.predict(X_test_scaled)

    rmse = np.sqrt(mean_squared_error(y_test_b, y_pred))
    mae = mean_absolute_error(y_test_b, y_pred)
    results.append(dict(Model=name, RMSE=rmse, MAE=mae))

# One row per model, best (lowest RMSE) first; sorting returns a new frame
# rather than mutating in place.
results_df = pd.DataFrame(results).sort_values(by="RMSE")

print(results_df)

                     Model       RMSE        MAE
1  Random Forest Regressor  18.509725  10.506907
2            KNN Regressor  19.641994  10.846842
4      MLP Regressor (ANN)  21.590986  12.176135
0        Linear Regression  21.728248  12.387413
3                      SVR  22.771603  10.365720
