In [1]:
#!pip3 install scikit-learn
!pip3 install xgboost
!pip3 install pyspark
!pip3 install duckdb

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=4fe2d79365f93440b9ad3fcb61e4d9072641581edce93d04998f55b79ba923fa
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import pyspark
from pyspark.sql import SparkSession

import pandas as pd

import duckdb

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import joblib

In [3]:
!git clone https://github.com/OscarMoliina/betterlifebetterhealth.git

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 332, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 332 (delta 56), reused 120 (delta 34), pack-reused 155[K
Receiving objects: 100% (332/332), 100.30 MiB | 23.70 MiB/s, done.
Resolving deltas: 100% (106/106), done.


In [4]:
spark = SparkSession.builder \
    .appName("Preprocessing") \
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar") \
    .getOrCreate()

data = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:duckdb:/content/betterlifebetterhealth/data/db/exploitation_zone.db") \
    .option("driver", "org.duckdb.DuckDBDriver") \
    .option("dbtable", "join_table") \
    .load()

# Comparativa Dataset Simple vs. Dataset Mitxe

In [5]:
df = data.toPandas()

y = df['Depression (%)']
X1 = df[['Schizophrenia (%)', 'Bipolar disorder (%)', 'Eating disorders (%)',
         'Anxiety disorders (%)', 'Drug use disorders (%)', 'Alcohol use disorders (%)']]
X2 = df.drop(['Country', 'Year', 'Depression (%)'], axis=1)

X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=2003)
X2_train, X2_test = train_test_split(X2, test_size=0.2, random_state=2003)

scaler = StandardScaler()
X1_train_scaled = scaler.fit_transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

In [6]:
svm_model1 = SVR()
svm_model2 = SVR()
svm_model1.fit(X1_train_scaled, y_train)
svm_model2.fit(X2_train_scaled, y_train)

# joblib.dump(scaler, 'scaler1.pkl')
# joblib.dump(svm_model1, 'svm_model1.pkl')
# joblib.dump(svm_model2, 'svm_model2.pkl')

y_pred1 = svm_model1.predict(X1_test_scaled)
y_pred2 = svm_model2.predict(X2_test_scaled)
mse1 = mean_squared_error(y_test, y_pred1)
mse2 = mean_squared_error(y_test, y_pred2)

print("MSE para el modelo con solo datos de enfermedades:", mse1)
print("MSE para el modelo con todas las columnas:", mse2)


MSE para el modelo con solo datos de enfermedades: 0.003019290312440158
MSE para el modelo con todas las columnas: 0.005844207276130293


# Model Pipeline

In [7]:
import joblib
from pyspark.sql import DataFrame
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

class ModelPipeline:
    def __init__(self, data: DataFrame, objective:str, model_type='SVM', model=None, scaler=None) -> None:
        self.data = data
        self.objective = objective
        self.model_type = model_type
        self.model = model
        self.scaler = scaler if scaler is not None else StandardScaler()

    def __train_model(self):
        df = self.data.toPandas()
        y = df[self.objective]
        X = df.drop(['Country', 'Year', self.objective], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2003)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        if self.model_type == 'SVM':
            self.model = SVR()
        elif self.model_type == 'RF':
            self.model = RandomForestRegressor()
        elif self.model_type == 'XGB':
            self.model = XGBRegressor(objective='reg:squarederror')

        self.model.fit(X_train_scaled, y_train)
        y_pred = self.model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)

        joblib.dump(self.model, f'{self.model_type}_model.pkl')
        joblib.dump(self.scaler, 'scaler.pkl')

        return self.model, self.scaler, None, mse

    def __make_predictions(self):
        df = self.data.toPandas()
        X = df.drop(['Country', 'Year', self.objective], axis=1)
        X_scaled = self.scaler.transform(X)

        y_pred = self.model.predict(X_scaled)

        return self.model, self.scaler, y_pred, None

    def predict(self):
        if self.model is None or isinstance(self.model, str) and not self.model.endswith('.pkl'):
            return self.__train_model()
        else:
            self.model = joblib.load(self.model) if isinstance(self.model, str) else self.model
            self.scaler = joblib.load(self.scaler) if isinstance(self.scaler, str) else self.scaler
            return self.__make_predictions()

# Exemples d'Ús

In [8]:
pipline = ModelPipeline(data=data, objective='Depression (%)', model_type='XGB', scaler=None, model=None)
model, scaler, predictions, mse = pipline.predict()

In [9]:
print(model)
print(scaler)
print(predictions)
print(mse)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
StandardScaler()
None
6.929342554497845e-05


In [10]:
pipline = ModelPipeline(data=data, objective='Depression (%)',model_type='SVM')
model, scaler, predictions, mse = pipline.predict()

In [11]:
print(model)
print(scaler)
print(predictions)
print(mse)

SVR()
StandardScaler()
None
0.005844207276130293


In [12]:
pipline = ModelPipeline(data=data, objective='Depression (%)',model_type='RF')
model, scaler, predictions, mse = pipline.predict()

In [13]:
print(model)
print(scaler)
print(predictions)
print(mse)

RandomForestRegressor()
StandardScaler()
None
5.399841463131409e-05
