In [None]:
#!pip3 install scikit-learn

In [1]:
import pyspark
from pyspark.sql import SparkSession

import pandas as pd

import duckdb

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import joblib

In [2]:
# Start (or reuse) a Spark session with the DuckDB JDBC driver jar on its classpath.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .config("spark.jars", "../utils/duckdb.jar")
    .getOrCreate()
)

# Read the `join_table` table from the exploitation-zone DuckDB file over JDBC.
data = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:duckdb:../../data/db/exploitation_zone.db")
    .option("driver", "org.duckdb.DuckDBDriver")
    .option("dbtable", "join_table")
    .load()
)

24/04/24 13:24:30 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 10.192.117.36 instead (on interface wlp0s20f3)
24/04/24 13:24:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/24 13:24:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Comparativa Dataset Simple vs. Dataset Mixt

In [3]:
# Materialise the Spark DataFrame as pandas so scikit-learn can consume it.
df = data.toPandas()

# Target column, plus the two candidate feature sets being compared:
#   X1 -> only the other disorder-prevalence columns
#   X2 -> every column except the identifiers and the target
y = df['Depression (%)']
X1 = df[['Schizophrenia (%)', 'Bipolar disorder (%)', 'Eating disorders (%)',
         'Anxiety disorders (%)', 'Drug use disorders (%)', 'Alcohol use disorders (%)']]
X2 = df.drop(['Country', 'Year', 'Depression (%)'], axis=1)

# Split all three objects in a single call so X1, X2 and y are guaranteed to
# share the same train/test rows (instead of relying on two separate calls
# happening to use the same random_state).
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=0.2, random_state=2003
)

# Feature scaling: one scaler per feature set. Re-fitting a single scaler on
# X2 would discard the statistics learned from X1, leaving no fitted scaler
# that matches the X1 model.
scaler1 = StandardScaler()
X1_train_scaled = scaler1.fit_transform(X1_train)
X1_test_scaled = scaler1.transform(X1_test)

scaler2 = StandardScaler()
X2_train_scaled = scaler2.fit_transform(X2_train)
X2_test_scaled = scaler2.transform(X2_test)

# Backward-compatible alias: `scaler` previously ended this cell fitted on X2.
scaler = scaler2

24/04/24 13:24:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
# Fit one default-parameter SVR per feature set; both use the same target split.
svm_model1 = SVR().fit(X1_train_scaled, y_train)
svm_model2 = SVR().fit(X2_train_scaled, y_train)

# Optional persistence, intentionally left disabled.
# NOTE(review): at this point `scaler` was last fitted on the X2 feature set,
# so it does not correspond to svm_model1 despite the 'scaler1.pkl' file name.
# joblib.dump(scaler, 'scaler1.pkl')
# joblib.dump(svm_model1, 'svm_model1.pkl')
# joblib.dump(svm_model2, 'svm_model2.pkl')

# Score each model on its held-out test features.
y_pred1 = svm_model1.predict(X1_test_scaled)
mse1 = mean_squared_error(y_test, y_pred1)

y_pred2 = svm_model2.predict(X2_test_scaled)
mse2 = mean_squared_error(y_test, y_pred2)

print("MSE para el modelo con solo datos de enfermedades:", mse1)
print("MSE para el modelo con todas las columnas:", mse2)


MSE para el modelo con solo datos de enfermedades: 0.014206915090213417
MSE para el modelo con todas las columnas: 0.011462895816162895


# Model Pipeline

In [5]:
import joblib
from pyspark.sql import DataFrame
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class ModelPipelineSVM:
    """Train an SVR on a Spark DataFrame, or reuse a fitted one to predict.

    ``predict()`` is the single entry point and always returns the 4-tuple
    ``(model, scaler, predictions, mse)``:

    * training path  -> ``predictions`` is ``None`` and ``mse`` is the test MSE;
    * inference path -> ``predictions`` is the model output and ``mse`` is ``None``.

    Parameters
    ----------
    data : DataFrame
        Object exposing ``toPandas()`` (a Spark DataFrame in this notebook).
    objective : str
        Name of the target column.
    model : estimator, str or None
        A fitted estimator, a path ending in ``'.pkl'`` to load with joblib,
        or ``None`` to train a new ``SVR``. A string that does not end in
        ``'.pkl'`` is ignored and a new model is trained (original behaviour).
    scaler : transformer, str or None
        A fitted scaler, a path to load with joblib, or ``None`` for a fresh
        ``StandardScaler``.
    """

    # Settings that used to be hard-coded in the methods; override on a
    # subclass or instance to change them. Defaults match the old literals.
    ID_COLUMNS = ('Country', 'Year')   # non-feature identifier columns
    MODEL_PATH = 'svm_model.pkl'       # where the trained model is dumped
    SCALER_PATH = 'scaler.pkl'         # where the fitted scaler is dumped
    TEST_SIZE = 0.2
    RANDOM_STATE = 2003

    def __init__(self, data: DataFrame, objective: str, model=None, scaler=None) -> None:
        self.model = model
        self.data = data
        self.objective = objective
        self.scaler = scaler if scaler is not None else StandardScaler()

    def __train_model(self):
        """Fit scaler and SVR on a train split, persist both, return test MSE.

        Returns
        -------
        tuple
            ``(model, scaler, None, mse)``.
        """
        df = self.data.toPandas()
        y = df[self.objective]
        X = df.drop([*self.ID_COLUMNS, self.objective], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.TEST_SIZE, random_state=self.RANDOM_STATE
        )

        # A scaler given as a path string cannot be fitted; training refits
        # the scaler from scratch anyway, so start from a fresh one.
        if isinstance(self.scaler, str):
            self.scaler = StandardScaler()

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        self.model = SVR()
        self.model.fit(X_train_scaled, y_train)

        y_pred = self.model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)

        # Persist the fitted artefacts so a later run can take the inference path.
        joblib.dump(self.model, self.MODEL_PATH)
        joblib.dump(self.scaler, self.SCALER_PATH)

        return self.model, self.scaler, None, mse

    def __make_predictions(self):
        """Scale the features of ``self.data`` and run the fitted model on them.

        Returns
        -------
        tuple
            ``(model, scaler, y_pred, None)``.
        """
        df = self.data.toPandas()
        X = df.drop(list(self.ID_COLUMNS), axis=1)
        # The target column is dropped when present but is not required:
        # data to predict on may legitimately lack it.
        X = X.drop([self.objective], axis=1, errors='ignore')
        X_scaled = self.scaler.transform(X)

        y_pred = self.model.predict(X_scaled)

        return self.model, self.scaler, y_pred, None

    def predict(self):
        """Train when no usable model was supplied, otherwise predict.

        Returns
        -------
        tuple
            ``(model, scaler, predictions, mse)``; see the class docstring.
        """
        no_model = self.model is None
        non_pickle_path = isinstance(self.model, str) and not self.model.endswith('.pkl')
        if no_model or non_pickle_path:
            return self.__train_model()

        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only pass paths to artefacts from a trusted source.
        if isinstance(self.model, str):
            self.model = joblib.load(self.model)
        if isinstance(self.scaler, str):
            self.scaler = joblib.load(self.scaler)
        return self.__make_predictions()

# Exemples d'Ús

In [6]:
# Inference example: both artefacts are given as pickle paths, so predict()
# loads them instead of training. The files 'svm_model.pkl' and 'scaler.pkl'
# must already exist on disk (they are written by a training run).
pipline = ModelPipelineSVM(
    data=data,
    objective='Depression (%)',
    model='svm_model.pkl',
    scaler='scaler.pkl',
)
model, scaler, predictions, mse = pipline.predict()

In [7]:
# Show each element of the result tuple on its own line.
for result_item in (model, scaler, predictions, mse):
    print(result_item)

SVR()
StandardScaler()
[3.44743687 2.59499424 3.47901869 ... 3.52609215 3.52361465 3.56079922]
None


In [8]:
# Training example: no model is supplied, so predict() fits a new SVR and
# reports the test MSE instead of predictions.
pipline = ModelPipelineSVM(
    objective='Depression (%)',
    data=data,
)
model, scaler, predictions, mse = pipline.predict()

In [9]:
# Show each element of the result tuple on its own line.
for result_item in (model, scaler, predictions, mse):
    print(result_item)

SVR()
StandardScaler()
None
0.011462895816162895
