## Módulo para preprocesar los datos por medio de una clase

# In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter

class preprocess_module():
    """Preprocessing and class-balancing helpers for a tabular CSV dataset.

    The dataset is expected to contain the columns referenced below
    ("Participant ID", "Gender", "Symptoms", ..., with
    "Panic Disorder Diagnosis" as the target column).
    """

    # Columns whose missing values are replaced by their most frequent category.
    _FILL_WITH_MODE = ("Medical History", "Psychiatric History", "Substance Use")

    # Binary / ordinal encodings applied directly (no one-hot encoding).
    _YES_NO = {"Yes": 1, "No": 0}
    _LOW_MOD_HIGH = {"Low": 0, "Moderate": 1, "High": 2}
    _ORDINAL_MAPS = {
        "Family History": _YES_NO,
        "Personal History": _YES_NO,
        "Current Stressors": _LOW_MOD_HIGH,
        "Social Support": _LOW_MOD_HIGH,
        "Gender": {"Male": 0, "Female": 1},
        "Demographics": {"Rural": 0, "Urban": 1},
        "Impact on Life": {"Mild": 0, "Moderate": 1, "Significant": 2},
        "Severity": {"Mild": 0, "Moderate": 1, "Severe": 2},
    }

    # Final column layout: one-hot columns first, encoded columns next,
    # target column last (balance() relies on the target being last).
    _COLUMN_ORDER = [
        'Symptoms_Dizziness',
        'Symptoms_Fear of losing control', 'Symptoms_Panic attacks',
        'Symptoms_Shortness of breath', 'Medical History_Diabetes',
        'Medical History_Heart disease', 'Psychiatric History_Bipolar disorder',
        'Psychiatric History_Depressive disorder', 'Substance Use_Drugs',
        'Coping Mechanisms_Meditation', 'Coping Mechanisms_Seeking therapy',
        'Coping Mechanisms_Socializing', 'Lifestyle Factors_Exercise',
        'Lifestyle Factors_Sleep quality', 'Age', 'Gender', 'Family History',
        'Personal History', 'Current Stressors', 'Severity', 'Impact on Life',
        'Demographics', 'Social Support', 'Panic Disorder Diagnosis',
    ]

    def preprocess(self, archivo):
        """Read the CSV and return a fully numeric, column-ordered DataFrame.

        Args:
            archivo: Anything accepted by ``pandas.read_csv`` (path or
                file-like object).

        Returns:
            A DataFrame of integer columns laid out as ``_COLUMN_ORDER``,
            with "Panic Disorder Diagnosis" as the last column.

        Raises:
            KeyError: If an expected input column, or an expected one-hot
                column (i.e. an expected category), is absent.
            ValueError: If NaN values remain when casting to int, e.g. a
                category that is not covered by the encoding maps.
        """
        # Read the file.
        df = pd.read_csv(archivo)

        # Drop the ID column; it carries no predictive information.
        df = df.drop(columns=["Participant ID"])

        # Fill missing values with the most frequent category of each column.
        for col in self._FILL_WITH_MODE:
            df[col] = df[col].fillna(df[col].value_counts().idxmax())

        # Binary / ordinal categorical variables (no one-hot encoding).
        # Each column is mapped independently, so the lists of yes/no and
        # low/moderate/high columns no longer need to have equal length.
        for col, mapping in self._ORDINAL_MAPS.items():
            df[col] = df[col].map(mapping)

        # Remaining categorical variables (one-hot encoding). Every column
        # that is still non-numeric at this point is text; checking for
        # "not numeric" also covers pandas' dedicated string dtype.
        one_hot_cols = [
            col for col in df.columns
            if not pd.api.types.is_numeric_dtype(df[col])
        ]
        df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True).astype(int)

        # Rearrange the columns into the fixed layout.
        return df[self._COLUMN_ORDER]

    def balance(self, df, sampling_strategy, n_neighbors):
        """Split ``df`` and rebalance the training part with SMOTE + ENN.

        The last column of ``df`` is taken as the target. A stratified
        80/20 train/test split (``random_state=42``) is made and only the
        training part is resampled; the test part is not returned.

        Args:
            df: DataFrame whose last column is the target.
            sampling_strategy: Forwarded to both ``SMOTE`` and ``SMOTEENN``.
            n_neighbors: Used as ``k_neighbors`` for ``SMOTE`` and as
                ``n_neighbors`` for ``EditedNearestNeighbours``.

        Returns:
            Tuple ``(x_resampled, y_resampled)`` with the resampled training
            features and labels.
        """
        # Separate features and target.
        X = df.iloc[:, :-1]
        Y = df.iloc[:, -1]

        # Train/test split; the test portion is intentionally discarded here.
        x_train, _, y_train, _ = train_test_split(
            X, Y, test_size=0.2, stratify=Y, random_state=42
        )

        # Balance the training data: oversample with SMOTE, then clean with ENN.
        smote = SMOTE(
            sampling_strategy=sampling_strategy,
            random_state=42,
            k_neighbors=n_neighbors,
        )
        enn = EditedNearestNeighbours(n_neighbors=n_neighbors)
        smote_enn = SMOTEENN(
            sampling_strategy=sampling_strategy, smote=smote, enn=enn
        )
        x_resampled, y_resampled = smote_enn.fit_resample(x_train, y_train)

        return x_resampled, y_resampled

    def info(self):
        """Print the category-to-integer encodings used by ``preprocess``."""
        print("No:0, Yes:1")
        print("Low:0, Moderate:1, High:2")
        print("Male:0, Female:1")
        print("Rural:0, Urban:1")
        print("Mild:0, Moderate:1, Significant:2")
        print("Mild:0, Moderate:1, Severe:2")