**LEAD University - Minería de datos**

Python Project

**Contributors**
- Carolina Salas Moreno
- Deykel Bernard Salazar
- Esteban Ramirez Montano
- Kristhel Porras Mata
- Marla Gomez Hernández


## Requirements
**Step 1:** Please install Microsoft C++ Build Tools in your machine.

**Step 2:** Install Python 3.11.7

**Step 3:** Run the following code if this is your first time running it `pip install -r requirements.txt`

# Importar las librerías

In [None]:
# Main Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import umap.umap_ as umap

# Data Optimization
from sklearn_genetic import GASearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn_genetic.space import Integer, Categorical, Continuous
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor

#Feature Selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV 

# Clustering Libraries
from abc import ABCMeta, abstractmethod
from scipy.cluster.hierarchy import dendrogram, ward, single, complete, average, linkage, fcluster
from sklearn.cluster import KMeans
from pyclustering.cluster.kmedoids import kmedoids

# Dimensionality Reduction
from prince import PCA as PCA_Prince
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Evaluation Metrics
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor

# Additional Tools
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# EDA

In [None]:
# Análisis Exploratorio de Datos (EDA)
class EDA:
    def __init__(self, file=None):
        """
        Inicializa la clase EDA y carga datos desde un archivo CSV si se proporciona.

        Parámetros:
            file (str): Ruta al archivo CSV. Si no se proporciona, se inicializa un DataFrame vacío.
        """
        self.__df = pd.read_csv(file) if file else pd.DataFrame()

    def head_df(self, n=5):
        return self.__df.head(n) if not self.__df.empty else "No se cargaron los datos :("

    def tail_df(self, n=5):
        return self.__df.tail(n) if not self.__df.empty else "No se cargaron los datos :("

    def check_data_types(self):
        return self.__df.dtypes

    def drop_irrelevant_columns(self, columns):
        self.__df.drop(columns=columns, inplace=True)

    def drop_missing_values(self):
        self.__df.dropna(inplace=True)

    def detect_outliers(self):
        num_df = self.__df.select_dtypes(include=['float64', 'int64'])
        if num_df.empty:
            return "No hay columnas numéricas en el DataFrame."

        Q1 = num_df.quantile(0.25)
        Q3 = num_df.quantile(0.75)
        IQR = Q3 - Q1
        outliers = ((num_df < (Q1 - 1.5 * IQR)) | (num_df > (Q3 + 1.5 * IQR))).sum()
        Dicc_outliers = {col: outliers[col] for col in num_df.columns if outliers[col] > 0}

        return Dicc_outliers if Dicc_outliers else "No se detectaron valores atípicos en las columnas numéricas."

    def plot_scatter(self, col1, col2):
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=self.__df[col1], y=self.__df[col2])
        plt.title(f'Gráfico de Dispersión: {col1} vs {col2}')
        plt.xlabel(col1)
        plt.ylabel(col2)
        plt.grid()
        plt.show()

    def plot_histogram(self, col):
        plt.figure(figsize=(10, 6))
        sns.histplot(self.__df[col], kde=True)
        plt.title(f'Histograma de {col}')
        plt.xlabel(col)
        plt.ylabel('Frecuencia')
        plt.show()

    def plot_heatmap(self):
        num_df = self.__df.select_dtypes(include=['float64', 'int64'])
        if num_df.empty:
            return "No hay columnas numéricas para generar el mapa de calor."

        num_df = num_df.loc[:, num_df.apply(lambda x: np.std(x) > 0.01)]

        plt.figure(figsize=(12, 10))
        sns.heatmap(num_df.corr(), cmap="coolwarm", annot= True, linewidths=0.5, cbar=True) #annot=False es para que no se vean los numeros en los cuadros
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.title("Correlation heatmap", fontsize=18)
        plt.ion()

    def __str__(self):
        return f"Clase EDA - DataFrame de la forma: {self.__df.shape}"

    def get_df(self):
        """Devuelve una copia del df para que las familias de los algoritmos las utilicen"""
        return self.__df.copy()

# Data Optimization

In [10]:
class DataOptimization(EDA):
    def __init__(self, datos_eda):
        """
        Use the processed DataFrame from EDA to optimize models.

        Parameters:
        - datos_eda: This is the processed DataFrame from the EDA class.
        """
        self.__df = datos_eda.get_df()
        
        # Data components
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        
        # Regression models
        self.regression_models = {
            'LinearRegression': LinearRegression(),
            'SVM': SVR(),
            'Ridge': Ridge(),
            'DecisionTreeRegressor': DecisionTreeRegressor(),
            'RandomForestRegressor': RandomForestRegressor(),
            'GradientBoostingRegressor': GradientBoostingRegressor(),
            'XGBRegressor': XGBRegressor(random_state=42)
        }

        # Classification models
        self.classification_models = {
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'KNeighborsClassifier': KNeighborsClassifier(),
            'RandomForestClassifier': RandomForestClassifier(),
            'AdaBoostClassifier': AdaBoostClassifier(random_state=42)
        }
        
        # Current active models based on problem type
        self.models = None
        
        # Parameter grids
        self.param_grids_genetic = None
        self.param_grids_exhaustive = None

#------------------------Data Split Components--------------------------------------------------------------

    def split_df(self, target_column, test_size=None, random_state=42):
        """
        Splits the dataframe into training and test sets.

        Parameters:
        - target_column: str -> Name of the target column (y).
        - test_size: float -> Proportion of the test set (if not provided, it is calculated from the entered percentage).
        - random_state: int -> Seed for randomization.

        Returns:
        - X_train, X_test, y_train, y_test: Split and preprocessed datasets.
        """
        while True:
            try:
                percent = float(input("Enter the percentage for the training set: (Example: 80) \n"))
                if 0 < percent < 100:
                    train_size = percent / 100
                    break # Exit the loop
                else:
                    print("The percentage must be between 1 and 99.")
            except ValueError:
                print("Invalid number. Try again.")

        while True:
            try:
                # Separate features (X) and target variable (y)
                X = self.__df.drop(columns=[target_column])
                y = self.__df[target_column]
                break  # Exit the loop if there are no errors
            except KeyError:
                print(f"The column '{target_column}' does not exist. Try again.")
                print("Available columns:")
                print(self.check_data_types())
                target_column = input("Enter the correct name of the target column: ")

        # Preprocess features (X), convert categorical variables to One-Hot Encoding
        import pandas as pd
        X = pd.get_dummies(X, drop_first=True)

        # Check if the target variable (y) is categorical and needs encoding
        if y.dtypes == 'object' or y.dtypes.name == 'category':
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            y = le.fit_transform(y)

        # Perform the split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size= 1 - train_size, random_state=random_state
        )

        print(f"Data split:\n- Training: {X_train.shape[0]} rows\n- Test: {X_test.shape[0]} rows")
        return X_train, X_test, y_train, y_test
    
#------------------------Parameter Grid for Regression--------------------------------------------------------------

    def _get_param_grids_regression_genetic(self):
        return {
            'LinearRegression': {
                "clf__copy_X": Categorical([True, False]),
                "clf__fit_intercept": Categorical([True, False]),
                "clf__positive": Categorical([True, False])
            },
            'SVM': {
                'clf__C': Continuous(0.1, 10.0),
                'clf__kernel': Categorical(['linear', 'poly', 'rbf']),
                'clf__gamma': Categorical(['scale', 'auto'])
            },
            'Ridge': {
                'clf__alpha': Continuous(0.1, 10.0),
                'clf__fit_intercept': Categorical([True, False]),
                'clf__solver': Categorical(['auto', 'svd', 'cholesky'])
            },
            'DecisionTreeRegressor': {
                'clf__max_depth': Integer(3, 10),
                'clf__min_samples_split': Integer(2, 10),
                'clf__min_samples_leaf': Integer(1, 5)
            },
            'RandomForestRegressor': {
                'clf__n_estimators': Integer(50, 200),
                'clf__max_depth': Integer(5, 15),
                'clf__min_samples_split': Integer(2, 10)
            },
            'GradientBoostingRegressor': {
                'clf__n_estimators': Integer(50, 200),
                'clf__learning_rate': Continuous(0.01, 0.2),
                'clf__max_depth': Integer(3, 10)
            },
            'XGBRegressor': {
                'clf__learning_rate': Continuous(0.01, 0.2),
                'clf__n_estimators': Integer(50, 200),
                'clf__max_depth': Integer(3, 10),
                'clf__subsample': Continuous(0.7, 1.0)
            }
        }

    def _get_param_grids_regression_exhaustive(self):
        return {
            'LinearRegression': {
                "clf__copy_X": [True, False],
                "clf__fit_intercept": [True, False],
                "clf__positive": [True, False]
            },
            'SVM': {
                'clf__C': [0.1, 1, 10],
                'clf__kernel': ['linear', 'poly', 'rbf'],
                'clf__gamma': ['scale', 'auto']
            },
            'Ridge': {
                'clf__alpha': [0.1, 1.0, 10.0],
                'clf__fit_intercept': [True, False],
                'clf__solver': ['auto', 'svd', 'cholesky']
            },
            'DecisionTreeRegressor': {
                'clf__max_depth': [3, 5, 7, 10],
                'clf__min_samples_split': [2, 5, 10],
                'clf__min_samples_leaf': [1, 2, 5]
            },
            'RandomForestRegressor': {
                'clf__n_estimators': [50, 100, 200],
                'clf__max_depth': [5, 10, 15],
                'clf__min_samples_split': [2, 5, 10]
            },
            'GradientBoostingRegressor': {
                'clf__n_estimators': [50, 100, 200],
                'clf__learning_rate': [0.01, 0.1, 0.2],
                'clf__max_depth': [3, 5, 10]
            },
            'XGBRegressor': {
                'clf__learning_rate': [0.01, 0.1, 0.2],
                'clf__n_estimators': [50, 100, 200],
                'clf__max_depth': [3, 5, 10],
                'clf__subsample': [0.7, 0.8, 1.0]
            }
        }

#------------------------Parameter Grid for Classification--------------------------------------------------------------

    def _get_param_grids_classification_genetic(self):
        return {
            'DecisionTreeClassifier': {
                'clf__max_depth': Integer(3, 10),
                'clf__min_samples_split': Integer(2, 10),
                'clf__criterion': Categorical(['gini', 'entropy'])
            },
            'KNeighborsClassifier': {
                'clf__n_neighbors': Integer(3, 15),
                'clf__weights': Categorical(['uniform', 'distance']),
                'clf__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree'])
            },
            'RandomForestClassifier': {
                'clf__n_estimators': Integer(50, 200),
                'clf__max_depth': Integer(5, 15),
                'clf__min_samples_split': Integer(2, 10)
            },
            'AdaBoostClassifier': {
                'clf__n_estimators': Integer(50, 200),
                'clf__learning_rate': Continuous(0.01, 0.2),
                'clf__algorithm': Categorical(['SAMME'])
            }
        }

    def _get_param_grids_classification_exhaustive(self):
        return {
            'DecisionTreeClassifier': {
                'clf__max_depth': [3, 5, 7, 10],
                'clf__min_samples_split': [2, 5, 10],
                'clf__criterion': ['gini', 'entropy']
            },
            'KNeighborsClassifier': {
                'clf__n_neighbors': [3, 5, 7, 10, 15],
                'clf__weights': ['uniform', 'distance'],
                'clf__algorithm': ['auto', 'ball_tree', 'kd_tree']
            },
            'RandomForestClassifier': {
                'clf__n_estimators': [50, 100, 200],
                'clf__max_depth': [5, 10, 15],
                'clf__min_samples_split': [2, 5, 10]
            },
            'AdaBoostClassifier': {
                'clf__n_estimators': [50, 100, 200],
                'clf__learning_rate': [0.01, 0.1, 0.2],
                'clf__algorithm': ['SAMME']
            }
        }

#------------------------Search Components--------------------------------------------------------------

    def genetic_search(self, scoring_metric):
        """
        Optimize models using genetic algorithms.
        
        Parameters:
        - scoring_metric: str -> Metric to use for evaluation ('neg_root_mean_squared_error' for regression, 'roc_auc' for classification)
        """
        if self.X_train is None or self.X_test is None:
            print("Error: You must run split_df() before calling genetic_search().")
            return

        results = {}
        
        # Feature selection based on current model type
        if 'Regressor' in list(self.models.keys())[0] or list(self.models.keys())[0] in ['LinearRegression', 'Lasso', 'Ridge']:
            # Regression feature selection
            lasso_cv = LassoCV(cv=5) 
            lasso_cv.fit(self.X_train, self.y_train)
            f_selection = SelectFromModel(lasso_cv)
        else:
            # Classification feature selection
            model_base = RandomForestClassifier(n_estimators=100, random_state=42)
            model_base.fit(self.X_train, self.y_train)
            f_selection = SelectFromModel(model_base)

        self.X_train = f_selection.transform(self.X_train)
        self.X_test = f_selection.transform(self.X_test)

        for name, model in self.models.items():
            pl = Pipeline([
              ('fs', f_selection), 
              ('clf', model), 
            ])            
            print(f"Training {name} with genetic method...")
            evolved_estimator = GASearchCV(
                estimator=pl,
                cv=5,
                scoring=scoring_metric,
                population_size=10,
                generations=5,
                tournament_size=3,
                elitism=True,
                crossover_probability=0.8,
                mutation_probability=0.1,
                param_grid=self.param_grids_genetic[name],
                algorithm="eaSimple",
                n_jobs=-1,
                error_score='raise',
                verbose=True
            )
            evolved_estimator.fit(self.X_train, self.y_train)
            results[name] = {
                'best_params': evolved_estimator.best_params_,
                'estimator': evolved_estimator.best_estimator_
            }
        return results

    def exhaustive_search(self, scoring_metric):
        """
        Perform exhaustive grid search for hyperparameter optimization.
        
        Parameters:
        - scoring_metric: str -> Metric to use for evaluation ('neg_root_mean_squared_error' for regression, 'roc_auc' for classification)
        """
        results = {}
        
        # Feature selection based on current model type
        if 'Regressor' in list(self.models.keys())[0] or list(self.models.keys())[0] in ['LinearRegression', 'Lasso', 'Ridge']:
            # Regression feature selection
            lasso_cv = LassoCV(cv=5) 
            lasso_cv.fit(self.X_train, self.y_train)
            f_selection = SelectFromModel(lasso_cv)
        else:
            # Classification feature selection
            model_base = RandomForestClassifier(n_estimators=100, random_state=42)
            model_base.fit(self.X_train, self.y_train)
            f_selection = SelectFromModel(model_base)
            
        self.X_train = f_selection.transform(self.X_train)
        self.X_test = f_selection.transform(self.X_test)
        
        for name, model in self.models.items():
            pl = Pipeline([
              ('clf', model), 
            ])
            print(f"Training {name} with exhaustive method...")
            grid_search = GridSearchCV(
                estimator=pl,
                param_grid=self.param_grids_exhaustive[name],
                cv=5,
                scoring=scoring_metric,
                n_jobs=-1,
                verbose=1
            )
            grid_search.fit(self.X_train, self.y_train)
            results[name] = {
                'best_params': grid_search.best_params_,
                'estimator': grid_search.best_estimator_
            }
        return results
    
#------------------------Director Function--------------------------------------------------------------
    def opti_director(self, target_column, problem_type='regression', method='both', random_state=42):
        """
        This method orchestrates the optimization process for every model in this class.
        1. Make the data split
        2. Performs the optimization of models (genetic, exhaustive or both)
        3. Extract the best parameters in a clean format to use them in the models.
        
        Parameters:
        - target_column: str -> Name of the target column (y).
        - problem_type: str -> Type of problem ('regression' or 'classification').
        - method: str -> What optimization method is going to be used ('genetic', 'exhaustive', or 'both').
        - random_state: int -> Random seed for reproducibility.
        
        Returns:
        - dict -> Keeps the best parameters for each model.
        """
        # Set up models and parameter grids based on problem type
        if problem_type.lower() == 'regression':
            self.models = self.regression_models
            self.param_grids_genetic = self._get_param_grids_regression_genetic()
            self.param_grids_exhaustive = self._get_param_grids_regression_exhaustive()
            # RMSE for regression
            scoring_metric = 'neg_root_mean_squared_error'  
        elif problem_type.lower() == 'classification':
            self.models = self.classification_models
            self.param_grids_genetic = self._get_param_grids_classification_genetic()
            self.param_grids_exhaustive = self._get_param_grids_classification_exhaustive()
            # AUC for classification
            scoring_metric = 'roc_auc'  
        else:
            raise ValueError("problem_type must be 'regression' or 'classification'")
        
        # 1. Make the data split
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_df(
            target_column=target_column,
            random_state=random_state
        )
        
        # 2. Performs the optimization of models (genetic, exhaustive or both)
        best_params = {}
        
        if method.lower() == 'genetic' or method.lower() == 'both':
            genetic_results = self.genetic_search(scoring_metric)
            
            # 3. Extract the best parameters in a clean format
            clean_genetic_params = {}
            for model_name, model_result in genetic_results.items():
                best_params_model = model_result['best_params']
                model_params = {param.replace('clf__', ''): value for param, value in best_params_model.items()}
                clean_genetic_params[model_name] = model_params
            
            best_params['genetic'] = clean_genetic_params
            
        if method.lower() == 'exhaustive' or method.lower() == 'both':
            exhaustive_results = self.exhaustive_search(scoring_metric)
            
            # 3. Extract the best parameters in a clean format
            clean_exhaustive_params = {}
            for model_name, model_result in exhaustive_results.items():
                best_params_model = model_result['best_params']
                model_params = {param.replace('clf__', ''): value for param, value in best_params_model.items()}
                clean_exhaustive_params[model_name] = model_params
            
            best_params['exhaustive'] = clean_exhaustive_params
            
        return best_params

In [13]:
# MI TESTING
#Recuerda el input arriba en VSCode
archivo_csv = "../dataset/dataset.csv"
eda = EDA(file=archivo_csv)
print(eda.head_df())
optimizador = DataOptimization(eda)

best_params = optimizador.opti_director(
    target_column='Price',
    problem_type='regression',
    method='both',             
)

   Id  Year  Kilometers_Driven  Mileage  Engine   Power  Seats  Price  \
0   0  2010              72000    26.60   998.0   58.16    5.0   1.75   
1   1  2015              41000    19.67  1582.0  126.20    5.0  12.50   
2   2  2011              46000    18.20  1199.0   88.70    5.0   4.50   
3   3  2012              87000    20.77  1248.0   88.76    7.0   6.00   
4   4  2013              40670    15.20  1968.0  140.80    5.0  17.74   

   Fuel_Type_CNG  Fuel_Type_Diesel  ...  Fuel_Type_LPG  Fuel_Type_Petrol  \
0              1                 0  ...              0                 0   
1              0                 1  ...              0                 0   
2              0                 0  ...              0                 1   
3              0                 1  ...              0                 0   
4              0                 1  ...              0                 0   

   Owner_Type_First  Owner_Type_Fourth & Above  Owner_Type_Second  \
0                 1                

Data split:
- Training: 2056 rows
- Test: 882 rows
Training LinearRegression with genetic method...




ValueError: Found array with 0 feature(s) (shape=(1645, 0)) while a minimum of 1 is required by LassoCV.

# Models

## Unsupervised

In [14]:
class NoSupervisado(EDA):
    def __init__(self, datos_eda):
        # La clase ya utiliza el df procesado en la clase EDA
        df = datos_eda.get_df()
        super().__init__()
        self.__df = df

    @property
    def df(self):
        return self.__df

    @df.setter
    def df(self, p_df):
        self.__df = p_df

    def __byebye_object_values(self):
        # Elimina columnas de tipo 'object'
        self.__df = self.__df.select_dtypes(exclude=['object'])

    def calcular_metricas(self, labels):
        """
        Calcula métricas de evaluación para clustering.
        """
        data = self.__df.dropna()
        data = (data - data.mean()) / data.std()
        metrics = {
            "Índice de Silueta": silhouette_score(data, labels),
            "Calinski-Harabasz": calinski_harabasz_score(data, labels),
            "Davies-Bouldin": davies_bouldin_score(data, labels)
        }
        return metrics

    def kmeans(self, n_clusters):
        self.__byebye_object_values()
        data = self.__df
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(data)
        metrics = self.calcular_metricas(labels)
        print(f"Métricas para K-Means (n_clusters={n_clusters}): {metrics}")
        return metrics

    def k_medoids(self, n_clusters, metric='euclidean'):
        self.__byebye_object_values()
        data = self.__df
        
        # Convertir a numpy array si aún no lo es
        data_array = np.array(data)
        
        # Inicialización de medoides (seleccionar índices aleatorios)
        np.random.seed(42)  # Para reproducibilidad
        initial_medoids = np.random.choice(len(data_array), n_clusters, replace=False).tolist()
        
        # Crear y ejecutar el algoritmo KMedoids
        kmedoids_instance = kmedoids(data_array, initial_medoids)
        kmedoids_instance.process()
        
        # Obtener clusters y medoides
        clusters = kmedoids_instance.get_clusters()  # Lista de listas de índices
        medoids = kmedoids_instance.get_medoids()    # Lista de índices de medoides
        
        # Crear etiquetas en formato sklearn (un número para cada punto)
        labels = np.zeros(len(data_array), dtype=int)
        for cluster_idx, cluster in enumerate(clusters):
            for point_idx in cluster:
                labels[point_idx] = cluster_idx
        
        # Calcular métricas
        metrics = self.calcular_metricas(labels)
        print(f"Métricas para K-Medoids (n_clusters={n_clusters}, metric={metric}): {metrics}")
        
        return metrics

    def hac(self, n_clusters=3, method='ward'):
        self.__byebye_object_values()
        data = self.__df
        linkage_matrix = linkage(data, method=method)
        labels = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
        metrics = self.calcular_metricas(labels)
        print(f"Métricas para HAC (n_clusters={n_clusters}, method={method}): {metrics}")
        return metrics

    def umap_model(self, n_components=2, n_neighbors=15):
        self.__byebye_object_values()
        data = self.__df
        modelo_umap = UMAP(n_components=n_components, n_neighbors=n_neighbors)
        components = modelo_umap.fit_transform(data)
        kmeans = KMeans(n_clusters=3, random_state=42)
        labels = kmeans.fit_predict(components)
        metrics = self.calcular_metricas(labels)
        print(f"Métricas para UMAP (n_components={n_components}, n_neighbors={n_neighbors}): {metrics}")
        return metrics

    def comparar_algoritmos(self, n_clusters):

        if self.__df.isnull().any().any():
          print("El DataFrame contiene valores nulos. Se eliminarán automáticamente para continuar.")
          self.__df.dropna(inplace=True)

        print("\nEjecutando K-Means...")
        kmeans_metrics = self.kmeans(n_clusters)

        print("\nEjecutando K-Medoids...")
        kmedoids_metrics = self.k_medoids(n_clusters)

        print("\nEjecutando HAC...")
        hac_metrics = self.hac(n_clusters=n_clusters)

        print("\nEjecutando UMAP...")
        umap_metrics = self.umap_model(n_components=2, n_neighbors=15)

        resultados = pd.DataFrame({
            "K-Means": kmeans_metrics,
            "K-Medoids": kmedoids_metrics,
            "HAC": hac_metrics,
            "UMAP": umap_metrics
        }).T

        print("\nComparación de Algoritmos:")
        print(resultados)
        return resultados

## Supervised 

In [None]:
class Supervisado:
    def __init__(self, data_optimization):
        """
        Initialize the Supervisado class with data and optimized parameters from DataOptimization
        
        Parameters:
        - data_optimization: Instance of DataOptimization with optimized models and data
        """
        # Get the processed dataframe
        self.__df = data_optimization.X_train  # Store a reference to the training data
        
        # Store the data splits from DataOptimization
        self.X_train = data_optimization.X_train
        self.X_test = data_optimization.X_test
        self.y_train = data_optimization.y_train
        self.y_test = data_optimization.y_test
        
        # Store the best parameters from optimization if available
        self.best_params = None
        
        # Problem type (regression or classification)
        self.problem_type = 'regression' if 'Regressor' in list(data_optimization.models.keys())[0] or list(data_optimization.models.keys())[0] in ['LinearRegression', 'Lasso', 'Ridge'] else 'classification'

    @property
    def df(self):
        return self.__df

    @df.setter
    def df(self, p_df):
        self.__df = p_df

    def set_optimized_parameters(self, best_params, method='genetic'):
        """
        Set the best parameters from the optimization process
        
        Parameters:
        - best_params: Dictionary with best parameters for each model
        - method: Which optimization method to use ('genetic' or 'exhaustive')
        """
        if method in best_params:
            self.best_params = best_params[method]
            print(f"Using {method} optimization parameters")
        else:
            print(f"Warning: {method} parameters not found. Using default parameters.")
            self.best_params = {}

#-----------------Evaluacion de modelos----------------------------
    def calcular_metricas(self, modelo, X_test, y_test, predicciones, modelo_nombre):
      """
      Calculate the model evaluation metrics and save the results in a dictionary.

      Parameters:
      - modelo: The model in use
      - X_test: Test features
      - y_test: Test labels
      - predicciones: Model predictions
      - modelo_nombre: Model name

      Returns:
      - resultados: Dictionary containing evaluation metrics.
      """

      mse = mean_squared_error(y_test, predicciones)
      r2 = r2_score(y_test, predicciones)
      mae = mean_absolute_error(y_test, predicciones)
      rmse = np.sqrt(mse)
      tolerancia = 0.1  # 10% of tolerance
      precision_global = np.mean(np.abs(y_test - predicciones) <= (tolerancia * y_test)) * 100

      resultados = {
          'modelo': modelo_nombre,
          'MSE': mse,
          'R2': r2,
          'MAE': mae,
          'RMSE': rmse,
          'precision_global': precision_global,
          #'predicciones': predicciones.tolist(),
          #'valores_reales': y_test.tolist()
      }
      return resultados

    def calcular_metricas_clasificacion(self, modelo, X_test, y_test, predicciones, modelo_nombre):
      """
      Calculate evaluation metrics for classification models and store the results in a dictionary.

      Parameters:
      - y_test: True labels of the test dataset.
      - predicciones: Predicted labels from the classification model.
      - modelo_nombre: Name or identifier of the evaluated model.

      Returns:
      - resultados: Dictionary containing evaluation metrics.
      """
      from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

      accuracy = accuracy_score(y_test, predicciones)
      precision = precision_score(y_test, predicciones, average='weighted')
      recall = recall_score(y_test, predicciones, average='weighted')
      f1 = f1_score(y_test, predicciones, average='weighted')

      resultados = {
          'modelo': modelo_nombre,
          'accuracy': accuracy,
          'precision': precision,
          'recall': recall,
          'f1_score': f1
      }
      return resultados

#------------------------Regression Models--------------------------------------------------------------

    def regre_lineal_simple(self, X_train, X_test, y_train, y_test):
      """
      Performs Simple Linear Regression and computes multiple performance metrics.

      Parameters:
      - X_train: Features used for training the regression model.
      - y_train: Target variable used for training.
      - X_test: Features used to evaluate the regression model.
      - y_test: Actual target values to compare against predictions.
      - modelo_nombre: Name or identifier of the evaluated regression model.

      Returns: Dictionary containing regression performance metrics (model name, MSE, RMSE, MAE, R² score).
      """
      print("Starting Simple Linear Regression...")
      # Get optimized parameters if available
      params = self.best_params.get('LinearRegression', {}) if self.best_params else {}
      
      # Create model with optimized parameters
      modelo = LinearRegression(**params)
      modelo.fit(self.X_train, self.y_train)
      predicciones = modelo.predict(self.X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'Regresión Lineal Simple')

    def regre_svm(self, X_train, X_test, y_train, y_test):
      """
      Realiza una Support Vector Machine y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando Support Vector Machine (SVM)...")
      from sklearn.preprocessing import StandardScaler

      # Escalar los datos
      scaler = StandardScaler()
      X_train_scaled = scaler.fit_transform(X_train)
      X_test_scaled = scaler.transform(X_test)

      modelo = SVR(kernel='rbf', C=1.0, gamma='scale')
      modelo.fit(X_train_scaled, y_train)
      predicciones = modelo.predict(X_test_scaled)

      return self.calcular_metricas(modelo, X_test_scaled, y_test, predicciones, 'Support Vector Machine')


    def regre_regridge(self, X_train, X_test, y_train, y_test):
      """
      Realiza un Regresión Ridge y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando Regresión Ridge...")
      modelo = Ridge(alpha = 1.0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'Regresión Ridge')

    def regre_decisionTree(self, X_train, X_test, y_train, y_test):
      """
      Realiza un Decision Tree Regressor y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando DecisionTreeRegressor..")
      modelo = DecisionTreeRegressor(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'Decision Tree Regressor')

    def regre_randomforest(self, X_train, X_test, y_train, y_test):
      """
      Realiza un Random Forest Regressor y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando RandomForest Regressor..")
      modelo = RandomForestRegressor(max_depth=2, random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'Random Forest Regressor')

    def regre_gradient_boosting(self, X_train, X_test, y_train, y_test):
      """
      Realiza un Grandient Boostsing Regressor y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando Grandient Boostsing Regressor..")
      modelo = GradientBoostingRegressor(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'Grandient Boostsing Regressor')

    def regre_xgboost(self, X_train, X_test, y_train, y_test):
      """
      Realiza un XGBoost Regressor y calcula múltiples métricas de rendimiento.
      Returns:
      - resultados: Diccionario con métricas de rendimiento del modelo
      """
      print("Iniciando XGBoost Regressor..")
      modelo = XGBRegressor(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)
      return self.calcular_metricas(modelo, X_test, y_test, predicciones, 'XGBoost Regressor')

#------------------------Classification Models--------------------------------------------------------------

    def classi_decision_tree(self, X_train, X_test, y_train, y_test):
      """
      Realiza un modelo de clasificación usando Árbol de Decisión y calcula métricas de rendimiento.
      """
      print("Iniciando Decision Tree Classifier...")
      from sklearn.tree import DecisionTreeClassifier

      modelo = DecisionTreeClassifier(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)

      return self.calcular_metricas_clasificacion(modelo, X_test, y_test, predicciones, 'Decision Tree')

    def classi_knn(self, X_train, X_test, y_train, y_test):
      """
      Realiza un modelo de clasificación usando K-Nearest Neighbors y calcula métricas de rendimiento.
      """
      print("Iniciando K-Nearest Neighbors Classifier...")
      from sklearn.neighbors import KNeighborsClassifier

      modelo = KNeighborsClassifier()
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)

      return self.calcular_metricas_clasificacion(modelo, X_test, y_test, predicciones, 'K-Nearest Neighbors')

    def classi_random_forest(self, X_train, X_test, y_train, y_test):
      """
      Realiza un modelo de clasificación usando Random Forest y calcula métricas de rendimiento.
      """
      print("Iniciando Random Forest Classifier...")
      from sklearn.ensemble import RandomForestClassifier

      modelo = RandomForestClassifier(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)

      return self.calcular_metricas_clasificacion(modelo, X_test, y_test, predicciones, 'Random Forest')

    def classi_adaboost(self, X_train, X_test, y_train, y_test):
      """
      Realiza un modelo de clasificación usando AdaBoost y calcula métricas de rendimiento.
      """
      print("Iniciando AdaBoost Classifier...")
      from sklearn.ensemble import AdaBoostClassifier

      modelo = AdaBoostClassifier(random_state=0)
      modelo.fit(X_train, y_train)
      predicciones = modelo.predict(X_test)

      return self.calcular_metricas_clasificacion(modelo, X_test, y_test, predicciones, 'AdaBoost')

In [None]:
class Start:
    def __init__(self):
        self.eda = None
        self.supervisado = None
        self.no_supervisado = None
        self.split_data = None


    def mostrar_menu(self):
        while True:
            print("\n--- Menú Principal ---")
            print("1. 📁 Carga de datos en formato CSV y completar EDA")
            print("2. 🪐 Ejecutar modelo")
            print("3. 🛑 Salir")
            opcion = input("Seleccione una opción: ")

            if opcion == "1":
                self.datos_eda()
            elif opcion == "2":
                self.models_menu()
            elif opcion == "3":
                print("Saliendo del programa...")
                break
            else:
                print("Opción no válida. Intente de nuevo.")

    def models_menu(self):
      while True:
        print("\n--- ¿Qué problema necesita resolver? ---")
        print("1. 🔍 Clasificación: Asigne etiquetas a sus datos")
        print("2. 📈 Regresión: Prediga valores continuos")
        print("3. 🧩 Aprendizaje No Supervisado: Descubra patrones ocultos")
        print("4. 🛑 Volver al menú principal")
        opcion = input("Seleccione una opción: ")

        # Resetear split_data antes de cambiar de modelo
        self.split_data = None
        self.supervisado = None

        if opcion == "1":
            self.classi_modelos()
        elif opcion == "2":
            self.regre_modelos()
        elif opcion == "3":
                if self.eda and not self.eda.get_df().empty:
                  self.no_supervisado = NoSupervisado(self.eda)
                  n_clusters = int(input("Ingrese el número de clusters: "))
                  self.no_supervisado.comparar_algoritmos(n_clusters=n_clusters)
                else:
                    print("Primero cargue los datos para poder realizar aprendizaje no supervisado.")
        elif opcion == "4":
            print("Saliendo del programa...")
            break
        else:
            print("Opción no válida. Intente de nuevo.")

    def datos_eda(self):
        while True:
            print("\n ----EDA----")
            print("1. 📂 Carga de datos")
            print("2. 🔍 Mostrar head del DataFrame")
            print("3. 📊 Revisar los tipos de datos")
            print("4. ✂️ Eliminar columnas")
            print("5. 🧹 Eliminar valores NULOS")
            print("6. ⚠️ Detectar valores atipicos")
            print("7. 📈 Graficar relación entre dos variables")
            print("8. 📉 Graficar histograma")
            print("9. 🌡 HeatMap: Generar mapa de calor")
            print("0. 🛑 Volver al menú principal")
            opcion = input("Seleccione una opción: ")

            if opcion == "1":
                my_data = input("¿Cómo se llama el CSV? ")
                try:
                    self.eda = EDA(file=my_data)
                    print("Instancia de EDA creada y datos cargados exitosamente.")
                except Exception as e:
                    print(f"Error al cargar los datos: {e}")
            elif opcion == "2":
                if self.eda:
                    print(self.eda.head_df())
                else:
                    print("Primero cargue los datos.")
            elif opcion == "3":
                if self.eda:
                    print(self.eda.check_data_types())
                else:
                    print("Primero cargue los datos.")
            elif opcion == "4":
                if self.eda:
                    columnas = input("Ingrese los nombres de las columnas a eliminar, separadas por comas: ").split(',')
                    columnas = [col.strip() for col in columnas]
                    try:
                        self.eda.drop_irrelevant_columns(columnas)
                        print(f"Columnas eliminadas: {', '.join(columnas)}")
                    except Exception as e:
                        print(f"Error al eliminar columnas: {e}")
                else:
                    print("Primero cargue los datos.")
            elif opcion == "5":
                if self.eda:
                    self.eda.drop_missing_values()
                    print("Valores nulos eliminados.")
                else:
                    print("Primero cargue los datos.")
            elif opcion == "6":
                if self.eda:
                    print(self.eda.detect_outliers())
                else:
                    print("Primero cargue los datos.")
            elif opcion == "7":
                if self.eda:
                    print("\n ***Variables disponibles***")
                    print(self.eda.check_data_types())
                    col1 = input("Ingrese el nombre de la primera variable: ")
                    col2 = input("Ingrese el nombre de la segunda variable: ")
                    try:
                        self.eda.plot_scatter(col1, col2)
                    except Exception as e:
                        print(f"Error al graficar: {e}")
                        break
                else:
                    print("Primero cargue los datos.")
            elif opcion == "8":
                if self.eda:
                    print("\n ***Variables disponibles***")
                    print(self.eda.check_data_types())
                    histogram_col = input("Ingrese el nombre de una variable a graficar: ")
                    try:
                        self.eda.plot_histogram(histogram_col)
                    except Exception as e:
                        print(f"Error al graficar: {e}")
                else:
                    print("Primero cargue los datos.")
            elif opcion == "9":
                if self.eda:
                    self.eda.plot_heatmap()
                    print("El programa se detendrá después de mostrar el gráfico.")
                    exit()
                else:
                    print("Primero cargue los datos.")
            elif opcion == "0":
                break
            else:
                print("Opción no válida. Intente de nuevo.")

    def regre_modelos(self):
      if self.supervisado is None:
          if self.eda:
              self.supervisado = Supervisado(self.eda)
          else:
              print("Primero debe cargar los datos")
              return

      if self.split_data is None:
          print("\n ***Variables disponibles***")
          print(self.eda.check_data_types())
          target_column = input("\n Ingrese el nombre de la columna objetivo: ")
          self.split_data = self.supervisado.split_df(target_column)

      X_train, X_test, y_train, y_test = self.split_data

      modelos = [
          self.supervisado.regre_lineal_simple,
          self.supervisado.regre_svm,
          self.supervisado.regre_regridge,
          self.supervisado.regre_decisionTree,
          self.supervisado.regre_randomforest,
          self.supervisado.regre_gradient_boosting,
          self.supervisado.regre_xgboost
      ]

      resultados = []
      for modelo in modelos:
          resultados.append(modelo(X_train, X_test, y_train, y_test))

      print("\n--- Resultados del Benchmarking ---")
      for resultado in resultados:
          print(f"{resultado['modelo']}: R2={resultado['R2']:.4f}, RMSE={resultado['RMSE']:.4f}, MAE={resultado['MAE']:.4f}")

      # Opción de graficar resultados
      graficar = input("\n¿Desea graficar los resultados? (S/N): ").strip().upper()
      if graficar == 'S':
        # Preparar datos para la gráfica
        nombres_modelos = [resultado['modelo'] for resultado in resultados]
        r2_scores = [resultado['R2'] for resultado in resultados]
        rmse_scores = [resultado['RMSE'] for resultado in resultados]
        mae_scores = [resultado['MAE'] for resultado in resultados]

        # Crear la gráfica de barras comparativa
        plt.figure(figsize=(15, 6))

        # Gráfica de R2
        plt.subplot(1, 3, 1)
        bars1 = plt.bar(nombres_modelos, r2_scores)
        plt.title('R2 Scores')
        plt.xticks(rotation=45, ha='right')
        plt.ylabel('R2')
        # Añadir valores en las barras
        for bar in bars1:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.4f}',
                     ha='center', va='bottom', rotation=0)

        # Gráfica de RMSE
        plt.subplot(1, 3, 2)
        bars2 = plt.bar(nombres_modelos, rmse_scores)
        plt.title('Root Mean Squared Error')
        plt.xticks(rotation=45, ha='right')
        plt.ylabel('RMSE')
        # Añadir valores en las barras
        for bar in bars2:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.4f}',
                     ha='center', va='bottom', rotation=0)

        # Gráfica de MAE
        plt.subplot(1, 3, 3)
        bars3 = plt.bar(nombres_modelos, mae_scores)
        plt.title('Mean Absolute Error')
        plt.xticks(rotation=45, ha='right')
        plt.ylabel('MAE')
        # Añadir valores en las barras
        for bar in bars3:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.4f}',
                     ha='center', va='bottom', rotation=0)

        plt.tight_layout()
        plt.show()
        exit()

    def classi_modelos(self):
        if graficar == 'S':
          # Preparar datos para la gráfica
          nombres_modelos = [resultado['modelo'] for resultado in resultados]
          accuracy_scores = [resultado['accuracy'] for resultado in resultados]
          precision_scores = [resultado['precision'] for resultado in resultados]
          recall_scores = [resultado['recall'] for resultado in resultados]
          f1_scores = [resultado['f1_score'] for resultado in resultados]

          # Crear la gráfica de barras comparativa
          plt.figure(figsize=(15, 6))

          # Gráfica de Accuracy
          plt.subplot(1, 4, 1)
          bars1 = plt.bar(nombres_modelos, accuracy_scores)
          plt.title('Accuracy')
          plt.xticks(rotation=45, ha='right')
          plt.ylabel('Accuracy')
          # Añadir valores en las barras
          for bar in bars1:
              height = bar.get_height()
              plt.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.4f}',
                      ha='center', va='bottom', rotation=0)

          # Gráfica de Precision
          plt.subplot(1, 4, 2)
          bars2 = plt.bar(nombres_modelos, precision_scores)
          plt.title('Precision')
          plt.xticks(rotation=45, ha='right')
          plt.ylabel('Precision')
          # Añadir valores en las barras
          for bar in bars2:
              height = bar.get_height()
              plt.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.4f}',
                      ha='center', va='bottom', rotation=0)

          # Gráfica de Recall
          plt.subplot(1, 4, 3)
          bars3 = plt.bar(nombres_modelos, recall_scores)
          plt.title('Recall')
          plt.xticks(rotation=45, ha='right')
          plt.ylabel('Recall')
          # Añadir valores en las barras
          for bar in bars3:
              height = bar.get_height()
              plt.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.4f}',
                      ha='center', va='bottom', rotation=0)

          # Gráfica de F1-Score
          plt.subplot(1, 4, 4)
          bars4 = plt.bar(nombres_modelos, f1_scores)
          plt.title('F1-Score')
          plt.xticks(rotation=45, ha='right')
          plt.ylabel('F1-Score')
          # Añadir valores en las barras
          for bar in bars4:
              height = bar.get_height()
              plt.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.4f}',
                      ha='center', va='bottom', rotation=0)

          plt.tight_layout()
          plt.show()
          exit()

Ejecucion del programa con estructura pythonica

In [None]:
# Ejecución del menú principal
if __name__ == "__main__":
    start = Start()
    start.mostrar_menu()