# Setup

In [1]:
import os
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn
from sklearn.metrics import ConfusionMatrixDisplay 
from matplotlib import pyplot as plt
from mlflow.data.pandas_dataset import PandasDataset

# Import the logistic regression class
from sklearn.linear_model import LogisticRegression

# Import the decision tree classifier class
from sklearn.tree import DecisionTreeClassifier


import yaml

# Raise pandas' console display limits so long/wide frames are printed in full.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# Loading and exploring the data

In [4]:
def fetch_data(ID):
    """Fetch a dataset through ``fetch_ucirepo`` and return it as one frame.

    Args:
        ID: Dataset identifier, forwarded to ``fetch_ucirepo`` as ``id``.

    Returns:
        The dataset's features and targets concatenated column-wise
        (features first, targets last).
    """
    dataset = fetch_ucirepo(id=ID)
    feature_frame = dataset.data.features
    target_frame = dataset.data.targets
    return pd.concat([feature_frame, target_frame], axis=1)

def explore_data(data):
    """Print a quick textual overview of ``data``.

    Prints, in order: the first five rows (transposed so columns read as
    rows), the ``describe()`` summary statistics, and the ``info()`` report.

    Args:
        data: DataFrame to inspect.

    Returns:
        None. All output goes to standard output.
    """
    print(data.head().T)
    print(data.describe())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    data.info()

# Visualizing the data

In [None]:
def plot_histograms(data):
    """Draw histograms of ``data`` via ``DataFrame.hist`` and show the figure.

    Args:
        data: DataFrame whose columns are plotted (15 bins, 15x10 figure).
    """
    hist_options = {"bins": 15, "figsize": (15, 10)}
    data.hist(**hist_options)
    plt.show()

def plot_correlation_matrix(data):
    """Show an annotated heatmap of the pairwise correlations in ``data``.

    Only numeric and boolean columns take part in the correlation.

    Args:
        data: DataFrame to correlate; non-numeric columns are ignored.
    """
    # pandas >= 2.0 raises when corr() meets non-numeric columns, while older
    # versions silently dropped them. Selecting the numeric/bool columns up
    # front gives the same result on every version.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_data.corr(), annot=True, fmt=".2f", cmap='coolwarm')
    plt.show()
    
def plot_feature_relationships(data, target):
    """Show one boxplot per feature, grouped by the target.

    Args:
        data: DataFrame holding the features (and normally the target column).
        target: Passed to seaborn as the ``x`` variable. When it is the name
            of a column in ``data``, that column is excluded from the plotted
            features wherever it sits in the frame.
    """
    try:
        target_is_column = target in data.columns
    except TypeError:
        # Unhashable target (e.g. a vector of values handed straight to
        # seaborn): it cannot be a column label.
        target_is_column = False

    if target_is_column:
        # Skip the target by name instead of assuming it is the last column;
        # otherwise a target stored elsewhere would be plotted against itself
        # and the last feature would be silently dropped.
        feature_columns = [column for column in data.columns if column != target]
    else:
        # Previous behaviour: treat every column except the last as a feature.
        feature_columns = data.columns[:-1]

    for column in feature_columns:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x=target, y=column, data=data)
        plt.title(f'Relationship between Cervical cancer and {column}')
        plt.show()

# Preprocessing and feature engineering

In [None]:
def delete_outliers(data, factor=1.5, exclude=None):
    """Drop rows holding an IQR outlier in any considered numeric column.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` of its column. NaN values never compare as
    outliers, so they do not cause a row to be dropped.

    Args:
        data: DataFrame to filter. It is not modified.
        factor: Multiplier applied to the IQR to build the fences. The
            default of 1.5 matches the conventional Tukey fences.
        exclude: Optional column name, or iterable of column names, left out
            of the outlier test. Useful for a numeric label column: on an
            imbalanced binary target the IQR is 0, so every minority-class
            row would otherwise be flagged as an outlier.

    Returns:
        A new DataFrame with the outlier rows removed; all columns are kept.
    """
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, str):
        # A bare string is one column name, not an iterable of characters.
        excluded = {exclude}
    else:
        excluded = set(exclude)

    numeric_columns = [
        column
        for column in data.select_dtypes(include='number').columns
        if column not in excluded
    ]
    if not numeric_columns:
        # Nothing to test: every row is kept.
        return data.copy()

    numeric = data[numeric_columns]

    # Per-column interquartile range.
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # Lower and upper fences.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Boolean frame: True where a value falls outside its column's fences.
    is_outlier = (numeric < lower_bound) | (numeric > upper_bound)

    # Keep only the rows that have no outlier in any tested column.
    return data[~is_outlier.any(axis=1)]

def normalization(data):
    """Apply skew-dependent transforms to the numeric columns of ``data``.

    The skewness of every numeric column is computed and then:

    * skew <= -1        -> ``np.log1p`` is applied to the column;
    * -1 < skew < -0.5  -> ``np.sqrt`` is applied to the column;
    * anything else (including an undefined/NaN skew) -> left unchanged.

    Non-numeric columns are never transformed.

    Args:
        data: DataFrame to transform. It is not modified.

    Returns:
        A copy of ``data`` with the selected columns transformed.
    """
    # Restrict the skew computation to numeric columns: pandas >= 2.0 raises
    # when skew() meets non-numeric data, while older versions dropped it.
    numeric_columns = data.select_dtypes(include='number').columns
    skew = data[numeric_columns].skew()

    # NOTE(review): both thresholds select *negatively* skewed columns, yet
    # log1p and sqrt compress the right tail and are normally used against
    # positive skew (which is also what the original comments described).
    # Thresholds are kept as they were - confirm the intended direction.
    log_transform_columns = [column for column, value in skew.items() if value <= -1]
    sqrt_transform_columns = [column for column, value in skew.items() if -1 < value < -0.5]

    normalized = data.copy()

    # Logarithmic transform: reduces the impact of extreme values.
    if log_transform_columns:
        normalized[log_transform_columns] = normalized[log_transform_columns].apply(np.log1p)

    # Square-root transform: similar to the logarithm but less aggressive.
    if sqrt_transform_columns:
        normalized[sqrt_transform_columns] = normalized[sqrt_transform_columns].apply(np.sqrt)

    return normalized

# Splitting the dataset

In [None]:
def split_data(data, target, test_size=0.2, random_state=42):
    """Separate features from the target and split both into train/test sets.

    Args:
        data: DataFrame containing the features and the target column.
        target: Label of the target column in ``data``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    labels = data[target]
    features = data.drop(columns=target)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return features_train, features_test, labels_train, labels_test

# Training the model

# Evaluating the model

# Cross Validation