<a href="https://colab.research.google.com/github/XescLlabres/tfgFLM/blob/main/HUSE_per_masstum/MASSTUMORALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the working dataset from the current directory.
# NOTE(review): the file name suggests missing values were already removed upstream — confirm.
df = pd.read_csv('df_sense_nan.csv')

In [None]:
# Columns with fewer than MAX_CATEGORICAL_UNIQUE distinct values are treated as
# categorical. (An earlier comment said 20, but the code has always used 10.)
MAX_CATEGORICAL_UNIQUE = 10
# Columns forced to categorical regardless of how many distinct values they have.
FORCED_CATEGORICAL = ['Location Observation 1', 'ID']
# Columns that must stay numeric even if they have few distinct values.
FORCED_NUMERIC = ['Meld Original']

columnes_categoriques = [
    col for col in df.columns if df[col].nunique() < MAX_CATEGORICAL_UNIQUE
]
# Add the forced columns only when missing, so no column is listed twice.
for col in FORCED_CATEGORICAL:
    if col not in columnes_categoriques:
        columnes_categoriques.append(col)
# Filtering (instead of list.remove) avoids a ValueError when a forced-numeric
# column was not selected by the cardinality rule in the first place.
columnes_categoriques = [
    col for col in columnes_categoriques if col not in FORCED_NUMERIC
]

# Convert the selected columns to the pandas 'category' dtype.
for col in columnes_categoriques:
    df[col] = df[col].astype('category')

# Every remaining column is treated as numeric.
columnes_numeriques = [col for col in df.columns if col not in columnes_categoriques]

# Numeric columns may use a decimal comma: go through text, swap ',' for '.',
# then parse. Values that still cannot be parsed become NaN (errors='coerce').
for col in columnes_numeriques:
    # regex=False makes the literal replacement explicit across pandas versions.
    as_text = df[col].astype(str).str.replace(',', '.', regex=False)
    df[col] = pd.to_numeric(as_text, errors='coerce')

# Report the resulting dtype of every column.
for columna in df.columns:
    print(f"Columna: {columna}, Tipus: {df[columna].dtype}")

Columna: ID, Tipus: category
Columna: GENDER, Tipus: category
Columna: AGE_AT_TACE, Tipus: int64
Columna: DAYS_PRETACE, Tipus: int64
Columna: MAX_TM_DIAM, Tipus: float64
Columna: 7-11_CRITERIA, Tipus: float64
Columna: 7-11_CAT, Tipus: category
Columna: Weigh, Tipus: float64
Columna: heigh, Tipus: float64
Columna: BMI, Tipus: float64
Columna: BMI_category, Tipus: category
Columna: HCV, Tipus: category
Columna: Alcohol abuse, Tipus: category
Columna: Obesity, Tipus: category
Columna: Adquired and Inhereted disorders, Tipus: category
Columna: Aflatoxin, Tipus: category
Columna: Drug-abuse/addict, Tipus: category
Columna: Smoke, Tipus: category
Columna: no_active_ex, Tipus: category
Columna: Diabetes, Tipus: category
Columna: Hypertension, Tipus: category
Columna: Cancer History, Tipus: category
Columna: Active_cancer, Tipus: category
Columna: B-block treatment, Tipus: category
Columna: Statin treatment, Tipus: category
Columna: Antiretroviral treatment, Tipus: category
Columna: Cronic Kid

In [None]:
from sklearn.preprocessing import StandardScaler

# Select the numeric columns (float64 / int64) that will be standardised.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
print("Columnes numèriques:", numerical_columns)

# Ordinal encoding of the '7-11_CAT' text labels.
CAT_7_11_ORDER = {'LOW': 0, 'INTERMEDIATE': 1, 'HIGH': 2}
# Only map while the column still holds the text labels. Without this guard,
# re-running the cell maps the already-encoded 0/1/2 values to NaN.
if df['7-11_CAT'].isin(list(CAT_7_11_ORDER)).any():
    df['7-11_CAT'] = df['7-11_CAT'].map(CAT_7_11_ORDER)

# Standardise the numeric columns in place.
# NOTE(review): the scaler is fitted on every row before any cross-validation
# split is made, so held-out samples influence the scaling statistics. Consider
# fitting it inside each training fold (e.g. with a sklearn Pipeline).
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Quick look at the transformed data.
print(df.head())

Columnes numèriques: Index(['AGE_AT_TACE', 'DAYS_PRETACE', 'MAX_TM_DIAM', '7-11_CRITERIA', 'Weigh',
       'heigh', 'BMI', 'ALBI Score', 'Meld Original', 'Meld-Na',
       'Glóbulos blancos', 'Glóbulos rojos', 'Hemoglobin (mg/dl)',
       'Hematocrito (%)', 'Plaquetas', 'Neutrófilos', 'Eosinófilos',
       'Basófilos', 'Monocitos', 'Linfocitos',
       'Ratio neutrófilos/linfocitos (NLR)', 'INR', 'Quick (%)', 'Sodio',
       'Potasio', 'Albumine g/L', 'Total Bilirrubine mg/dL',
       'Fosfatasa alcalina', 'GGT', 'ALT', 'AST', 'Glucosa', 'Urea',
       'Creatinine mg/dL', 'Alpha fetoprotein', 'Size 2D mm', 'Size 2D mm.1'],
      dtype='object')
      ID GENDER  AGE_AT_TACE  DAYS_PRETACE  MAX_TM_DIAM  7-11_CRITERIA  \
0  222.0      1     1.712184     -0.468331     1.352758       0.222472   
1  346.0      1     0.598105     -0.781814    -1.119573      -1.352067   
2  613.0      0    -1.731332     -0.050353    -0.078591      -0.136633   
3  613.2      0    -1.731332     -0.050353    -0.07

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Target vector: the 'Viable' label.
y = df["Viable"].values

# Feature matrix: every column except the label and the identifier.
excluded_columns = df.columns.isin(["Viable", "ID"])
X = df.loc[:, ~excluded_columns].values

# L1-penalised logistic regression; liblinear is a solver that supports L1.
model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)

# Candidate values for C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 5-fold grid search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the selected C and its mean cross-validated AUC.
print("Millor valor de C:", grid_search.best_params_['C'])
print("Millor puntuació AUC:", grid_search.best_score_)

Millor valor de C: 0.1
Millor puntuació AUC: 0.6822055137844611


In [None]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from imblearn.over_sampling import SMOTE


# Target vector: the 'Viable' label.
y = df.loc[:, "Viable"].values

# Feature matrix and matching names: every column except the label and the identifier.
feature_mask = (df.columns != "Viable") & (df.columns != "ID")
feature_names = df.columns[feature_mask]
X = df.loc[:, feature_mask].values

# Fixed seed so the bootstrap draws, and therefore the saved probabilities and
# the selection frequencies, are reproducible from a fresh kernel.
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

loo = LeaveOneOut()
# L1-penalised (LASSO-style) logistic regression. C=0.1 is the value the grid
# search reported as best in this notebook.
model = LogisticRegression(
    max_iter=1000,
    penalty='l1',
    solver='liblinear',
    C=0.1,
    random_state=BOOTSTRAP_SEED,
)

total_bootstrap_iterations = 1000
# Per-feature count of fits in which the coefficient was non-zero.
variable_counts = np.zeros(X.shape[1])

# One row per left-out sample, one column per bootstrap iteration.
all_bootstrap_probabilities = []

# Leave-one-out CV; for each held-out sample the model is refitted on
# `total_bootstrap_iterations` bootstrap resamples of the training rows.
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    bootstrapped_probabilities = []

    for _ in range(total_bootstrap_iterations):
        # Sharing one RandomState across calls gives a different but
        # reproducible resample on every iteration.
        X_train_bootstrap, y_train_bootstrap = resample(
            X_train, y_train, replace=True, random_state=bootstrap_rng
        )

        model.fit(X_train_bootstrap, y_train_bootstrap)

        # Probability of the second class (column 1) for the single held-out row.
        prob = model.predict_proba(X_test)[:, 1][0]
        bootstrapped_probabilities.append(prob)

        # Count the features selected by this fit (coefficient != 0).
        variable_counts += model.coef_[0] != 0

    all_bootstrap_probabilities.append(bootstrapped_probabilities)

# Persist the probabilities. index=False means the CSV contains ONLY the
# probability columns (no leading index column).
df_probabilities = pd.DataFrame(all_bootstrap_probabilities)
df_probabilities.to_csv('df_probabilities.csv', index=False)

# Selection frequency: percentage of all fits in which each feature was kept.
total_fits = total_bootstrap_iterations * loo.get_n_splits(X)
variable_importance = pd.DataFrame({
    'Variable': feature_names,
    'Frequency (%)': (variable_counts / total_fits) * 100
}).sort_values(by='Frequency (%)', ascending=False)

# Show the full variable ranking (no row truncation).
pd.set_option('display.max_rows', None)
print("\nImportancia de las variables (Count y Frequency):")
print(variable_importance)


Importancia de las variables (Count y Frequency):
                              Variable  Frequency (%)
70                        Size 2D mm.1      94.168159
5                             7-11_CAT      89.703483
67              Location Observation 1      83.613433
50                             Potasio      82.261194
71                       LR M criteria      75.329353
1                          AGE_AT_TACE      73.469652
59                    Creatinine mg/dL      52.886567
17                        no_active_ex      49.964179
49                               Sodio      42.806965
68                             LI-RADS      36.668657
43                           Basófilos      35.006965
66                               N_OBS      26.362189
6                                Weigh      24.896020
9                         BMI_category      24.811443
56                                 AST      24.327861
55                                 ALT      22.759204
2                         DAYS_

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

In [None]:
# taula1: the bootstrap probabilities written to 'df_probabilities.csv' by the LOOCV cell.
# taula2: the same source dataset the notebook loads at the start.
taula1 = pd.read_csv('df_probabilities.csv')
taula2 = pd.read_csv('df_sense_nan.csv')

In [None]:

# Sample identifiers, taken from the first column of the dataset.
id_column = taula2.iloc[:, 0]

# 'df_probabilities.csv' is written with index=False, so EVERY column is a
# bootstrap probability. The previous `iloc[:, 1:]` silently discarded the
# first bootstrap iteration.
probabilities = taula1
# Predicted class: 1 when the mean bootstrap probability reaches 0.5.
y_pred = (probabilities.mean(axis=1) >= 0.5).astype(int)

taula1_processed = pd.DataFrame({'ID': id_column, 'Viable': y_pred})

# rename() without inplace returns a new frame instead of mutating a slice of
# taula2, which is what raised the SettingWithCopyWarning.
taula2_processed = taula2[['ID', 'Viable']].rename(columns={'Viable': 'y_true'})

# Join predictions and ground truth on the identifier.
taula05 = pd.merge(taula1_processed, taula2_processed, on='ID')

print(taula05.head())

      ID  Viable  y_true
0  222.0       0     0.0
1  346.0       1     0.0
2  613.0       1     1.0
3  613.2       1     1.0
4  697.0       1     1.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  taula2_processed.rename(columns={'Viable': 'y_true'}, inplace=True)


In [None]:
# Ground truth and predicted labels from the merged comparison table.
y_true = taula05['y_true']
y_pred = taula05['Viable']

# Compute every metric first, then report them in the original order.
cm = confusion_matrix(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print("Matriu de confusió:")
print(cm)
print(f"Precisión: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Exactitut: {accuracy:.2f}")

Matriz de Confusión:
[[52 44]
 [22 85]]
Precisión: 0.66
Recall: 0.79
F1-Score: 0.72
Exactitud: 0.67


In [None]:
# Reload the dataset and the bootstrap probabilities from disk.
df_data = pd.read_csv("df_sense_nan.csv")
df_probabilities = pd.read_csv("df_probabilities.csv")

# Keep only the identifier and the label from the dataset.
df_data_subset = df_data.loc[:, ['ID', 'Viable']]

# The two frames are joined side by side, so they must have the same row count.
if len(df_data_subset) != len(df_probabilities):
    print("Error: Los DataFrames tienen diferente número de filas. Verifica tus datos.")
else:
    df_final = pd.concat([df_data_subset, df_probabilities], axis=1)
    df_final.to_csv("final_table.csv", index=False)
    print("Tabla final creada y guardada en 'final_table.csv'.")

Tabla final creada y guardada en 'final_table.csv'.


In [None]:
# Table written by the previous cell: ID, Viable, then one column per bootstrap iteration.
df = pd.read_csv('final_table.csv')

id_column = 'ID'
diagnosis_column = 'Viable'
# Select every column that is not the identifier or the label. The previous
# `df.columns[3:]` started one position too late and dropped the first
# bootstrap probability column.
prob_columns = df.columns[~df.columns.isin([id_column, diagnosis_column])]

def calcular_intervalo(probabilidades, li, ls):
    """Return the `li`-th and `ls`-th percentiles of `probabilidades`.

    Args:
        probabilidades: array-like of values accepted by `np.percentile`.
        li: percentile (0-100) used for the lower bound.
        ls: percentile (0-100) used for the upper bound.

    Returns:
        A `(lower_bound, upper_bound)` tuple.
    """
    return np.percentile(probabilidades, li), np.percentile(probabilidades, ls)

In [None]:
# One record per row: identifier, label, and the 2.5th / 97.5th percentiles of
# that row's bootstrap probabilities.
intervalos_confianza = [
    [
        fila[id_column],
        fila[diagnosis_column],
        *calcular_intervalo(fila[prob_columns].values, 2.5, 97.5),
    ]
    for _, fila in df.iterrows()
]

columnas_intervalos = [id_column, diagnosis_column, 'lower_bound', 'upper_bound']
df_intervalos = pd.DataFrame(intervalos_confianza, columns=columnas_intervalos)

In [None]:
def calcular_TP(a, b):
    """Return the smaller of `1 - a` and `b`.

    In this notebook `a` is an interval's lower bound and `b` its upper bound.
    """
    return min(1 - a, b)
def calcular_AP(a, b):
    """Return the smaller of `a` and `1 - b`.

    In this notebook `a` is an interval's lower bound and `b` its upper bound.
    """
    return min(a, 1 - b)
def calcular_EP(a, b):
    """Return the width `b - a` of the interval bounded by `a` and `b`."""
    return b - a

In [None]:
# Add one column per score, each computed row by row from the interval bounds.
for nombre_columna, funcion in (
    ('TP', calcular_TP),
    ('AP', calcular_AP),
    ('EP', calcular_EP),
):
    # `f=funcion` binds the current function to the lambda on each iteration.
    df_intervalos[nombre_columna] = df_intervalos.apply(
        lambda fila, f=funcion: f(fila['lower_bound'], fila['upper_bound']),
        axis=1,
    )

In [None]:
# Column-wise mean of the three scores across all samples.
columnas_resumen = ['TP', 'AP', 'EP']
medias = df_intervalos.loc[:, columnas_resumen].mean()
print("Medias:")
print(medias)

Medias:
TP    0.564444
AP    0.235506
EP    0.328938
dtype: float64
