# Limpiar bases de datos


---
### BASE DE DATOS - Kepler

In [1]:
import pandas as pd
import numpy as np

# Load the raw Kepler table.
df = pd.read_csv('keplerdata.csv')

# Drop the columns that are not planet attributes or observations.
df = df.drop(columns=['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_disp_prov', 'koi_comment', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec'])

# Keep only rows with a definite label (confirmed planets and false
# positives); candidates are discarded. `.copy()` detaches the filtered rows
# from the original frame so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['koi_disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])].copy()

# Binary target (not one-hot): 1 = CONFIRMED, 0 = FALSE POSITIVE.
df['is_exoplanet'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)
df = df.drop(columns=['koi_disposition'])

# Keep numeric columns only. select_dtypes does this in a single pass and,
# unlike calling np.issubdtype on each dtype, does not raise TypeError when a
# column has a pandas extension dtype (e.g. string or categorical).
df = df.select_dtypes(include=[np.number])


In [2]:
# Save the cleaned Kepler DataFrame to CSV; index=False keeps the row index
# out of the file.
df.to_csv('kepler_cleaned.csv', index=False)

---
### BASE DE DATOS - TESS

In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the raw TESS table.
dt = pd.read_csv('tessdata.csv')

# Drop the columns that are not planet attributes or observations.
dt = dt.drop(columns=['rowid','toi','toipfx','tid','ctoi_alias','pl_pnum','rastr','ra','decstr','dec','st_pmra','st_pmdec','toi_created','rowupdate'])

# Keep only rows whose disposition gives a definite yes/no answer.
# `.copy()` detaches the filtered rows from the original frame so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
dt = dt[dt['tfopwg_disp'].isin(['FA', 'FP', 'KP', 'CP'])].copy()

# Binary target: KP and CP count as "is a planet" (1); FA and FP count as
# "not a planet" (0). The result is already an integer column, so no further
# numeric conversion is needed.
dt['is_exoplanet'] = dt['tfopwg_disp'].isin(['KP', 'CP']).astype(int)

# The original disposition column is no longer needed once the label exists.
dt = dt.drop(columns=['tfopwg_disp'])


In [4]:
# Save the cleaned TESS DataFrame to CSV; index=False keeps the row index
# out of the file.
dt.to_csv('tess_cleaned.csv', index=False)

---
### BASE DE DATOS - K2

In [5]:
# ============================================
# K2: Limpieza mínima y dataset etiquetado
# ============================================

import pandas as pd
import numpy as np

# Path of the raw K2 table loaded with pd.read_csv further down in this cell.
FNAME = "k2pandc_2025.10.05_02.35.26.csv"
# Path the cleaned, labelled table is written to at the end of this cell.
OUT   = "k2_cleaned.csv"

# 1) Carga (detección simple de separador)
def detect_sep(path):
    """Guess the column separator of a delimited text file.

    Looks at the first line that is neither blank nor starts with ``#``.

    Args:
        path: Path of the file to inspect. It is read as UTF-8 with
            undecodable bytes ignored.

    Returns:
        ``"\\t"`` if that line contains a tab, otherwise ``","``. Also
        ``","`` when the file has no such line.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        first_data_row = next(
            (row for row in handle if row.strip() and not row.startswith("#")),
            None,
        )

    if first_data_row is None:
        return ","
    if "\t" in first_data_row:
        return "\t"
    return ","

# Load the raw table with the detected separator; '#' marks comment text that
# read_csv does not parse.
sep = detect_sep(FNAME)
df = pd.read_csv(FNAME, sep=sep, comment="#", low_memory=False)

# 2) Binary label from the disposition column
# Use the first column whose name contains "disposition"; fail with a clear
# message instead of a bare IndexError when the table has none.
disp_cols = [c for c in df.columns if "disposition" in c.lower()]
if not disp_cols:
    raise ValueError(f"No column containing 'disposition' found in {FNAME}")
disp_col = disp_cols[0]

# Normalise the text (upper-case, trimmed) before matching.
disp_values = df[disp_col].astype(str).str.upper().str.strip()
is_conf = disp_values.str.contains("CONFIRMED", na=False)
is_fp   = disp_values.str.contains("FALSE POSITIVE", na=False) | disp_values.str.contains("REFUTED", na=False)

# Keep only confirmed planets or false positives / refuted rows.
df = df.loc[is_conf | is_fp].copy()

# Create the label (1 = confirmed, 0 = false positive / refuted) and remove
# the original text column.
df["is_exoplanet"] = is_conf.loc[df.index].astype(int)
df = df.drop(columns=[disp_col])

# 3) Relevant variables
# The disposition column was removed above, so it is intentionally not listed
# here: keeping it would put the raw label among the features.
keep_cols = [
    "pl_orbper", "st_rad", "sy_dist",
    "sy_vmag", "sy_jmag", "sy_gaiamag", "is_exoplanet"
]
# Tolerate tables that lack some of the requested columns.
keep_cols = [c for c in keep_cols if c in df.columns]
df = df[keep_cols].copy()

# 4) Define X, y
X = df.drop(columns=["is_exoplanet"])
y = df["is_exoplanet"]

# 5) Save the cleaned dataset
df.to_csv(OUT, index=False)
print(f"Guardado: {OUT} | filas={len(df)} | cols={df.shape[1]}")


Guardado: k2_cleaned.csv | filas=2630 | cols=7


In [6]:
# Save the cleaned dataset.
# NOTE(review): every line in this cell is commented out, so running it does
# nothing; the K2 cleaning cell already writes the file with
# df.to_csv(OUT, index=False). Consider deleting this cell.
    # clean = X.copy()
    # clean["is_exoplanet"] = y
    # clean.to_csv("k2_cleaned.csv", index=False)
#print("Archivo guardado: k2_cleaned.csv")

# Creación de modelos

## MODELO BOOSTING - KEPLER

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------------------
# 1) Load the cleaned Kepler table
# ---------------------------------------
# sep=None lets the python engine detect the delimiter.
df = pd.read_csv("kepler_cleaned.csv", engine="python", sep=None)

# ---------------------------------------
# 2) Features / target, then split before any preprocessing
# ---------------------------------------
y = df["is_exoplanet"]
X = df.drop("is_exoplanet", axis=1)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Names of the numeric feature columns, taken from the training split.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)


In [8]:
import joblib
import numpy as np

# Numeric branch: KNN imputation (5 neighbours), then standardisation.
# NOTE(review): the imputer runs on unscaled features, so columns with large
# magnitudes may dominate the neighbour distances — consider scaling first.
num_pipe = Pipeline(steps=[
    ("imp", KNNImputer(n_neighbors=5)),
    ("sc", StandardScaler()),
])

# Apply the numeric branch to num_cols; every other column is dropped.
preprocess = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols)],
    remainder="drop",
)

# Full model: preprocessing followed by gradient boosting.
pipe = Pipeline(steps=[
    ("pre", preprocess),
    (
        "clf",
        GradientBoostingClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=3,
            random_state=42,
        ),
    ),
])

# Fit on the training split.
pipe.fit(X_train, y_train)

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(pipe, "boost_kepler.joblib")


['boost_kepler.joblib']

## MODELO BOOSTING - TESS

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the cleaned TESS table; sep=None lets the python engine detect the
# delimiter.
df = pd.read_csv("tess_cleaned.csv", engine="python", sep=None)

# ------------------------------
# 1) Features / target
# ------------------------------
y = df["is_exoplanet"]
X = df.drop("is_exoplanet", axis=1)

# Split first, before any preprocessing: stratified 80/20 hold-out with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------------
# 2) Numeric feature columns (taken from the training split)
# ------------------------------
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)


In [10]:
import joblib
import numpy as np

# Numeric branch: KNN imputation (5 neighbours), then standardisation.
# NOTE(review): the imputer runs on unscaled features, so columns with large
# magnitudes may dominate the neighbour distances — consider scaling first.
num_pipe = Pipeline(steps=[
    ("imp", KNNImputer(n_neighbors=5)),
    ("sc", StandardScaler()),
])

# Apply the numeric branch to num_cols; every other column is dropped.
preprocess = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols)],
    remainder="drop",
)

# Full model: preprocessing followed by gradient boosting.
pipe = Pipeline(steps=[
    ("pre", preprocess),
    (
        "clf",
        GradientBoostingClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=3,
            random_state=42,
        ),
    ),
])

# Fit on the training split.
pipe.fit(X_train, y_train)

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(pipe, "boost_tess.joblib")


['boost_tess.joblib']

## MODELO BOOSTING - K2

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

# =======================================
# LOAD
# =======================================
df = pd.read_csv("k2_cleaned.csv", engine="python")

# =======================================
# FEATURES / TARGET AND SPLIT
# (imputation and scaling happen inside the pipeline built in the next cell)
# =======================================

# Target and predictors (uses 'is_exoplanet' from the cleaned CSV)
X = df.drop(columns=["is_exoplanet"])
y = df["is_exoplanet"]

# Split first: stratified 80/20 hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Numeric feature columns, taken from the training split. Selected with
# np.number and stored as a plain list, the same way as in the Kepler and
# TESS cells, instead of a hard-coded dtype list left as a pandas Index.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()


In [12]:
import joblib
import numpy as np

# Relies on X_train, y_train and num_cols defined in the previous cell.

# Numeric branch: KNN imputation (5 neighbours), then standardisation.
# NOTE(review): the imputer runs on unscaled features, so columns with large
# magnitudes may dominate the neighbour distances — consider scaling first.
num_pipe = Pipeline(steps=[
    ("imp", KNNImputer(n_neighbors=5)),
    ("sc", StandardScaler()),
])

# Apply the numeric branch to num_cols; every other column is dropped.
preprocess = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols)],
    remainder="drop",
)

# Full model: preprocessing followed by gradient boosting.
pipe = Pipeline(steps=[
    ("pre", preprocess),
    (
        "clf",
        GradientBoostingClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=3,
            random_state=42,
        ),
    ),
])

# Train once, on the training split.
pipe.fit(X_train, y_train)

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(pipe, "boost_k2.joblib")


['boost_k2.joblib']