In [6]:
# limpiar_y_guardar_datos.py

import pandas as pd
import numpy as np

# 1. Load the raw survey dataset
df = pd.read_csv("data/survey_results_public.csv")
print("📦 Forma original:", df.shape)

# 2. Select the columns used downstream (features plus the salary target)
columnas_modelo = [
    "Country",
    "EdLevel",
    "YearsCodePro",
    "Employment",
    "RemoteWork",
    "OrgSize",
    "DevType",
    "AISelect",
    "ConvertedCompYearly",
]
df_modelo = df[columnas_modelo].copy()


# 3. Clean missing values and professional experience
def _convertir_experiencia(valor):
    """Convert a raw ``YearsCodePro`` value into a float number of years.

    The two textual buckets are mapped to fixed numbers
    ("Less than 1 year" -> 0.5, "More than 50 years" -> 51.0); values made
    only of digits and dots are parsed with ``float``; anything else
    becomes NaN so the row can be dropped afterwards.
    """
    if valor == "Less than 1 year":
        return 0.5
    if valor == "More than 50 years":
        return 51.0
    if str(valor).replace(".", "").isdigit():
        try:
            return float(valor)
        except ValueError:
            # Strings such as "1.2.3" pass the digit check but are not numbers.
            return np.nan
    return np.nan


# Rows without a salary or without an experience answer cannot be used.
df_modelo.dropna(subset=["ConvertedCompYearly", "YearsCodePro"], inplace=True)
df_modelo["YearsCodePro"] = df_modelo["YearsCodePro"].apply(_convertir_experiencia)
# Drop the rows whose experience value could not be parsed.
df_modelo.dropna(subset=["YearsCodePro"], inplace=True)

# 4. Remove salary outliers with the 1.5 * IQR rule (bounds are inclusive)
salario = df_modelo["ConvertedCompYearly"]
q1 = salario.quantile(0.25)
q3 = salario.quantile(0.75)
iqr = q3 - q1
lim_inf = q1 - 1.5 * iqr
lim_sup = q3 + 1.5 * iqr
# .copy() yields an independent frame, so the column assignments performed
# later do not write into a slice of the unfiltered data
# (avoids pandas' SettingWithCopyWarning).
df_modelo = df_modelo[salario.between(lim_inf, lim_sup)].copy()

# 5. Limpiar categóricas
def limpiar_nivel(x):
    """Collapse a raw ``EdLevel`` value into a coarse education category.

    Args:
        x: The raw education-level value. Expected to be a string; any
            non-string value (e.g. NaN or None for a missing answer) is
            accepted and treated as unclassifiable.

    Returns:
        One of "Bachelor", "Master", "Doctorate", "NoDegree" or "Other".
        The first category (in that order) whose keyword appears in ``x``
        wins; "Other" is returned when nothing matches.
    """
    if not isinstance(x, str):
        # Missing values reach this function as float NaN / None, and the
        # substring tests below would raise TypeError on them.
        return "Other"

    # Ordered (label, keywords) pairs: order matters because the first
    # matching category is returned.
    categorias = (
        ("Bachelor", ("Bachelor",)),
        ("Master", ("Master",)),
        ("Doctorate", ("Professional", "doctoral", "Ph.D")),
        ("NoDegree", ("Some college", "Secondary", "Primary")),
    )
    for etiqueta, fragmentos in categorias:
        if any(fragmento in x for fragmento in fragmentos):
            return etiqueta
    return "Other"

# Categorical columns that are one-hot encoded below.
columnas_categoricas = [
    "Country",
    "EdLevel",
    "Employment",
    "RemoteWork",
    "OrgSize",
    "DevType",
    "AISelect",
]

# Give missing answers their own "Unknown" category in every categorical
# column. With drop_first=True a NaN row would otherwise be encoded as all
# zeros, i.e. indistinguishable from the dropped baseline category, and the
# string transforms below would fail on NaN.
for columna in columnas_categoricas:
    df_modelo[columna] = df_modelo[columna].fillna("Unknown")

# Collapse education answers into coarse categories ("Unknown" has no
# matching keyword in limpiar_nivel, so it ends up as "Other").
df_modelo["EdLevel"] = df_modelo["EdLevel"].apply(limpiar_nivel)
# Employment may hold several ';'-separated answers: keep only the first one.
df_modelo["Employment"] = df_modelo["Employment"].apply(
    lambda x: x.split(";")[0].strip()
)

# 6. One-hot encode the categorical variables (first level dropped as baseline)
df_final = pd.get_dummies(
    df_modelo,
    columns=columnas_categoricas,
    drop_first=True,
)

# 7. Compute the global salary statistic on the cleaned, un-encoded data
salario_promedio = df_modelo["ConvertedCompYearly"].mean()
print(f"💰 Salario promedio global: ${salario_promedio:,.2f}")

# 8. Save the encoded dataset (row index is not written)
df_final.to_csv("data/datos_limpios_modelo.csv", index=False)
print("✅ Datos limpios guardados en: data/datos_limpios_modelo.csv")
print("🔍 Forma final:", df_final.shape)


📦 Forma original: (65437, 114)
💰 Salario promedio global: $71,426.66
✅ Datos limpios guardados en: data/datos_limpios_modelo.csv
🔍 Forma final: (22372, 224)


In [None]:
# # limpiar_y_guardar_datos.py

# import pandas as pd
# import numpy as np

# # 1. Cargar el dataset
# df = pd.read_csv("data/survey_results_public.csv")
# print("📦 Forma original:", df.shape)

# # 2. Seleccionar columnas disponibles
# columnas_modelo = [
#     "Country",  # <- necesario solo para filtrar
#     "EdLevel",
#     "YearsCodePro",
#     "Employment",
#     "RemoteWork",
#     "OrgSize",
#     "DevType",
#     "AISelect",
#     "ConvertedCompYearly"
# ]
# df_modelo = df[columnas_modelo].copy()

# # 3. Filtrar solo Estados Unidos (✅)
# df_modelo = df_modelo[df_modelo["Country"] == "United States of America"]

# # 4. Limpiar valores nulos y experiencia
# df_modelo.dropna(subset=["ConvertedCompYearly", "YearsCodePro"], inplace=True)
# df_modelo["YearsCodePro"] = df_modelo["YearsCodePro"].apply(
#     lambda x: 0.5 if x == "Less than 1 year"
#     else 51 if x == "More than 50 years"
#     else float(x) if str(x).replace(".", "").isdigit()
#     else np.nan
# )
# df_modelo.dropna(subset=["YearsCodePro"], inplace=True)

# # 5. Eliminar outliers en salario
# q1 = df_modelo["ConvertedCompYearly"].quantile(0.25)
# q3 = df_modelo["ConvertedCompYearly"].quantile(0.75)
# iqr = q3 - q1
# lim_inf = q1 - 1.5 * iqr
# lim_sup = q3 + 1.5 * iqr
# df_modelo = df_modelo[
#     (df_modelo["ConvertedCompYearly"] >= lim_inf) &
#     (df_modelo["ConvertedCompYearly"] <= lim_sup)
# ]

# # 6. Limpiar categóricas
# def limpiar_nivel(x):
#     if "Bachelor" in x:
#         return "Bachelor"
#     elif "Master" in x:
#         return "Master"
#     elif "Professional" in x or "doctoral" in x or "Ph.D" in x:
#         return "Doctorate"
#     elif "Some college" in x or "Secondary" in x or "Primary" in x:
#         return "NoDegree"
#     else:
#         return "Other"

# df_modelo["EdLevel"] = df_modelo["EdLevel"].apply(limpiar_nivel)
# df_modelo["Employment"] = df_modelo["Employment"].apply(lambda x: x.split(";")[0].strip())
# df_modelo["RemoteWork"] = df_modelo["RemoteWork"].fillna("Unknown")
# df_modelo["DevType"] = df_modelo["DevType"].fillna("Unknown")
# df_modelo["AISelect"] = df_modelo["AISelect"].fillna("Unknown")
# df_modelo["OrgSize"] = df_modelo["OrgSize"].fillna("Unknown")

# # 7. Eliminar columna Country (❌ ya no se usa)
# df_modelo.drop(columns=["Country"], inplace=True)

# # 8. Codificar variables categóricas
# df_final = pd.get_dummies(df_modelo, columns=[
#     "EdLevel", "Employment", "RemoteWork", "OrgSize", "DevType", "AISelect"
# ], drop_first=True)

# # 9. Guardar resultados
# df_final.to_csv("data/datos_limpios_modelo.csv", index=False)
# print("✅ Datos limpios guardados en: data/datos_limpios_modelo.csv")
# print("🔍 Forma final:", df_final.shape)


📦 Forma original: (65437, 114)
✅ Datos limpios guardados en: data/datos_limpios_modelo.csv
🔍 Forma final: (4436, 60)
