## Limpieza

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display settings: never truncate columns, print at most 100 rows.
for option_name, option_value in (("display.max_columns", None), ("display.max_rows", 100)):
    pd.set_option(option_name, option_value)

In [3]:
# Read the raw HR extract from its relative path and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/hr.csv')
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,sALES eXECUTIVE,4.0,Single,5993.0,19479,8,Y,Yes,11,3,1,80.0,0,8,0.0,1,6,4,0,5.0
1,49.0,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,rESEARCH sCIENTIST,2.0,Married,5130.0,24907,1,Y,No,23,4,4,,1,10,3.0,3,10,7,1,7.0
2,37.0,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,lABORATORY tECHNICIAN,3.0,Single,2090.0,2396,6,Y,Yes,15,3,2,,0,7,3.0,3,0,0,0,0.0
3,33.0,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,rESEARCH sCIENTIST,3.0,Married,2909.0,23159,1,Y,Yes,11,3,3,80.0,0,8,3.0,3,8,7,3,0.0
4,27.0,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,lABORATORY tECHNICIAN,2.0,Married,3468.0,16632,9,Y,No,12,3,4,80.0,1,6,3.0,3,2,2,2,2.0


La columna 'EmployeeNumber' pasa a ser el índice y la renombramos como 'ID'

In [4]:
# Promote the employee number to the row index, then label that index "ID".
df = df.set_index(keys='EmployeeNumber')
df.index.name = "ID"

Borramos 3 columnas

In [5]:
# Remove the three unwanted columns with a single drop call.
unused_columns = ["Over18", "StandardHours", "EmployeeCount"]
df = df.drop(columns=unused_columns)

Cambiamos el tipo de datos en varias columnas

In [6]:
# 'Int64' is pandas' nullable integer dtype, so missing entries stay missing;
# the three rate columns are cast to float.
nullable_int_columns = ['Age', 'JobSatisfaction', 'TrainingTimesLastYear', 'YearsWithCurrManager']
float_columns = ['DailyRate', 'HourlyRate', 'MonthlyRate']

column_dtypes = {name: 'Int64' for name in nullable_int_columns}
column_dtypes.update({name: float for name in float_columns})

df = df.astype(column_dtypes)

Corregimos valores mal escritos en 'MaritalStatus' y 'JobRole' y abreviamos las categorías de la columna 'BusinessTravel'

In [7]:
# Fix the misspelled marital status.
marital_status = df['MaritalStatus']
df['MaritalStatus'] = marital_status.str.replace('Marreid', 'Married')

# Job roles: trim surrounding whitespace, then apply Title Case.
job_role = df['JobRole'].str.strip()
df['JobRole'] = job_role.str.title()

# Shorten the business-travel categories, applying the replacements in order.
travel_replacements = (
    ('Travel_Rarely', 'Rarely'),
    ('Travel_Frequently', 'Frequently'),
    ('Non-Travel', 'Non'),
)
business_travel = df['BusinessTravel']
for old_label, new_label in travel_replacements:
    business_travel = business_travel.str.replace(old_label, new_label)
df['BusinessTravel'] = business_travel

Creamos diccionarios para mapear los valores numéricos de varias columnas a etiquetas de texto, que resultan más fáciles de interpretar

In [8]:
# Code -> label lookups; codes start at 1 and follow the order of each list.

# Four-point scale shared by the survey-style columns.
dict_encuestas = dict(enumerate(
    ["Nada satisfecho", "Insatisfecho", "Satisfecho", "Muy satisfecho"],
    start=1,
))

# Education level.
# NOTE(review): "Educacion básica" has no accent here, while clean_hr_data uses
# "Educación básica" — confirm which spelling downstream consumers expect.
dict_education = dict(enumerate(
    [
        "Sin estudios",
        "Educacion básica",
        "FP/Bachiller",
        "Estudios universitarios",
        "Estudios de post-grado",
    ],
    start=1,
))

# Job level.
dict_joblevel = dict(enumerate(
    ["Becario", "Junior", "Senior", "Manager", "Director"],
    start=1,
))

Sustituimos los códigos numéricos de las columnas de encuestas, de formación y de nivel profesional por las etiquetas de los diccionarios creados

In [10]:
# Every survey-style column shares the same four-point scale, so one loop
# replaces its numeric codes with the text labels.
survey_columns = [
    "EnvironmentSatisfaction",
    "JobInvolvement",
    "JobSatisfaction",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "WorkLifeBalance",
]
for survey_column in survey_columns:
    df[survey_column] = df[survey_column].map(dict_encuestas)

In [11]:
# Replace the education codes with their text labels.
df = df.assign(Education=lambda frame: frame["Education"].map(dict_education))

In [12]:
# Replace the job-level codes with their text labels.
df = df.assign(JobLevel=lambda frame: frame["JobLevel"].map(dict_joblevel))

Comprobamos que los cambios se han efectuado

In [13]:
# Print each column's dtype and non-null count to confirm the casts and mappings applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1474 entries, 1 to 86
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1401 non-null   Int64  
 1   Attrition                 1474 non-null   object 
 2   BusinessTravel            1357 non-null   object 
 3   DailyRate                 1474 non-null   float64
 4   Department                1445 non-null   object 
 5   DistanceFromHome          1474 non-null   int64  
 6   Education                 1474 non-null   object 
 7   EducationField            1416 non-null   object 
 8   EnvironmentSatisfaction   1474 non-null   object 
 9   Gender                    1474 non-null   object 
 10  HourlyRate                1474 non-null   float64
 11  JobInvolvement            1474 non-null   object 
 12  JobLevel                  1474 non-null   object 
 13  JobRole                   1474 non-null   object 
 14  JobSatisfaction

In [14]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1,41,Yes,Rarely,1102.0,Sales,1,Educacion básica,Life Sciences,Insatisfecho,Female,94.0,Satisfecho,Junior,Sales Executive,Muy satisfecho,Single,5993.0,19479.0,8,Yes,11,Satisfecho,Nada satisfecho,0,8,0,Nada satisfecho,6,4,0,5
2,49,No,Frequently,279.0,Research & Development,8,Sin estudios,Life Sciences,Satisfecho,Male,61.0,Insatisfecho,Junior,Research Scientist,Insatisfecho,Married,5130.0,24907.0,1,No,23,Muy satisfecho,Muy satisfecho,1,10,3,Satisfecho,10,7,1,7
4,37,Yes,Rarely,1373.0,Research & Development,2,Educacion básica,Other,Muy satisfecho,Male,92.0,Insatisfecho,Becario,Laboratory Technician,Satisfecho,Single,2090.0,2396.0,6,Yes,15,Satisfecho,Insatisfecho,0,7,3,Satisfecho,0,0,0,0
5,33,No,Frequently,1392.0,Research & Development,3,Estudios universitarios,Life Sciences,Muy satisfecho,Female,56.0,Satisfecho,Becario,Research Scientist,Satisfecho,Married,2909.0,23159.0,1,Yes,11,Satisfecho,Satisfecho,0,8,3,Satisfecho,8,7,3,0
7,27,No,Rarely,591.0,Research & Development,2,Sin estudios,Medical,Nada satisfecho,Male,40.0,Satisfecho,Becario,Laboratory Technician,Insatisfecho,Married,3468.0,16632.0,9,No,12,Satisfecho,Muy satisfecho,1,6,3,Satisfecho,2,2,2,2


Lo pasamos a un csv ya limpio

In [15]:
# Persist the cleaned frame; with to_csv's defaults the index (named "ID")
# is written as the first column.
df.to_csv(path_or_buf='../data/processed/hr_processed.csv')

In [None]:
import pandas as pd
import numpy as np

def _map_ordinal_codes(values: pd.Series, mapping: dict, column: str) -> pd.Series:
    """Translate numeric codes to labels, warning when a code has no label."""
    import warnings

    labelled = values.map(mapping)

    # ``map`` turns any code missing from ``mapping`` into NaN; flag entries that
    # held a value before the mapping but are missing afterwards.
    lost = values.notna().to_numpy() & labelled.isna().to_numpy()
    if lost.any():
        unknown_codes = values[lost].unique().tolist()
        warnings.warn(
            f"Column '{column}': {int(lost.sum())} value(s) outside the expected "
            f"scale were set to NaN: {unknown_codes}",
            stacklevel=3,
        )
    return labelled


def clean_hr_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardise the raw HR dataset.

    Steps performed, in order:

    1. Rename the columns to snake_case.
    2. Use ``employee_id`` (originally ``EmployeeNumber``) as the index.
    3. Drop ``over_18``, ``standard_hours`` and ``employee_count`` when present.
    4. Fix the ``Marreid`` typo, strip and title-case job roles, and shorten the
       business-travel categories.
    5. Cast ages and counters to nullable ``Int64`` and the rate columns to float.
    6. Replace the numeric codes of the ordinal columns with text labels.

    Missing values are NOT imputed: nulls present in the input are propagated
    unchanged. The index is not checked for uniqueness.

    Args:
        df: Raw HR data using the original column names. It is not modified.

    Returns:
        A new DataFrame indexed by ``employee_id`` with the cleaned columns.

    Raises:
        KeyError: If a required column (for example ``EmployeeNumber``) is missing.

    Warns:
        UserWarning: If an ordinal column contains a code that is not part of its
            scale. Such values become NaN in the result.
    """

    # ===============================
    # 1. Column names
    # ===============================
    # snake_case names are used so the columns work unchanged as Python
    # identifiers and as SQL/ETL column names.
    rename_columns = {
        "Age": "age",
        "Attrition": "attrition",
        "BusinessTravel": "business_travel",
        "DailyRate": "daily_rate",
        "Department": "department",
        "DistanceFromHome": "distance_from_home",
        "Education": "education",
        "EducationField": "education_field",
        "EmployeeNumber": "employee_id",
        "EnvironmentSatisfaction": "env_satisfaction",
        "Gender": "gender",
        "HourlyRate": "hourly_rate",
        "JobInvolvement": "job_involvement",
        "JobLevel": "job_level",
        "JobRole": "job_role",
        "JobSatisfaction": "job_satisfaction",
        "MaritalStatus": "marital_status",
        "MonthlyIncome": "monthly_income",
        "MonthlyRate": "monthly_rate",
        "NumCompaniesWorked": "num_companies_worked",
        "OverTime": "over_time",
        "PercentSalaryHike": "salary_hike_pct",
        "PerformanceRating": "performance_score",
        "RelationshipSatisfaction": "rel_satisfaction",
        "StockOptionLevel": "stock_option_level",
        "TotalWorkingYears": "total_years_worked",
        "TrainingTimesLastYear": "training_last_year",
        "WorkLifeBalance": "work_life_balance",
        "YearsAtCompany": "years_at_company",
        "YearsInCurrentRole": "years_in_current_role",
        "YearsSinceLastPromotion": "years_since_last_promotion",
        "YearsWithCurrManager": "years_with_current_manager",
        "Over18": "over_18",
        "StandardHours": "standard_hours",
        "EmployeeCount": "employee_count",
    }

    # ===============================
    # 2-3. Index and unused columns
    # ===============================
    # Each call returns a new frame, so the caller's DataFrame is never mutated.
    cleaned = (
        df.rename(columns=rename_columns)
        .set_index("employee_id")
        .drop(columns=["over_18", "standard_hours", "employee_count"], errors="ignore")
    )

    # ===============================
    # 4. Text normalisation
    # ===============================
    # regex=False keeps every replacement a literal substring swap regardless of
    # the pandas version's default for ``regex``.
    cleaned["marital_status"] = cleaned["marital_status"].str.replace(
        "Marreid", "Married", regex=False
    )
    cleaned["job_role"] = cleaned["job_role"].str.strip().str.title()

    travel_labels = {
        "Travel_Rarely": "Rarely",
        "Travel_Frequently": "Frequently",
        "Non-Travel": "Non",
    }
    business_travel = cleaned["business_travel"]
    for old_label, new_label in travel_labels.items():
        business_travel = business_travel.str.replace(old_label, new_label, regex=False)
    cleaned["business_travel"] = business_travel

    # ===============================
    # 5. Missing values
    # ===============================
    # TODO: no imputation strategy has been defined yet (neither categorical nor
    # numeric); nulls pass through untouched.

    # ===============================
    # 6. Data types
    # ===============================
    dtype_mapping = {
        "age": "Int64",
        "daily_rate": float,
        "hourly_rate": float,
        "monthly_rate": float,
        "training_last_year": "Int64",
        "years_with_current_manager": "Int64",
    }
    cleaned = cleaned.astype(dtype_mapping)

    # ===============================
    # 7. Ordinal codes -> labels
    # ===============================
    satisfaction_map = {
        1: "Nada satisfecho",
        2: "Insatisfecho",
        3: "Satisfecho",
        4: "Muy satisfecho",
    }

    education_map = {
        1: "Sin estudios",
        2: "Educación básica",
        3: "FP/Bachiller",
        4: "Estudios universitarios",
        5: "Postgrado",
    }

    job_level_map = {
        1: "Becario",
        2: "Junior",
        3: "Senior",
        4: "Manager",
        5: "Director",
    }

    satisfaction_cols = [
        "env_satisfaction",
        "job_involvement",
        "job_satisfaction",
        "performance_score",
        "rel_satisfaction",
        "work_life_balance",
    ]

    # Survey-style columns are optional: only the ones present are mapped.
    for col in satisfaction_cols:
        if col in cleaned.columns:
            cleaned[col] = _map_ordinal_codes(cleaned[col], satisfaction_map, col)

    cleaned["education"] = _map_ordinal_codes(cleaned["education"], education_map, "education")
    cleaned["job_level"] = _map_ordinal_codes(cleaned["job_level"], job_level_map, "job_level")

    return cleaned