In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('hr.csv')

In [9]:
def eda_revision_completa(df):
    """ EDA avanzada para revisi√≥n r√°pida de DataFrames con: 
        - Estad√≠sticas num√©ricas y categ√≥ricas 
        - Alertas de calidad y outliers 
        - Granularidad de nulos 
        - Sugerencias de mapeo para escalas ordinales 1-4 
        """
    filas, columnas = df.shape
    print("üìê DIMENSIONES Y DUPLICADOS")
    print(f"Filas: {filas} | Columnas: {columnas}")
    duplicados = df.duplicated().sum()
    print(f"Filas duplicadas: {duplicados} ({duplicados/filas*100:.2f}%)")
    print("-"*50)

    # ----------------------------
    # Calidad de columnas
    # ----------------------------
    print("üöë CALIDAD DE COLUMNAS")
    calidad = pd.DataFrame({
        'Tipo': df.dtypes,
        'Nulos': df.isnull().sum(),
        '% Nulos': (df.isnull().sum() / filas * 100).round(2),
        'Valores √önicos': df.nunique(),
        '% Cardinalidad': (df.nunique() / filas * 100).round(2)
    })
    display(calidad)
    
    # Granularidad de nulos
    print("\nüìå NULOS - RESUMEN POR UMBRALES")
    umbrales = [0, 10, 25, 50, 75, 90]
    for umbral in umbrales:
        cols = calidad[calidad['% Nulos'] > umbral].index.tolist()
        if cols:
            print(f"- Columnas con >{umbral}% nulos: {cols}")

    # ----------------------------
    # Alertas de calidad
    # ----------------------------
    print("\nüö® ALERTAS DE CALIDAD")
    constantes = calidad[calidad['Valores √önicos']==1].index.tolist()
    if constantes:
        print(f"‚ö†Ô∏è Columnas sin variaci√≥n (se pueden eliminar): {constantes}")
    posibles_ids = calidad[calidad['% Cardinalidad']>95].index.tolist()
    if posibles_ids:
        print(f"üÜî Posibles IDs o √≠ndices: {posibles_ids}")
    cols_cat = df.select_dtypes(include=['object','category']).columns
    for col in cols_cat:
        rare = df[col].value_counts(normalize=True) < 0.05
        if rare.any():
            print(f"üéØ {col} tiene categor√≠as raras (<5% de ocurrencias): {list(rare[rare].index)}")

    # ----------------------------
    # Estad√≠sticas num√©ricas
    # ----------------------------
    print("\nüìä ESTAD√çSTICAS NUM√âRICAS")
    num_cols = df.select_dtypes(include=['int64','float64']).columns
    display(df[num_cols].describe().T)

    print("\n‚ö†Ô∏è DETECCI√ìN DE OUTLIERS (IQR 1.5x)")
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)][col].count()
        if outliers > 0:
            print(f"- {col}: {outliers} outliers ({outliers/filas*100:.2f}%)")

    # ----------------------------
    # Estad√≠sticas categ√≥ricas
    # ----------------------------
    print("\nüî§ ESTAD√çSTICAS CATEG√ìRICAS")
    if len(cols_cat) > 0:
        desc_cat = df[cols_cat].describe().T
        display(desc_cat)
        for col in cols_cat:
            print(f"- {col} top 3 valores:")
            vc = df[col].value_counts(dropna=False)
            print(vc.head(3).to_dict())
            print("")

    # ----------------------------
    # Sugerencias de mapeo para escalas ordinales 1-4
    # ----------------------------
    print("\nüí° SUGERENCIAS DE MAPEO PARA ESCALAS 1-4")
    escala_map = {1:'Muy Insatisfecho',2:'Insatisfecho',3:'Satisfecho',4:'Muy Satisfecho'}
    cols_ordinales = [col for col in num_cols if set(df[col].dropna().unique()).issubset({1,2,3,4})]
    if cols_ordinales:
        print(f"Columnas sugeridas para mapear: {cols_ordinales}")
        print("Escala sugerida:")
        for val,label in escala_map.items():
            print(f"{val} -> {label}")
    else:
        print("No se detectaron columnas con escala 1-4 para mapear.")

    print("\nüèÅ REVISI√ìN COMPLETA FINALIZADA")

In [10]:
eda_revision_completa(df)

üìê DIMENSIONES Y DUPLICADOS
Filas: 1474 | Columnas: 35
Filas duplicadas: 4 (0.27%)
--------------------------------------------------
üöë CALIDAD DE COLUMNAS


Unnamed: 0,Tipo,Nulos,% Nulos,Valores √önicos,% Cardinalidad
Age,float64,73,4.95,43,2.92
Attrition,object,0,0.0,2,0.14
BusinessTravel,object,117,7.94,3,0.2
DailyRate,int64,0,0.0,886,60.11
Department,object,29,1.97,3,0.2
DistanceFromHome,int64,0,0.0,29,1.97
Education,int64,0,0.0,5,0.34
EducationField,object,58,3.93,6,0.41
EmployeeCount,int64,0,0.0,1,0.07
EmployeeNumber,int64,0,0.0,1470,99.73



üìå NULOS - RESUMEN POR UMBRALES
- Columnas con >0% nulos: ['Age', 'BusinessTravel', 'Department', 'EducationField', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'OverTime', 'StandardHours', 'TrainingTimesLastYear', 'YearsWithCurrManager']
- Columnas con >10% nulos: ['StandardHours', 'YearsWithCurrManager']

üö® ALERTAS DE CALIDAD
‚ö†Ô∏è Columnas sin variaci√≥n (se pueden eliminar): ['EmployeeCount', 'Over18', 'StandardHours']
üÜî Posibles IDs o √≠ndices: ['EmployeeNumber', 'MonthlyRate']
üéØ Department tiene categor√≠as raras (<5% de ocurrencias): ['Human Resources']
üéØ EducationField tiene categor√≠as raras (<5% de ocurrencias): ['Human Resources']
üéØ JobRole tiene categor√≠as raras (<5% de ocurrencias): [' hUMAN rESOURCES ']
üéØ MaritalStatus tiene categor√≠as raras (<5% de ocurrencias): ['Marreid']

üìä ESTAD√çSTICAS NUM√âRICAS


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1401.0,36.944325,9.105009,18.0,30.0,36.0,43.0,60.0
DailyRate,1474.0,802.702171,403.53953,102.0,465.0,803.0,1157.0,1499.0
DistanceFromHome,1474.0,9.199457,8.104266,1.0,2.0,7.0,14.0,29.0
Education,1474.0,2.911126,1.024267,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1474.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1474.0,1024.471506,602.421193,1.0,488.75,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1474.0,2.723881,1.09328,1.0,2.0,3.0,4.0,4.0
HourlyRate,1474.0,65.887381,20.310444,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1474.0,2.729986,0.712861,1.0,2.0,3.0,3.0,4.0
JobLevel,1474.0,2.063772,1.106055,1.0,1.0,2.0,3.0,5.0



‚ö†Ô∏è DETECCI√ìN DE OUTLIERS (IQR 1.5x)
- MonthlyIncome: 113 outliers (7.67%)
- NumCompaniesWorked: 52 outliers (3.53%)
- PerformanceRating: 226 outliers (15.33%)
- StockOptionLevel: 85 outliers (5.77%)
- TotalWorkingYears: 63 outliers (4.27%)
- TrainingTimesLastYear: 227 outliers (15.40%)
- YearsAtCompany: 104 outliers (7.06%)
- YearsInCurrentRole: 21 outliers (1.42%)
- YearsSinceLastPromotion: 107 outliers (7.26%)
- YearsWithCurrManager: 12 outliers (0.81%)

üî§ ESTAD√çSTICAS CATEG√ìRICAS


Unnamed: 0,count,unique,top,freq
Attrition,1474,2,No,1236
BusinessTravel,1357,3,Travel_Rarely,955
Department,1445,3,Research & Development,941
EducationField,1416,6,Life Sciences,582
Gender,1474,2,Male,885
JobRole,1474,9,sALES eXECUTIVE,327
MaritalStatus,1342,4,Married,604
Over18,1474,1,Y,1474
OverTime,1430,2,No,1025


- Attrition top 3 valores:
{'No': 1236, 'Yes': 238}

- BusinessTravel top 3 valores:
{'Travel_Rarely': 955, 'Travel_Frequently': 263, 'Non-Travel': 139}

- Department top 3 valores:
{'Research & Development': 941, 'Sales': 440, 'Human Resources': 64}

- EducationField top 3 valores:
{'Life Sciences': 582, 'Medical': 449, 'Marketing': 152}

- Gender top 3 valores:
{'Male': 885, 'Female': 589}

- JobRole top 3 valores:
{' sALES eXECUTIVE ': 327, ' rESEARCH sCIENTIST ': 293, ' lABORATORY tECHNICIAN ': 259}

- MaritalStatus top 3 valores:
{'Married': 604, 'Single': 437, 'Divorced': 298}

- Over18 top 3 valores:
{'Y': 1474}

- OverTime top 3 valores:
{'No': 1025, 'Yes': 405, nan: 44}


üí° SUGERENCIAS DE MAPEO PARA ESCALAS 1-4
Columnas sugeridas para mapear: ['EmployeeCount', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance']
Escala sugerida:
1 -> Muy Insatisfecho
2 -> Insatisfecho
3 -> Satisfecho
4 -> Muy Sat

In [None]:
# Limpieza espec√≠fica del dataset
df.set_index(['EmployeeNumber'], inplace = True)
df.index.name = 'ID'
df.drop("Over18", axis = 1, inplace = True)
df.drop("StandardHours", axis = 1, inplace = True)
df.drop("EmployeeCount", axis = 1, inplace = True)
df = df.astype({'Age': 'Int64', 'DailyRate': float, 'HourlyRate': float, 'JobSatisfaction': 'Int64', 'MonthlyRate': float, 'TrainingTimesLastYear': 'Int64', 'YearsWithCurrManager': 'Int64'})
df['MaritalStatus'] = df['MaritalStatus'].str.replace('Marreid', 'Married')
df['JobRole'] = df['JobRole'].str.strip().str.title()
df['BusinessTravel'] = df['BusinessTravel'].str.replace('Travel_Rarely', 'Rarely').str.replace('Travel_Frequently', 'Frequently').str.replace('Non-Travel', 'Non')

In [12]:
df.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41,Yes,Rarely,1102.0,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
2,49,No,Frequently,279.0,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
4,37,Yes,Rarely,1373.0,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
5,33,No,Frequently,1392.0,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
7,27,No,Rarely,591.0,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [13]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')