# Paso 1: Importar las librerías requeridas

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from datetime import datetime

# Paso 2: Leer el archivo CSV

In [2]:
EmpleadosAttrition = pd.read_csv('empleadosRETO.csv')


# Paso 3: Eliminar columnas innecesarias


In [3]:
columnas_a_eliminar = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
EmpleadosAttrition = EmpleadosAttrition.drop(columns=columnas_a_eliminar)

# Paso 4 y 5: Crear columna Year

In [4]:
EmpleadosAttrition['Year'] = pd.to_datetime(EmpleadosAttrition['HiringDate'], format='%d/%m/%Y', errors='coerce').dt.year
mask = EmpleadosAttrition['Year'].isna()
EmpleadosAttrition.loc[mask, 'Year'] = pd.to_datetime(EmpleadosAttrition.loc[mask, 'HiringDate'], format='%m/%d/%Y', errors='coerce').dt.year

mask = EmpleadosAttrition['Year'].isna()
EmpleadosAttrition.loc[mask, 'Year'] = pd.to_datetime(EmpleadosAttrition.loc[mask, 'HiringDate'], format='%Y-%m-%d', errors='coerce').dt.year

if EmpleadosAttrition['Year'].isna().any():
    print("Algunas fechas no pudieron ser parseadas.")
    #imprimir las filas problemáticas:
    print(EmpleadosAttrition[EmpleadosAttrition['Year'].isna()])

EmpleadosAttrition['Year'] = EmpleadosAttrition['Year'].fillna(0).astype(int)


Algunas fechas no pudieron ser parseadas.
     Age BusinessTravel Department DistanceFromHome  Education EducationField  \
229   36  Travel_Rarely      Sales            11 km          4      Marketing   

     EnvironmentSatisfaction  Gender  JobInvolvement  JobLevel  ...  \
229                        2  Female               2         2  ...   

    PercentSalaryHike  PerformanceRating RelationshipSatisfaction  \
229                13                  3                        1   

     TotalWorkingYears  TrainingTimesLastYear WorkLifeBalance  \
229                  8                      2               2   

    YearsInCurrentRole  YearsSinceLastPromotion  Attrition  Year  
229                  3                        0         No   NaN  

[1 rows x 27 columns]


In [5]:
EmpleadosAttrition.loc[229, "Year"] = 2012


# Paso 6: Crear columna YearsAtCompany

In [6]:
EmpleadosAttrition['YearsAtCompany'] = 2018 - EmpleadosAttrition['Year']


# Paso 7 y 8: Renombrar DistanceFromHome

In [7]:
EmpleadosAttrition = EmpleadosAttrition.rename(columns={'DistanceFromHome': 'DistanceFromHome_km'})


# Paso 9: Crear nueva variable DistanceFromHome como entero

In [8]:
EmpleadosAttrition['DistanceFromHome'] = EmpleadosAttrition['DistanceFromHome_km'].str.replace(' km', '').astype(int)


# Paso 10: Borrar columnas innecesarias

In [9]:
EmpleadosAttrition = EmpleadosAttrition.drop(columns=['Year', 'HiringDate', 'DistanceFromHome_km'])


# Paso 11: Crear frame SueldoPromedioDepto

In [10]:
SueldoPromedioDepto = EmpleadosAttrition.groupby('Department')['MonthlyIncome'].mean().reset_index()
SueldoPromedioDepto = SueldoPromedioDepto.rename(columns={'MonthlyIncome': 'SueldoPromedio'})


# Paso 12: Escalar MonthlyIncome


In [11]:
scaler = StandardScaler()
EmpleadosAttrition['MonthlyIncome'] = scaler.fit_transform(EmpleadosAttrition[['MonthlyIncome']])


In [12]:
EmpleadosAttrition.Attrition.unique()

array(['No', 'Yes'], dtype=object)

# Paso 13: Convertir variables categóricas a numéricas


In [13]:
categorical_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Attrition']
EmpleadosAttrition = pd.get_dummies(EmpleadosAttrition, columns=categorical_columns, drop_first=True)

print(EmpleadosAttrition.columns)

Index(['Age', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement',
       'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsAtCompany', 'DistanceFromHome',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'Ma

In [14]:
non_numeric_columns = EmpleadosAttrition.select_dtypes(exclude=[np.number]).columns
print("Columnas no numéricas:", non_numeric_columns)
EmpleadosAttrition = pd.get_dummies(EmpleadosAttrition, columns=non_numeric_columns, drop_first=True)
print("¿Todas las columnas son numéricas?", EmpleadosAttrition.select_dtypes(include=[np.number]).columns.equals(EmpleadosAttrition.columns))

Columnas no numéricas: Index(['OverTime', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Male', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Attrition_Yes'],
      dtype='object')
¿Todas las columnas son numéricas? False


# Paso 14: Calcular correlación con Attrition


In [15]:
correlations = EmpleadosAttrition.corr()['Attrition_Yes_True'].abs().sort_values(ascending=False)

print(correlations)

Attrition_Yes_True                        1.000000
OverTime_Yes                              0.324777
JobLevel                                  0.214266
TotalWorkingYears                         0.213329
Age                                       0.212121
MaritalStatus_Single_True                 0.205849
YearsInCurrentRole                        0.203918
MonthlyIncome                             0.194936
JobRole_Sales Representative_True         0.191294
YearsAtCompany                            0.176001
JobInvolvement                            0.166785
JobSatisfaction                           0.164957
EducationField_Technical Degree_True      0.129104
JobRole_Laboratory Technician_True        0.125264
EnvironmentSatisfaction                   0.124327
JobRole_Research Director_True            0.116263
MaritalStatus_Married_True                0.094734
JobRole_Manager_True                      0.089885
Department_Research & Development_True    0.072269
TrainingTimesLastYear          

# Paso 15: Seleccionar variables con correlación >= 0.1


In [16]:
selected_columns = correlations[correlations >= 0.1].index.tolist()
EmpleadosAttritionFinal = EmpleadosAttrition[selected_columns]


# Paso 16: Crear EmpleadosAttritionPCA


In [17]:
pca = PCA()
EmpleadosAttritionPCA = pca.fit_transform(EmpleadosAttritionFinal.drop('Attrition_Yes_True', axis=1))


# Paso 17: Agregar componentes principales que expliquen el 80% de la varianza


In [18]:
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance_ratio >= 0.8) + 1

for i in range(n_components):
    EmpleadosAttritionFinal = EmpleadosAttritionFinal.assign(**{f'C{i}': EmpleadosAttritionPCA[:, i]})


# Paso 18: Guardar el set de datos final


In [19]:
EmpleadosAttritionFinal.to_csv('EmpleadosAttritionFinal.csv', index=False)
