In [33]:
# Paso 1: Importar las librerías requeridas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Paso 2: Leer el archivo CSV y colocar los datos en un frame de Pandas
EmpleadosAttrition = pd.read_csv('empleadosRETO.csv')

# Muestra las primeras filas del frame para verificar la lectura correcta de los datos
EmpleadosAttrition.head()


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,1,997,4,Male,...,22,4,3,80,32,1,2,4,1,No
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,1,178,2,Male,...,20,4,4,80,7,0,3,2,0,No
2,21,Travel_Rarely,Sales,7 km,1,Marketing,1,1780,2,Male,...,13,3,2,80,1,3,3,0,1,Yes
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,1,1118,2,Male,...,19,3,4,80,18,4,3,6,4,No
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,1,582,2,Male,...,12,3,4,80,15,2,4,6,7,Yes


In [34]:
# Elimina las columnas mencionadas
columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
EmpleadosAttrition = EmpleadosAttrition.drop(columns=columns_to_drop, axis=1)

# Muestra las primeras filas del dataframe después de eliminar las columnas
EmpleadosAttrition.head()


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,4,Male,3,4,...,No,22,4,3,32,1,2,4,1,No
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,2,Male,3,2,...,No,20,4,4,7,0,3,2,0,No
2,21,Travel_Rarely,Sales,7 km,1,Marketing,2,Male,3,1,...,No,13,3,2,1,3,3,0,1,Yes
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,2,Male,3,3,...,No,19,3,4,18,4,3,6,4,No
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,2,Male,3,3,...,Yes,12,3,4,15,2,4,6,7,Yes


In [35]:
# Paso 4a: Crear la columna 'Year' y obtener el año de contratación
EmpleadosAttrition['HiringDate'] = pd.to_datetime(EmpleadosAttrition['HiringDate'], format='%m/%d/%Y', errors='coerce')
EmpleadosAttrition = EmpleadosAttrition.dropna(subset=['HiringDate'])  # Elimina filas con fechas nulas
EmpleadosAttrition['Year'] = EmpleadosAttrition['HiringDate'].dt.year.astype(int)

# Paso 4b: Crear la columna 'YearsAtCompany' con los años que el empleado lleva en la compañía hasta el año 2018
EmpleadosAttrition['YearsAtCompany'] = 2018 - EmpleadosAttrition['Year']

# Muestra las primeras filas del dataframe después de agregar las nuevas columnas
EmpleadosAttrition.head()


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,...,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition,Year,YearsAtCompany
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,4,Male,3,4,...,4,3,32,1,2,4,1,No,2013,5
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,2,Male,3,2,...,4,4,7,0,3,2,0,No,2015,3
2,21,Travel_Rarely,Sales,7 km,1,Marketing,2,Male,3,1,...,3,2,1,3,3,0,1,Yes,2017,1
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,2,Male,3,3,...,3,4,18,4,3,6,4,No,2010,8
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,2,Male,3,3,...,3,4,15,2,4,6,7,Yes,2011,7


In [36]:
# Paso 5a: Renombrar la variable DistanceFromHome a DistanceFromHome_km
EmpleadosAttrition.rename(columns={'DistanceFromHome': 'DistanceFromHome_km'}, inplace=True)

# Paso 5b: Convertir los datos de DistanceFromHome_km a entero
EmpleadosAttrition['DistanceFromHome_km'] = EmpleadosAttrition['DistanceFromHome_km'].str.extract('(\d+)').astype(int)

# Muestra las primeras filas del dataframe después de realizar las modificaciones
EmpleadosAttrition.head()



Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome_km,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,...,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition,Year,YearsAtCompany
0,50,Travel_Rarely,Research & Development,1,2,Medical,4,Male,3,4,...,4,3,32,1,2,4,1,No,2013,5
1,36,Travel_Rarely,Research & Development,6,2,Medical,2,Male,3,2,...,4,4,7,0,3,2,0,No,2015,3
2,21,Travel_Rarely,Sales,7,1,Marketing,2,Male,3,1,...,3,2,1,3,3,0,1,Yes,2017,1
3,52,Travel_Rarely,Research & Development,7,4,Life Sciences,2,Male,3,3,...,3,4,18,4,3,6,4,No,2010,8
4,33,Travel_Rarely,Research & Development,15,1,Medical,2,Male,3,3,...,3,4,15,2,4,6,7,Yes,2011,7


In [37]:
# Paso 6: Borrar las columnas innecesarias
columnas_a_borrar = ['Year', 'HiringDate', 'DistanceFromHome_km']
EmpleadosAttrition = EmpleadosAttrition.drop(columns=columnas_a_borrar, axis=1)

# Muestra las primeras filas del dataframe después de borrar las columnas
EmpleadosAttrition.head()


Unnamed: 0,Age,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition,YearsAtCompany
0,50,Travel_Rarely,Research & Development,2,Medical,4,Male,3,4,Research Director,...,22,4,3,32,1,2,4,1,No,5
1,36,Travel_Rarely,Research & Development,2,Medical,2,Male,3,2,Manufacturing Director,...,20,4,4,7,0,3,2,0,No,3
2,21,Travel_Rarely,Sales,1,Marketing,2,Male,3,1,Sales Representative,...,13,3,2,1,3,3,0,1,Yes,1
3,52,Travel_Rarely,Research & Development,4,Life Sciences,2,Male,3,3,Healthcare Representative,...,19,3,4,18,4,3,6,4,No,8
4,33,Travel_Rarely,Research & Development,1,Medical,2,Male,3,3,Manager,...,12,3,4,15,2,4,6,7,Yes,7


In [38]:
# Paso 7: Calcular el MonthlyIncome promedio por departamento
SueldoPromedioDepto = EmpleadosAttrition.groupby('Department')['MonthlyIncome'].mean().reset_index()
SueldoPromedioDepto.columns = ['Department', 'SueldoPromedio']

# Muestra el DataFrame SueldoPromedioDepto
print(SueldoPromedioDepto)


               Department  SueldoPromedio
0         Human Resources     6239.888889
1  Research & Development     6804.149813
2                   Sales     7192.609756


In [39]:
# Paso 8: Escalar la variable MonthlyIncome entre 0 y 1
min_value = EmpleadosAttrition['MonthlyIncome'].min()
max_value = EmpleadosAttrition['MonthlyIncome'].max()

EmpleadosAttrition['MonthlyIncome_scaled'] = (EmpleadosAttrition['MonthlyIncome'] - min_value) / (max_value - min_value)

# Muestra las primeras filas del dataframe después de escalar MonthlyIncome
EmpleadosAttrition.head()


Unnamed: 0,Age,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition,YearsAtCompany,MonthlyIncome_scaled
0,50,Travel_Rarely,Research & Development,2,Medical,4,Male,3,4,Research Director,...,4,3,32,1,2,4,1,No,5,0.864269
1,36,Travel_Rarely,Research & Development,2,Medical,2,Male,3,2,Manufacturing Director,...,4,4,7,0,3,2,0,No,3,0.20734
2,21,Travel_Rarely,Sales,1,Marketing,2,Male,3,1,Sales Representative,...,3,2,1,3,3,0,1,Yes,1,0.088062
3,52,Travel_Rarely,Research & Development,4,Life Sciences,2,Male,3,3,Healthcare Representative,...,3,4,18,4,3,6,4,No,8,0.497574
4,33,Travel_Rarely,Research & Development,1,Medical,2,Male,3,3,Manager,...,3,4,15,2,4,6,7,Yes,7,0.66447


In [40]:
# Paso 9: Convertir variables categóricas a numéricas (one-hot encoding)
categorical_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Attrition']

EmpleadosAttrition_encoded = pd.get_dummies(EmpleadosAttrition, columns=categorical_columns, drop_first=True)

# Muestra las primeras filas del dataframe después de la codificación
EmpleadosAttrition_encoded.head()


Unnamed: 0,Age,Education,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,Attrition_Yes
0,50,2,4,3,4,4,17399,9,No,22,...,False,False,False,True,False,False,False,False,False,False
1,36,2,2,3,2,2,4941,6,No,20,...,False,False,True,False,False,False,False,False,False,False
2,21,1,2,3,1,2,2679,1,No,13,...,False,False,False,False,False,False,True,False,True,True
3,52,4,2,3,3,2,10445,7,No,19,...,False,False,False,False,False,False,False,False,True,False
4,33,1,2,3,3,3,13610,7,Yes,12,...,False,True,False,False,False,False,False,True,False,True


In [48]:
# Convertir 'Attrition_Yes' a valores numéricos (1 para 'Yes', 0 para 'No')
EmpleadosAttrition_encoded['Attrition_Yes'] = EmpleadosAttrition_encoded['Attrition_Yes'].astype(int)

# Muestra las primeras filas del dataframe después de la corrección
EmpleadosAttrition_encoded.head()




Unnamed: 0,Age,Education,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,Attrition_Yes
0,50,2,4,3,4,4,17399,9,No,22,...,False,False,False,True,False,False,False,False,False,0
1,36,2,2,3,2,2,4941,6,No,20,...,False,False,True,False,False,False,False,False,False,0
2,21,1,2,3,1,2,2679,1,No,13,...,False,False,False,False,False,False,True,False,True,1
3,52,4,2,3,3,2,10445,7,No,19,...,False,False,False,False,False,False,False,False,True,0
4,33,1,2,3,3,3,13610,7,Yes,12,...,False,True,False,False,False,False,False,True,False,1


In [50]:
# Filtrar solo las columnas numéricas
numerical_columns = EmpleadosAttrition_encoded.select_dtypes(include=np.number).columns
correlation_matrix = EmpleadosAttrition_encoded[numerical_columns].corr()

# Calcular la correlación con respecto a Attrition_Yes
correlation_with_attrition = correlation_matrix['Attrition_Yes'].sort_values(ascending=False)

# Muestra la correlación de cada variable con respecto a Attrition_Yes
print(correlation_with_attrition)


Attrition_Yes               1.000000
PerformanceRating          -0.007010
NumCompaniesWorked         -0.008590
WorkLifeBalance            -0.023063
RelationshipSatisfaction   -0.032777
Education                  -0.054478
PercentSalaryHike          -0.061714
YearsSinceLastPromotion    -0.069835
TrainingTimesLastYear      -0.071592
EnvironmentSatisfaction    -0.125091
JobSatisfaction            -0.163964
JobInvolvement             -0.168047
YearsAtCompany             -0.176287
MonthlyIncome              -0.195044
MonthlyIncome_scaled       -0.195044
YearsInCurrentRole         -0.204453
Age                        -0.212459
TotalWorkingYears          -0.214043
JobLevel                   -0.214486
Name: Attrition_Yes, dtype: float64


In [54]:
# Seleccionar variables con correlación mayor o igual a 0.1
correlation_threshold = 0.1
selected_variables = correlation_with_attrition[abs(correlation_with_attrition) >= correlation_threshold].index

# Incluir la variable de salida 'Attrition_Yes'
selected_variables = selected_variables.union(['Attrition_Yes'])

# Crear el nuevo dataframe EmpleadosAttritionFinal
EmpleadosAttritionFinal = EmpleadosAttrition_encoded[selected_variables]

# Muestra las primeras filas del nuevo dataframe EmpleadosAttritionFinal
print(EmpleadosAttritionFinal.head())



   Age  Attrition_Yes  EnvironmentSatisfaction  JobInvolvement  JobLevel  \
0   50              0                        4               3         4   
1   36              0                        2               3         2   
2   21              1                        2               3         1   
3   52              0                        2               3         3   
4   33              1                        2               3         3   

   JobSatisfaction  MonthlyIncome  MonthlyIncome_scaled  TotalWorkingYears  \
0                4          17399              0.864269                 32   
1                2           4941              0.207340                  7   
2                2           2679              0.088062                  1   
3                2          10445              0.497574                 18   
4                3          13610              0.664470                 15   

   YearsAtCompany  YearsInCurrentRole  
0               5                 

In [55]:
from sklearn.decomposition import PCA

# Seleccionar las variables para el PCA (excluyendo la variable de salida 'Attrition_Yes')
variables_pca = EmpleadosAttritionFinal.drop(columns=['Attrition_Yes'])

# Inicializar el objeto PCA
pca = PCA()

# Ajustar el modelo PCA y transformar las variables seleccionadas
EmpleadosAttritionPCA_array = pca.fit_transform(variables_pca)

# Crear un nuevo dataframe con los componentes principales y la variable de salida
EmpleadosAttritionPCA = pd.DataFrame(data=EmpleadosAttritionPCA_array, columns=[f'Component_{i+1}' for i in range(EmpleadosAttritionPCA_array.shape[1])])
EmpleadosAttritionPCA['Attrition_Yes'] = EmpleadosAttritionFinal['Attrition_Yes']

# Muestra las primeras filas del dataframe EmpleadosAttritionPCA
EmpleadosAttritionPCA.head()


Unnamed: 0,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9,Component_10,Attrition_Yes
0,10487.845639,5.858857,-6.599735,8.618449,-1.88186,-0.188792,-2.075386,-0.463291,-0.525134,5.304359e-16,0.0
1,-1970.181427,-1.976138,-4.123017,-0.616593,0.199624,0.067823,0.940728,-0.290841,0.324514,-1.8744050000000002e-17,0.0
2,-4232.199477,-15.389148,-3.256094,2.014507,0.965858,0.03356,0.889047,-0.346968,-0.102261,-7.645348e-17,1.0
3,3533.841619,10.654538,-2.898597,-2.291162,-1.004101,0.159146,0.970993,-0.257282,0.023142,-4.486976e-16,0.0
4,6698.818587,-11.789553,-2.76706,1.177211,-1.775469,-0.580739,0.237701,-0.429366,-0.546616,1.327815e-16,1.0


In [57]:
from sklearn.decomposition import PCA

# Crear el objeto PCA
pca = PCA()

# Ajustar el modelo PCA a los datos de EmpleadosAttritionFinal
pca.fit(EmpleadosAttritionFinal.drop('Attrition_Yes', axis=1))

# Calcular el número de componentes necesarios para explicar el 80% de la varianza
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
num_components_for_80_variance = np.argmax(cumulative_variance_ratio >= 0.8) + 1

# Imprimir el número de componentes necesarios
print(f"Número de componentes para explicar el 80% de la varianza: {num_components_for_80_variance}")

# Ajustar el modelo PCA con el número de componentes necesario
pca_final = PCA(n_components=num_components_for_80_variance)
EmpleadosAttritionPCA_final = pca_final.fit_transform(EmpleadosAttritionFinal.drop('Attrition_Yes', axis=1))

# Agregar los componentes principales al frame original
for i in range(num_components_for_80_variance):
    EmpleadosAttritionFinal = EmpleadosAttritionFinal.assign(**{f'C{i}': EmpleadosAttritionPCA_final[:, i] for i in range(num_components_for_80_variance)})



Número de componentes para explicar el 80% de la varianza: 1


In [59]:
# Reorganizar las columnas para que "Attrition_Yes" esté al final
column_order = [col for col in EmpleadosAttritionFinal.columns if col != 'Attrition_Yes'] + ['Attrition_Yes']
EmpleadosAttritionFinal = EmpleadosAttritionFinal[column_order]

# Guardar el DataFrame en un archivo CSV
EmpleadosAttritionFinal.to_csv('EmpleadosAttritionFinal.csv', index=False)
