## Datos

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [16]:
# Cargar el archivo CSV
df = pd.read_csv('/content/drive/MyDrive/Practico Coink/Data Proyec Coink/info_satisfaccion_trabajo.csv')

In [17]:
# Mostrar datos
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,NumCompaniesWorked,OverTime,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,1,2,Female,...,8,Yes,3,1,8,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,2,3,Male,...,1,No,4,4,10,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,4,Male,...,6,Yes,3,2,7,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,5,4,Female,...,1,Yes,3,3,8,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,7,1,Male,...,9,No,3,4,6,3,2,2,2,2


In [18]:
# Lista de columnas para verificar valores nulos
columns_to_check = ['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
                    'EducationField', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'NumCompaniesWorked',
                    'OverTime', 'PerformanceRating', 'RelationshipSatisfaction', 'TotalWorkingYears', 'WorkLifeBalance',
                    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Verificar si hay valores nulos en las columnas especificadas
null_check = df[columns_to_check].isnull().any()

# Mostrar las columnas con valores nulos y su cantidad
print("Columnas con valores nulos:")
print(null_check[null_check].index)
print("\nCantidad de valores nulos por columna:")
print(df[columns_to_check].isnull().sum())


Columnas con valores nulos:
Index([], dtype='object')

Cantidad de valores nulos por columna:
Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
NumCompaniesWorked          0
OverTime                    0
PerformanceRating           0
RelationshipSatisfaction    0
TotalWorkingYears           0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64


In [25]:
# Informacion de DF
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   int64 
 2   BusinessTravel            1470 non-null   int64 
 3   Department                1470 non-null   int64 
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   int64 
 7   EmployeeNumber            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   int64 
 10  JobInvolvement            1470 non-null   int64 
 11  JobLevel                  1470 non-null   int64 
 12  JobRole                   1470 non-null   int64 
 13  JobSatisfaction           1470 non-null   int64 
 14  MaritalStatus           

## modificar tipo de datos y columnas

In [20]:
# Eliminar la columna 'YearsSinceLastPromotion'
df = df.drop('YearsSinceLastPromotion', axis=1)

In [21]:
#Informacion de JobSatisfaction
df["JobSatisfaction"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1470 entries, 0 to 1469
Series name: JobSatisfaction
Non-Null Count  Dtype
--------------  -----
1470 non-null   int64
dtypes: int64(1)
memory usage: 11.6 KB


In [22]:
# Cantidad de datos por grupo
df["JobSatisfaction"].value_counts()

JobSatisfaction
4    459
3    442
1    289
2    280
Name: count, dtype: int64

In [23]:
# Convertir 'Yes' a 1 y 'No' a 0 en la columna 'OverTime'
df['OverTime'] = df['OverTime'].map({'Yes': 1, 'No': 0})

# Convertir 'BusinessTravel' a tipo numérico
df['BusinessTravel'] = df['BusinessTravel'].map({
    'Travel_Rarely': 1,
    'Travel_Frequently': 2,
    'Non-Travel': 0
})



# Convertir 'Department' a tipo numérico
df['Department'] = df['Department'].map({
    'Sales': 1,
    'Research & Development': 2,
    'Human Resources': 0
})

# Convertir 'Attrition' a tipo numérico
df['Attrition'] = df['Attrition'].map({
    'Yes': 1,
    'No': 0
})

# Aplicar mapeo a la columna
education_mapping = {
    'Life Sciences': 0,
    'Medical': 1,
    'Marketing': 2,
    'Technical Degree': 3,
    'Other': 4,
    'Human Resources': 5
}

# Aplicar el mapeo a la columna 'EducationField'
df['EducationField'] = df['EducationField'].map(education_mapping)


marital_mapping = {'Single': 0, 'Married': 1, 'Divorced': 2}

# Aplicar el mapeo a la columna 'MaritalStatus'
df['MaritalStatus'] = df['MaritalStatus'].map(marital_mapping)


# Convertir 'EmployeeNumber' a tipo objeto
df['EmployeeNumber'] = df['EmployeeNumber'].astype(str)

# Convertir a tipo num
gender_mapping = {
    'Male': 0,
    'Female': 1
}

# Aplicar el mapeo a la columna 'Gender'
df['Gender'] = df['Gender'].map(gender_mapping)

# Aplicar mapeo a la columna
jobrole_mapping = {
    'Sales Executive': 0,
    'Research Scientist': 1,
    'Laboratory Technician': 2,
    'Manufacturing Director': 3,
    'Healthcare Representative': 4,
    'Manager': 5,
    'Sales Representative': 6,
    'Research Director': 7,
    'Human Resources': 8
}

# Aplicar el mapeo a la columna 'JobRole'
df['JobRole'] = df['JobRole'].map(jobrole_mapping)



# Modelo

## Regresion Logistica

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import RandomOverSampler


In [26]:
df= df[["Education", "EnvironmentSatisfaction","JobInvolvement", "RelationshipSatisfaction","WorkLifeBalance", "PerformanceRating", "JobSatisfaction" ]]


### clasificacion para predecir la satisfaccion laboral

In [29]:

# Dividir los datos en conjuntos de entrenamiento y prueba
X = df.drop('JobSatisfaction', axis=1)
y = df['JobSatisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo (Regresión Logística)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluar el modelo
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)


Accuracy: 0.2755102040816326
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        66
           2       0.00      0.00      0.00        45
           3       0.28      0.37      0.32        94
           4       0.28      0.52      0.36        89

    accuracy                           0.28       294
   macro avg       0.14      0.22      0.17       294
weighted avg       0.17      0.28      0.21       294



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### optimizar resultado

In [31]:
# Balanceo de clases con RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)

# Ajuste de hiperparámetros con GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_resampled, y_resampled)

# Mejor modelo encontrado por GridSearchCV
best_model = grid_search.best_estimator_

# Validación cruzada con cross_val_score
cv_scores = cross_val_score(best_model, X_resampled, y_resampled, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {np.mean(cv_scores)}')

# Evaluar el modelo mejorado
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)



Cross-Validation Scores: [0.24324324 0.31756757 0.23986486 0.2972973  0.31756757]
Mean CV Accuracy: 0.2831081081081081
Accuracy: 0.22448979591836735
              precision    recall  f1-score   support

           1       0.19      0.33      0.24        66
           2       0.14      0.11      0.12        45
           3       0.28      0.26      0.27        94
           4       0.26      0.17      0.21        89

    accuracy                           0.22       294
   macro avg       0.22      0.22      0.21       294
weighted avg       0.23      0.22      0.22       294



In [32]:
#Entrenar modelo
best_model.fit(X_train, y_train)

In [34]:
#Prediccion del modelo
y_pred = best_model.predict(X_test)


#reporte de clasificacion
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print(classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)


Accuracy: 0.2755102040816326
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        66
           2       0.00      0.00      0.00        45
           3       0.28      0.37      0.32        94
           4       0.28      0.52      0.36        89

    accuracy                           0.28       294
   macro avg       0.14      0.22      0.17       294
weighted avg       0.17      0.28      0.21       294



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
