## 2. Chargement et Préparation des Données

### 2.1 Importation des données


In [4]:
import csv
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [5]:
# Charger le dataset
data_employer = pd.read_csv('datasets/data_Projet/HFY_condensed_data.csv',delimiter=';')
df_in_time = pd.read_csv("datasets/data_Projet/in_time.csv")
df_out_time = pd.read_csv("datasets/data_Projet/out_time.csv")
# Afficher les premières lignes du dataset
data_employer.head()


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,1.0,6,1,0,0,3,3,3.0,4.0,2.0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,6.0,3,5,1,4,2,4,3.0,2.0,4.0
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,...,5.0,2,5,0,3,3,3,2.0,2.0,1.0
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,...,13.0,5,8,7,5,2,3,4.0,4.0,3.0
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,...,9.0,2,6,0,4,3,3,4.0,1.0,3.0


In [6]:
data_employer.head(-1) 

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,1.0,6,1,0,0,3,3,3.0,4.0,2.0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,6.0,3,5,1,4,2,4,3.0,2.0,4.0
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,...,5.0,2,5,0,3,3,3,2.0,2.0,1.0
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,...,13.0,5,8,7,5,2,3,4.0,4.0,3.0
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,...,9.0,2,6,0,4,3,3,4.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4404,29,No,Travel_Rarely,Sales,4,3,Other,Female,2,Human Resources,...,6.0,2,6,1,5,2,3,3.0,4.0,3.0
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,Female,1,Research Scientist,...,10.0,5,3,0,2,3,3,4.0,1.0,3.0
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,Male,1,Laboratory Technician,...,10.0,2,3,0,2,2,3,4.0,4.0,3.0
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,Male,2,Sales Executive,...,5.0,4,4,1,2,3,4,1.0,3.0,3.0


In [7]:
# Display basic information
data_employer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   Gender                   4410 non-null   object 
 8   JobLevel                 4410 non-null   int64  
 9   JobRole                  4410 non-null   object 
 10  MaritalStatus            4410 non-null   object 
 11  MonthlyIncome            4410 non-null   int64  
 12  NumCompaniesWorked       4391 non-null   float64
 13  PercentSalaryHike        4410 non-null   int64  
 14  TotalWorkingYears       

## Remplissage des valeurs manquantes

In [8]:
data_employer.isna().sum(axis = 0)

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
PercentSalaryHike           0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
JobInvolvement              0
PerformanceRating           0
EnvironmentSatisfaction    25
JobSatisfaction            20
WorkLifeBalance            38
dtype: int64

### Encodage des variables catégorielles

In [10]:
label_enc_cols = ["Attrition","BusinessTravel", "Department", "EducationField", "Gender", "JobRole",  "MaritalStatus"]
for col in label_enc_cols:
    data_employer[col] = LabelEncoder().fit_transform(data_employer[col])


In [11]:
data_employer.head()
data_employer

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance
0,51,0,2,2,6,2,1,0,1,0,...,1.0,6,1,0,0,3,3,3.0,4.0,2.0
1,31,1,1,1,10,1,1,0,1,6,...,6.0,3,5,1,4,2,4,3.0,2.0,4.0
2,32,0,1,1,17,4,4,1,4,7,...,5.0,2,5,0,3,3,3,2.0,2.0,1.0
3,38,0,0,1,2,5,1,1,3,1,...,13.0,5,8,7,5,2,3,4.0,4.0,3.0
4,32,0,2,1,10,1,3,1,1,7,...,9.0,2,6,0,4,3,3,4.0,1.0,3.0


## Préparation des Données
### Gestion des valeurs manquantes

In [12]:
# Imputer les valeurs manquantes pour les variables numériques avec la médiane
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

# Sélection des colonnes numériques

housing_num = data_employer.select_dtypes(include=[np.number])
imputer.fit(housing_num)


# Remplacement des valeurs manquantes
housing_num_imputed = pd.DataFrame(imputer.transform(housing_num), columns=housing_num.columns)

# Vérifier s'il reste des valeurs manquantes
housing_num_imputed.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
PercentSalaryHike          0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
JobInvolvement             0
PerformanceRating          0
EnvironmentSatisfaction    0
JobSatisfaction            0
WorkLifeBalance            0
dtype: int64

In [84]:
# Display summary statistics
data_employer.describe()
data_employer.to_csv("data_employer.csv")

In [14]:
for e in label_enc_cols:
    data_employer[e].value_counts()
#data_employer["Attrition"].value_counts()
#data_employer["BusinessTravel"].value_counts()
#data_employer["Department"].value_counts()
#data_employer["EducationField"].value_counts()
#data_employer["Gender"].value_counts()
data_employer["JobRole"].value_counts()
#data_employer["MaritalStatus"].value_counts()#SOLUTION

JobRole
7    978
6    876
2    777
4    435
0    393
3    306
8    249
5    240
1    156
Name: count, dtype: int64

In [15]:
df_in_time.head(-1)

Unnamed: 0,EmployeeID,2015-01-01,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,...,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,1,,2015-01-02 09:43:45,2015-01-05 10:08:48,2015-01-06 09:54:26,2015-01-07 09:34:31,2015-01-08 09:51:09,2015-01-09 10:09:25,2015-01-12 09:42:53,2015-01-13 10:13:06,...,,2015-12-21 09:55:29,2015-12-22 10:04:06,2015-12-23 10:14:27,2015-12-24 10:11:35,,2015-12-28 10:13:41,2015-12-29 10:03:36,2015-12-30 09:54:12,2015-12-31 10:12:44
1,2,,2015-01-02 10:15:44,2015-01-05 10:21:05,,2015-01-07 09:45:17,2015-01-08 10:09:04,2015-01-09 09:43:26,2015-01-12 10:00:07,2015-01-13 10:43:29,...,2015-12-18 10:37:17,2015-12-21 09:49:02,2015-12-22 10:33:51,2015-12-23 10:12:10,,,2015-12-28 09:31:45,2015-12-29 09:55:49,2015-12-30 10:32:25,2015-12-31 09:27:20
2,3,,2015-01-02 10:17:41,2015-01-05 09:50:50,2015-01-06 10:14:13,2015-01-07 09:47:27,2015-01-08 10:03:40,2015-01-09 10:05:49,2015-01-12 10:03:47,2015-01-13 10:21:26,...,2015-12-18 10:15:14,2015-12-21 10:10:28,2015-12-22 09:44:44,2015-12-23 10:15:54,2015-12-24 10:07:26,,2015-12-28 09:42:05,2015-12-29 09:43:36,2015-12-30 09:34:05,2015-12-31 10:28:39
3,4,,2015-01-02 10:05:06,2015-01-05 09:56:32,2015-01-06 10:11:07,2015-01-07 09:37:30,2015-01-08 10:02:08,2015-01-09 10:08:12,2015-01-12 10:13:42,2015-01-13 09:53:22,...,2015-12-18 10:17:38,2015-12-21 09:58:21,2015-12-22 10:04:25,2015-12-23 10:11:46,2015-12-24 09:43:15,,2015-12-28 09:52:44,2015-12-29 09:33:16,2015-12-30 10:18:12,2015-12-31 10:01:15
4,5,,2015-01-02 10:28:17,2015-01-05 09:49:58,2015-01-06 09:45:28,2015-01-07 09:49:37,2015-01-08 10:19:44,2015-01-09 10:00:50,2015-01-12 10:29:27,2015-01-13 09:59:32,...,2015-12-18 09:58:35,2015-12-21 10:03:41,2015-12-22 10:10:30,2015-12-23 10:13:36,2015-12-24 09:44:24,,2015-12-28 10:05:15,2015-12-29 10:30:53,2015-12-30 09:18:21,2015-12-31 09:41:09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4404,4405,,2015-01-02 10:18:28,2015-01-05 09:42:42,2015-01-06 10:08:36,2015-01-07 09:43:44,2015-01-08 10:01:57,2015-01-09 10:09:09,2015-01-12 09:52:17,2015-01-13 09:47:41,...,2015-12-18 09:48:29,2015-12-21 10:12:51,2015-12-22 10:15:09,2015-12-23 10:05:50,2015-12-24 10:30:00,,2015-12-28 10:07:07,2015-12-29 09:57:23,2015-12-30 09:23:37,2015-12-31 09:50:05
4405,4406,,2015-01-02 09:20:32,2015-01-05 10:17:53,2015-01-06 10:26:51,2015-01-07 10:06:58,2015-01-08 09:45:06,2015-01-09 09:49:24,2015-01-12 09:37:10,2015-01-13 09:25:02,...,2015-12-18 10:01:06,2015-12-21 10:25:25,2015-12-22 10:16:11,2015-12-23 10:04:40,2015-12-24 09:45:40,,2015-12-28 10:15:39,2015-12-29 10:10:09,2015-12-30 09:28:19,2015-12-31 10:00:12
4406,4407,,2015-01-02 10:03:41,,2015-01-06 09:44:00,2015-01-07 09:42:10,2015-01-08 10:00:57,2015-01-09 09:44:04,2015-01-12 10:07:32,2015-01-13 10:05:11,...,2015-12-18 09:27:32,2015-12-21 09:41:24,2015-12-22 09:50:30,2015-12-23 10:32:21,2015-12-24 09:47:41,,2015-12-28 09:54:23,2015-12-29 10:13:32,2015-12-30 10:21:09,2015-12-31 10:09:48
4407,4408,,2015-01-02 10:01:01,2015-01-05 09:33:00,2015-01-06 09:49:17,2015-01-07 10:28:12,2015-01-08 09:47:38,2015-01-09 10:01:03,2015-01-12 09:49:12,2015-01-13 09:47:10,...,2015-12-18 10:00:57,2015-12-21 09:51:07,2015-12-22 10:02:10,2015-12-23 09:58:29,2015-12-24 09:56:05,,2015-12-28 09:59:24,,2015-12-30 10:02:36,2015-12-31 10:03:30


In [17]:
df_in_time.describe()

Unnamed: 0,EmployeeID,2015-01-01,2015-01-14,2015-01-26,2015-03-05,2015-05-01,2015-07-17,2015-09-17,2015-10-02,2015-11-09,2015-11-10,2015-11-11,2015-12-25
count,4410.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,2205.5,,,,,,,,,,,,
std,1273.201673,,,,,,,,,,,,
min,1.0,,,,,,,,,,,,
25%,1103.25,,,,,,,,,,,,
50%,2205.5,,,,,,,,,,,,
75%,3307.75,,,,,,,,,,,,
max,4410.0,,,,,,,,,,,,


In [18]:
df_in_time.info

<bound method DataFrame.info of       EmployeeID  2015-01-01           2015-01-02           2015-01-05  \
0              1         NaN  2015-01-02 09:43:45  2015-01-05 10:08:48   
1              2         NaN  2015-01-02 10:15:44  2015-01-05 10:21:05   
2              3         NaN  2015-01-02 10:17:41  2015-01-05 09:50:50   
3              4         NaN  2015-01-02 10:05:06  2015-01-05 09:56:32   
4              5         NaN  2015-01-02 10:28:17  2015-01-05 09:49:58   
...          ...         ...                  ...                  ...   
4405        4406         NaN  2015-01-02 09:20:32  2015-01-05 10:17:53   
4406        4407         NaN  2015-01-02 10:03:41                  NaN   
4407        4408         NaN  2015-01-02 10:01:01  2015-01-05 09:33:00   
4408        4409         NaN  2015-01-02 10:17:05  2015-01-05 10:02:27   
4409        4410         NaN  2015-01-02 09:59:09  2015-01-05 10:16:14   

               2015-01-06           2015-01-07           2015-01-08  \
0     20

In [82]:
df_in_time = df_in_time.apply(pd.to_datetime, errors='coerce')
df_out_time = df_out_time.apply(pd.to_datetime, errors='coerce')
df_in_time_cleaned = df_in_time.dropna(axis=1, how='all')
df_in_time_cleaned.head()

Unnamed: 0_level_0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-15,2015-01-16,...,2015-12-17,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2015-01-02 09:43:45,2015-01-05 10:08:48,2015-01-06 09:54:26,2015-01-07 09:34:31,2015-01-08 09:51:09,2015-01-09 10:09:25,2015-01-12 09:42:53,2015-01-13 10:13:06,2015-01-15 10:01:24,2015-01-16 10:19:08,...,NaT,NaT,2015-12-21 09:55:29,2015-12-22 10:04:06,2015-12-23 10:14:27,2015-12-24 10:11:35,2015-12-28 10:13:41,2015-12-29 10:03:36,2015-12-30 09:54:12,2015-12-31 10:12:44
2,2015-01-02 10:15:44,2015-01-05 10:21:05,NaT,2015-01-07 09:45:17,2015-01-08 10:09:04,2015-01-09 09:43:26,2015-01-12 10:00:07,2015-01-13 10:43:29,2015-01-15 09:37:57,2015-01-16 09:57:18,...,2015-12-17 09:15:08,2015-12-18 10:37:17,2015-12-21 09:49:02,2015-12-22 10:33:51,2015-12-23 10:12:10,NaT,2015-12-28 09:31:45,2015-12-29 09:55:49,2015-12-30 10:32:25,2015-12-31 09:27:20
3,2015-01-02 10:17:41,2015-01-05 09:50:50,2015-01-06 10:14:13,2015-01-07 09:47:27,2015-01-08 10:03:40,2015-01-09 10:05:49,2015-01-12 10:03:47,2015-01-13 10:21:26,2015-01-15 09:55:11,2015-01-16 10:05:36,...,2015-12-17 09:53:17,2015-12-18 10:15:14,2015-12-21 10:10:28,2015-12-22 09:44:44,2015-12-23 10:15:54,2015-12-24 10:07:26,2015-12-28 09:42:05,2015-12-29 09:43:36,2015-12-30 09:34:05,2015-12-31 10:28:39
4,2015-01-02 10:05:06,2015-01-05 09:56:32,2015-01-06 10:11:07,2015-01-07 09:37:30,2015-01-08 10:02:08,2015-01-09 10:08:12,2015-01-12 10:13:42,2015-01-13 09:53:22,2015-01-15 10:00:50,2015-01-16 09:58:06,...,2015-12-17 09:54:36,2015-12-18 10:17:38,2015-12-21 09:58:21,2015-12-22 10:04:25,2015-12-23 10:11:46,2015-12-24 09:43:15,2015-12-28 09:52:44,2015-12-29 09:33:16,2015-12-30 10:18:12,2015-12-31 10:01:15
5,2015-01-02 10:28:17,2015-01-05 09:49:58,2015-01-06 09:45:28,2015-01-07 09:49:37,2015-01-08 10:19:44,2015-01-09 10:00:50,2015-01-12 10:29:27,2015-01-13 09:59:32,2015-01-15 10:06:12,2015-01-16 10:03:50,...,2015-12-17 09:46:35,2015-12-18 09:58:35,2015-12-21 10:03:41,2015-12-22 10:10:30,2015-12-23 10:13:36,2015-12-24 09:44:24,2015-12-28 10:05:15,2015-12-29 10:30:53,2015-12-30 09:18:21,2015-12-31 09:41:09


In [16]:
df_in_time_cleaned.describe()

NameError: name 'df_in_time_cleaned' is not defined

In [24]:
# Convertir en datetime (pour s'assurer que les valeurs sont bien reconnues comme des dates)
df_in_time = df_in_time.apply(pd.to_datetime, errors='coerce')
df_out_time = df_out_time.apply(pd.to_datetime, errors='coerce')

# Supprimer les jours où tout le monde est absent
df_in_time_cleaned = df_in_time.dropna(axis=1, how='all')
df_out_time_cleaned = df_out_time.dropna(axis=1, how='all')

# Vérifier combien de jours restent
print(f"Nombre de jours restants après nettoyage : {df_in_time_cleaned.shape[1] - 1}")


Nombre de jours restants après nettoyage : 249


In [38]:
# Renommer la première colonne en "EmployeeID"
df_in_time.rename(columns={df_in_time.columns[0]: "EmployeeID"}, inplace=True)
# Vérifier à nouveau les colonnes
print(df_in_time.head())

                     EmployeeID 2015-01-01          2015-01-02  \
0 1970-01-01 00:00:00.000000001        NaT 2015-01-02 09:43:45   
1 1970-01-01 00:00:00.000000002        NaT 2015-01-02 10:15:44   
2 1970-01-01 00:00:00.000000003        NaT 2015-01-02 10:17:41   
3 1970-01-01 00:00:00.000000004        NaT 2015-01-02 10:05:06   
4 1970-01-01 00:00:00.000000005        NaT 2015-01-02 10:28:17   

           2015-01-05          2015-01-06          2015-01-07  \
0 2015-01-05 10:08:48 2015-01-06 09:54:26 2015-01-07 09:34:31   
1 2015-01-05 10:21:05                 NaT 2015-01-07 09:45:17   
2 2015-01-05 09:50:50 2015-01-06 10:14:13 2015-01-07 09:47:27   
3 2015-01-05 09:56:32 2015-01-06 10:11:07 2015-01-07 09:37:30   
4 2015-01-05 09:49:58 2015-01-06 09:45:28 2015-01-07 09:49:37   

           2015-01-08          2015-01-09          2015-01-12  \
0 2015-01-08 09:51:09 2015-01-09 10:09:25 2015-01-12 09:42:53   
1 2015-01-08 10:09:04 2015-01-09 09:43:26 2015-01-12 10:00:07   
2 2015-01-08 10:0

In [51]:
df_in_time.head()

Unnamed: 0_level_0,2015-01-01,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-14,...,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-28,2015-12-29,2015-12-30,2015-12-31
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,NaT,2015-01-02 09:43:45,2015-01-05 10:08:48,2015-01-06 09:54:26,2015-01-07 09:34:31,2015-01-08 09:51:09,2015-01-09 10:09:25,2015-01-12 09:42:53,2015-01-13 10:13:06,NaT,...,NaT,2015-12-21 09:55:29,2015-12-22 10:04:06,2015-12-23 10:14:27,2015-12-24 10:11:35,NaT,2015-12-28 10:13:41,2015-12-29 10:03:36,2015-12-30 09:54:12,2015-12-31 10:12:44
2,NaT,2015-01-02 10:15:44,2015-01-05 10:21:05,NaT,2015-01-07 09:45:17,2015-01-08 10:09:04,2015-01-09 09:43:26,2015-01-12 10:00:07,2015-01-13 10:43:29,NaT,...,2015-12-18 10:37:17,2015-12-21 09:49:02,2015-12-22 10:33:51,2015-12-23 10:12:10,NaT,NaT,2015-12-28 09:31:45,2015-12-29 09:55:49,2015-12-30 10:32:25,2015-12-31 09:27:20
3,NaT,2015-01-02 10:17:41,2015-01-05 09:50:50,2015-01-06 10:14:13,2015-01-07 09:47:27,2015-01-08 10:03:40,2015-01-09 10:05:49,2015-01-12 10:03:47,2015-01-13 10:21:26,NaT,...,2015-12-18 10:15:14,2015-12-21 10:10:28,2015-12-22 09:44:44,2015-12-23 10:15:54,2015-12-24 10:07:26,NaT,2015-12-28 09:42:05,2015-12-29 09:43:36,2015-12-30 09:34:05,2015-12-31 10:28:39
4,NaT,2015-01-02 10:05:06,2015-01-05 09:56:32,2015-01-06 10:11:07,2015-01-07 09:37:30,2015-01-08 10:02:08,2015-01-09 10:08:12,2015-01-12 10:13:42,2015-01-13 09:53:22,NaT,...,2015-12-18 10:17:38,2015-12-21 09:58:21,2015-12-22 10:04:25,2015-12-23 10:11:46,2015-12-24 09:43:15,NaT,2015-12-28 09:52:44,2015-12-29 09:33:16,2015-12-30 10:18:12,2015-12-31 10:01:15
5,NaT,2015-01-02 10:28:17,2015-01-05 09:49:58,2015-01-06 09:45:28,2015-01-07 09:49:37,2015-01-08 10:19:44,2015-01-09 10:00:50,2015-01-12 10:29:27,2015-01-13 09:59:32,NaT,...,2015-12-18 09:58:35,2015-12-21 10:03:41,2015-12-22 10:10:30,2015-12-23 10:13:36,2015-12-24 09:44:24,NaT,2015-12-28 10:05:15,2015-12-29 10:30:53,2015-12-30 09:18:21,2015-12-31 09:41:09


In [56]:
# Charger le fichier en mettant la première colonne comme index
df_in_time = pd.read_csv("datasets/data_Projet/in_time.csv", index_col=0)
df_out_time = pd.read_csv("datasets/data_Projet/out_time.csv", index_col=0)
# Récupérer l'heure d'arrivée de l'employé 1 le 1er janvier 2015
arrival_time = df_in_time.loc[3, "2015-01-02"]

print(f"L'employé 2 est arrivé à : {arrival_time}")
if pd.isna(arrival_time):
    print("L'employé 1 était absent ce jour-là.")


L'employé 2 est arrivé à : 2015-01-02 10:17:41


In [75]:
# Créer un DataFrame pour stocker les résultats
df_work_hours = pd.DataFrame(index=df_in_time.index, columns=df_in_time.columns)

# Boucler sur chaque date et chaque employé
for day in df_in_time.columns[1:]:  # On ignore la première colonne qui est "EmployeeID"
    for employee_id in df_in_time.index:
        
        arrival_time = df_in_time.loc[employee_id, day]
        departure_time = df_out_time.loc[employee_id, day]
        

        # Vérifier si les valeurs ne sont pas NaN
        if pd.notna(arrival_time) and pd.notna(departure_time):
            # Calculer la durée en heures
            
            work_duration = departure_time - arrival_time
            work_duration_str = str(work_duration).split()[-1]
            print(work_duration_str)
            work_duration = work_duration_str
        else: 
            work_duration = None  # Laisser vide si absent
        
        # Stocker le résultat
        df_work_hours.loc[employee_id, day] = work_duration

# Sauvegarder dans un fichier CSV
df_work_hours.to_csv("test.csv")

07:12:30
08:06:33
06:41:33
07:20:18
08:03:20
10:46:46
06:50:18
07:05:25
07:21:38
07:37:14
07:15:25
09:15:07
06:18:52
06:25:22
06:49:09
10:59:43
06:56:51
09:22:57
07:25:17
06:14:56
08:54:43
06:28:02
07:10:58
08:57:58
11:10:58
08:01:27
10:46:39
09:40:14
07:14:28
10:29:40
06:25:07
06:20:10
09:44:27
06:18:49
06:13:28
07:15:19
07:28:55
07:50:40
07:04:50
07:31:37
07:27:17
07:30:34
07:25:52
07:36:43
07:11:35
10:47:55
09:33:53
06:37:14
07:44:30
08:36:24
06:46:40
11:00:27
07:51:49
09:40:14
07:55:37
10:30:57
10:26:02
09:56:28
06:54:10
07:13:15
11:15:20
06:14:52
06:16:43
05:28:43
09:42:52
07:42:08
06:40:58
06:44:17
09:28:25
05:58:35
06:52:27
07:39:36
10:22:27
07:00:16
07:34:04
05:57:26
06:00:18
10:19:37
08:24:22
08:04:10
06:48:07
08:01:30
06:34:19
06:19:24
08:44:44
08:01:50
11:00:01
10:13:24
05:56:23
05:32:51
08:04:03
07:34:42
06:59:20
05:55:39
07:48:57
08:09:14
10:39:02
07:49:35
06:04:24
07:16:38
09:28:24
10:35:42
06:23:43
07:44:10
08:08:51
07:27:22
07:52:20
07:26:24
05:49:31
09:39:27
08:38:54
1

KeyboardInterrupt: 

In [50]:
db_time_HHMMSS = pd.read_csv("datasets/data_Projet/work_hours.csv")
db_time_HHMMSS= db_time_HHMMSS.dropna(axis=1, how='all')

for 

Unnamed: 0.1,Unnamed: 0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-15,...,2015-12-17,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,0,0 days 07:12:30,0 days 07:11:23,0 days 07:24:39,0 days 07:00:24,0 days 07:17:23,0 days 07:29:04,0 days 07:15:46,0 days 07:49:52,0 days 07:20:49,...,,,0 days 07:20:21,0 days 07:23:45,0 days 06:30:17,0 days 07:35:47,0 days 07:46:26,0 days 07:18:54,0 days 07:46:44,0 days 07:04:49
1,1,0 days 08:06:33,0 days 07:27:17,,0 days 07:23:49,0 days 07:25:00,0 days 07:09:03,0 days 07:36:41,0 days 07:16:44,0 days 07:36:47,...,0 days 07:55:42,0 days 07:54:11,0 days 07:45:14,0 days 07:42:44,0 days 07:26:08,,0 days 07:36:53,0 days 07:58:57,0 days 07:59:10,0 days 08:13:38
2,2,0 days 06:41:33,0 days 07:15:56,0 days 06:24:19,0 days 06:45:54,0 days 07:20:42,0 days 06:51:41,0 days 07:25:07,0 days 06:59:59,0 days 07:26:18,...,0 days 07:13:06,0 days 06:47:09,0 days 07:09:49,0 days 06:48:06,0 days 06:43:49,0 days 06:50:59,0 days 07:01:26,0 days 07:26:20,0 days 07:32:20,0 days 06:47:11
3,3,0 days 07:20:18,0 days 07:17:31,0 days 06:56:35,0 days 06:55:10,0 days 06:51:03,0 days 07:11:35,0 days 06:59:55,0 days 07:18:23,0 days 06:52:36,...,0 days 07:27:21,0 days 07:37:45,0 days 06:50:48,0 days 07:19:35,0 days 07:24:49,0 days 07:05:06,0 days 07:26:50,0 days 07:25:00,0 days 07:21:59,0 days 07:07:59
4,4,0 days 08:03:20,0 days 07:59:17,0 days 07:40:57,0 days 07:48:22,0 days 07:39:44,0 days 07:43:18,0 days 08:21:54,0 days 08:15:26,0 days 08:15:36,...,0 days 08:19:12,0 days 07:54:13,0 days 07:39:54,0 days 07:57:27,0 days 07:47:13,0 days 08:14:58,0 days 07:39:44,0 days 08:16:07,0 days 07:57:12,0 days 08:01:05


code génére par le gpt on vas voir ce qu'il fait mais on le supp après 

In [78]:
import pandas as pd



# Supprimer les jours où tout le monde est absent
jours_ouverts = df_in_time.columns[df_in_time.notna().sum() > 0]
df_in_time = df_in_time[jours_ouverts]
df_out_time = df_out_time[jours_ouverts]

# Créer un DataFrame pour stocker les heures travaillées
df_work_hours_zebi = pd.DataFrame(index=df_in_time.index, columns=df_in_time.columns)

# Calcul du temps de travail par jour
for day in jours_ouverts:
    for employee_id in df_in_time.index:
        arrival_time = df_in_time.loc[employee_id, day]
        departure_time = df_out_time.loc[employee_id, day]

        # Vérifier si les valeurs ne sont pas NaN
        if pd.notna(arrival_time) and pd.notna(departure_time):
            work_duration = (departure_time - arrival_time).total_seconds() / 3600  # Convertir en heures
        else:
            work_duration = None  # Absence

        # Stocker le temps de travail en heures
        df_work_hours.loc[employee_id, day] = work_duration

# Calcul de la moyenne des heures travaillées par employé sur les jours ouverts
average_work_hours_per_employee = df_work_hours.mean(axis=1, skipna=True)

# Sauvegarder dans un fichier CSV
average_work_hours_per_employee.to_csv("average_work_hours_per_employee.csv", header=["df_work_hours_zebi"])

print("Fichier 'average_work_hours_per_employee.csv' généré avec succès ! 🎯")


Fichier 'average_work_hours_per_employee.csv' généré avec succès ! 🎯


In [86]:
# Convertir les valeurs en datetime
df_in_time = df_in_time.apply(pd.to_datetime, errors='coerce')
df_out_time = df_out_time.apply(pd.to_datetime, errors='coerce')

# Supprimer les jours où tout le monde est absent
jours_ouverts = df_in_time.columns[df_in_time.notna().sum() > 0]
df_in_time = df_in_time[jours_ouverts]
df_out_time = df_out_time[jours_ouverts]

# Créer un DataFrame pour stocker les heures travaillées
df_average_work_hours_HHMM = pd.DataFrame(index=df_in_time.index, columns=df_in_time.columns)

# Calcul du temps de travail par jour
for day in jours_ouverts:
    for employee_id in df_in_time.index:
        arrival_time = df_in_time.loc[employee_id, day]
        departure_time = df_out_time.loc[employee_id, day]

        # Vérifier si les valeurs ne sont pas NaN
        if pd.notna(arrival_time) and pd.notna(departure_time):
            work_duration = (departure_time - arrival_time).total_seconds() / 3600  # Convertir en heures
        else:
            work_duration = None  # Absence

        # Stocker le temps de travail en heures
        df_work_hours.loc[employee_id, day] = work_duration

# Calcul de la moyenne des heures travaillées par employé
average_work_hours_per_employee = df_work_hours.mean(axis=1, skipna=True)

# Convertir la moyenne en format HH:MM
average_work_hours_formatted = average_work_hours_per_employee.apply(
    lambda x: f"{int(x):02d}:{int((x - int(x)) * 60):02d}" if pd.notna(x) else None
)

# Sauvegarder dans un fichier CSV
average_work_hours_formatted.to_csv("df_average_work_hours_HHMM.csv", header=["Average_Work_Hours_HHMM"])


In [85]:
df_work = pd.read_csv("datasets/data_Projet/work_hours_HHMM.csv")
df_work.head()

Unnamed: 0,EmployeeID,2015-01-01,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,...,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,1,,09:43 - 16:56 (7.21h),10:08 - 17:20 (7.19h),09:54 - 17:19 (7.41h),09:34 - 16:34 (7.01h),09:51 - 17:08 (7.29h),10:09 - 17:38 (7.48h),09:42 - 16:58 (7.26h),10:13 - 18:02 (7.83h),...,Absent,09:55 - 17:15 (7.34h),10:04 - 17:27 (7.40h),10:14 - 16:44 (6.50h),10:11 - 17:47 (7.60h),Absent,10:13 - 18:00 (7.77h),10:03 - 17:22 (7.32h),09:54 - 17:40 (7.78h),10:12 - 17:17 (7.08h)
1,2,,10:15 - 18:22 (8.11h),10:21 - 17:48 (7.45h),Absent,09:45 - 17:09 (7.40h),10:09 - 17:34 (7.42h),09:43 - 16:52 (7.15h),10:00 - 17:36 (7.61h),10:43 - 18:00 (7.28h),...,10:37 - 18:31 (7.90h),09:49 - 17:34 (7.75h),10:33 - 18:16 (7.71h),10:12 - 17:38 (7.44h),Absent,Absent,09:31 - 17:08 (7.61h),09:55 - 17:54 (7.98h),10:32 - 18:31 (7.99h),09:27 - 17:40 (8.23h)
2,3,,10:17 - 16:59 (6.69h),09:50 - 17:06 (7.27h),10:14 - 16:38 (6.41h),09:47 - 16:33 (6.76h),10:03 - 17:24 (7.34h),10:05 - 16:57 (6.86h),10:03 - 17:28 (7.42h),10:21 - 17:21 (7.00h),...,10:15 - 17:02 (6.79h),10:10 - 17:20 (7.16h),09:44 - 16:32 (6.80h),10:15 - 16:59 (6.73h),10:07 - 16:58 (6.85h),Absent,09:42 - 16:43 (7.02h),09:43 - 17:09 (7.44h),09:34 - 17:06 (7.54h),10:28 - 17:15 (6.79h)
3,4,,10:05 - 17:25 (7.34h),09:56 - 17:14 (7.29h),10:11 - 17:07 (6.94h),09:37 - 16:32 (6.92h),10:02 - 16:53 (6.85h),10:08 - 17:19 (7.19h),10:13 - 17:13 (7.00h),09:53 - 17:11 (7.31h),...,10:17 - 17:55 (7.63h),09:58 - 16:49 (6.85h),10:04 - 17:24 (7.33h),10:11 - 17:36 (7.41h),09:43 - 16:48 (7.08h),Absent,09:52 - 17:19 (7.45h),09:33 - 16:58 (7.42h),10:18 - 17:40 (7.37h),10:01 - 17:09 (7.13h)
4,5,,10:28 - 18:31 (8.06h),09:49 - 17:49 (7.99h),09:45 - 17:26 (7.68h),09:49 - 17:37 (7.81h),10:19 - 17:59 (7.66h),10:00 - 17:44 (7.72h),10:29 - 18:51 (8.37h),09:59 - 18:14 (8.26h),...,09:58 - 17:52 (7.90h),10:03 - 17:43 (7.67h),10:10 - 18:07 (7.96h),10:13 - 18:00 (7.79h),09:44 - 17:59 (8.25h),Absent,10:05 - 17:44 (7.66h),10:30 - 18:47 (8.27h),09:18 - 17:15 (7.95h),09:41 - 17:42 (8.02h)


In [81]:
df_average_work_hours_HHMM.head()

Unnamed: 0_level_0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-15,2015-01-16,...,2015-12-17,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


# IA entrainement 


In [None]:
data_employer = pd.read_csv('datasets/data_Projet/data_employer.csv',delimiter=';')
data_heure = pd.read_csv('datasets/data_Projet/.csv',delimiter=';')

In [None]:
df_merge = pd.merge(data_employer)
