# Importation des bibliothèques

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer

# Importation des datasets 

In [2]:
t_arr = pd.read_csv("in_time.csv")
t_dép = pd.read_csv("out_time.csv")
emp_survey = pd.read_csv("employee_survey_data.csv")
gen_data = pd.read_csv("general_data.csv")
man_survey = pd.read_csv("manager_survey_data.csv")      

In [3]:
missing_rates = t_arr.isnull().mean()
print(missing_rates)

Unnamed: 0    0.000000
2015-01-01    1.000000
2015-01-02    0.047392
2015-01-05    0.046712
2015-01-06    0.051701
                ...   
2015-12-25    1.000000
2015-12-28    0.053061
2015-12-29    0.052154
2015-12-30    0.060091
2015-12-31    0.048299
Length: 262, dtype: float64


In [4]:
missing_rates = t_dép.isnull().mean()
print(missing_rates)

Unnamed: 0    0.000000
2015-01-01    1.000000
2015-01-02    0.047392
2015-01-05    0.046712
2015-01-06    0.051701
                ...   
2015-12-25    1.000000
2015-12-28    0.053061
2015-12-29    0.052154
2015-12-30    0.060091
2015-12-31    0.048299
Length: 262, dtype: float64


# Nettoyage des données

## Taux de valeurs manquantes 

In [5]:
# Fonction pour convertir datetime.time en secondes depuis minuit
def time_to_minutes(time_obj):
    """Convertit un objet datetime.time en secondes depuis minuit."""
    if time_obj is not pd.NaT:  # Vérifie si la valeur n'est pas NaN
        return time_obj.hour * 60 + time_obj.minute * 60 + time_obj.second/60
    return None

## Nettoyage du dataset des temps d'arrivée

In [6]:
threshold = 0.9
column_to_drop = missing_rates[missing_rates>= threshold].index
t_arr_cleaned = t_arr.drop(columns=column_to_drop)
t_arr_cleaned = t_arr_cleaned.drop(columns="Unnamed: 0")
t_arr_cleaned.shape

(4410, 249)

In [7]:
# conversion en date time 
t_arr_cleaned.columns = pd.to_datetime(t_arr_cleaned.columns)
t_arr_converted = t_arr_cleaned.apply(pd.to_datetime)
t_arr_converted = t_arr_converted.apply(lambda col: col.dt.time)

# Appliquer la conversion en secondes sur votre DataFrame
t_arr_seconds = t_arr_converted.applymap(lambda x: time_to_minutes(x))
# Calculer la médiane des secondes pour chaque colonne
medians = t_arr_seconds.median()
# Remplacer les valeurs manquantes par la médiane en secondes
t_arr_seconds_filled = t_arr_seconds.apply(lambda col: col.fillna(medians[col.name]))
# Afficher le DataFrame final avec les valeurs manquantes remplies
t_arr_seconds_filled


  t_arr_seconds = t_arr_converted.applymap(lambda x: time_to_minutes(x))


Unnamed: 0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-15,2015-01-16,...,2015-12-17,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,3120.750000,1080.800000,3780.433333,2580.516667,3600.150000,1140.416667,3060.883333,1380.100000,660.400000,1740.133333,...,2340.950000,2280.800000,3840.483333,840.100000,1440.450000,1260.583333,1380.683333,780.600000,3780.200000,1320.733333
1,1500.733333,1860.083333,2400.383333,3240.283333,1140.066667,3120.433333,600.116667,3180.483333,2760.950000,3960.300000,...,1440.133333,2820.283333,3480.033333,2580.850000,1320.166667,2340.616667,2400.750000,3840.816667,2520.416667,2160.333333
2,1620.683333,3540.833333,1440.216667,3360.450000,780.666667,900.816667,780.783333,1860.433333,3840.183333,900.600000,...,3720.283333,1500.233333,1200.466667,3180.733333,1500.900000,1020.433333,3060.083333,3120.600000,2580.083333,2280.650000
3,900.100000,3900.533333,1260.116667,2760.500000,720.133333,1080.200000,1380.700000,3720.366667,600.833333,4020.100000,...,3780.600000,1620.633333,4020.350000,840.416667,1260.766667,3120.250000,3660.733333,2520.266667,1680.200000,660.250000
4,2280.283333,3480.966667,3240.466667,3480.616667,1740.733333,600.833333,2340.450000,4080.533333,960.200000,780.833333,...,3300.583333,4020.583333,780.683333,1200.500000,1380.600000,3180.400000,900.250000,2400.883333,1620.350000,3000.150000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,1740.533333,1620.883333,2160.850000,960.966667,3240.100000,3480.400000,2760.166667,2040.033333,2280.283333,3480.616667,...,900.366667,660.100000,2100.416667,1560.183333,840.666667,3240.666667,1500.650000,1200.150000,2220.316667,600.200000
4406,780.683333,2400.475000,3180.000000,3060.166667,600.950000,3180.066667,1020.533333,900.183333,1680.183333,3840.083333,...,3240.816667,2160.533333,3000.400000,3540.500000,2520.350000,3360.683333,3780.383333,1380.533333,1860.150000,1140.800000
4407,660.016667,2520.000000,3480.283333,2280.200000,3360.633333,660.050000,3480.200000,3360.166667,1080.516667,1680.666667,...,660.600000,600.950000,3600.116667,720.166667,4020.483333,3900.083333,4080.400000,2280.516667,720.600000,780.500000
4408,1620.083333,720.450000,1320.833333,1320.516667,3060.950000,2340.616667,600.633333,3420.050000,780.283333,840.250000,...,3600.666667,3780.550000,660.133333,1200.316667,3060.500000,3900.083333,3840.416667,3780.700000,1500.733333,3900.783333


## Nettoyage du dataset des temps de départ

In [8]:
t_dép_cleaned = t_dép.drop(columns= column_to_drop)
t_dép_cleaned = t_dép_cleaned.drop(columns="Unnamed: 0")
t_dép_cleaned.shape

(4410, 249)

In [9]:
# conversion en date time 
t_dép_cleaned.columns = pd.to_datetime(t_dép_cleaned.columns)
t_dép_converted = t_dép_cleaned.apply(pd.to_datetime)
t_dép_converted = t_dép_converted.apply(lambda col: col.dt.time)

# Appliquer la conversion en secondes sur votre DataFrame
t_dép_seconds = t_dép_converted.applymap(lambda x: time_to_minutes(x))
# Calculer la médiane des secondes pour chaque colonne
medians = t_dép_seconds.median()
# Remplacer les valeurs manquantes par la médiane en secondes
t_dép_seconds_filled = t_dép_seconds.apply(lambda col: col.fillna(medians[col.name]))
# Afficher le DataFrame final avec les valeurs manquantes remplies
t_dép_seconds_filled

  t_dép_seconds = t_dép_converted.applymap(lambda x: time_to_minutes(x))


Unnamed: 0,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-15,2015-01-16,...,2015-12-17,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,4320.250000,2220.183333,2160.083333,3000.916667,1500.533333,3300.483333,4440.650000,1200.966667,2340.216667,3120.183333,...,2760.566667,2760.700000,1920.833333,2640.850000,3600.733333,3840.366667,1080.116667,2340.500000,3420.933333,2040.550000
1,2400.283333,3900.366667,2880.333333,1560.100000,3060.066667,4080.483333,3180.800000,1080.216667,1860.733333,3420.950000,...,1620.833333,2940.466667,3060.266667,2040.583333,3300.300000,2760.458333,1500.633333,4260.766667,2940.583333,3420.966667
2,4500.233333,1380.766667,3240.533333,2940.350000,2460.366667,4380.500000,2700.900000,2280.416667,2280.483333,2100.216667,...,1380.383333,1140.383333,2220.283333,2880.833333,4500.716667,4440.416667,3540.516667,1560.933333,1380.416667,1920.833333
3,2520.400000,1860.050000,1440.700000,2880.666667,4140.183333,2160.783333,1800.616667,1680.750000,4140.433333,4080.566667,...,2280.950000,4320.383333,3900.150000,2460.000000,3180.583333,3840.350000,2160.566667,4440.266667,3420.183333,1560.233333
4,2940.616667,3960.250000,2580.416667,3240.983333,4560.466667,3660.133333,4140.350000,1920.966667,2340.800000,2760.050000,...,1380.783333,4140.800000,3600.583333,1500.950000,1080.816667,4560.366667,3660.983333,3900.000000,1920.550000,3540.233333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,2640.616667,1620.333333,4080.816667,4500.666667,4500.516667,1440.250000,4500.800000,1680.583333,4020.616667,4080.066667,...,3180.650000,1440.083333,3180.100000,3060.733333,3480.933333,2340.483333,3720.583333,1980.633333,2520.933333,2880.683333
4406,2100.016667,2820.533333,1320.616667,2400.833333,1680.550000,2460.933333,1560.700000,2280.716667,2100.000000,2700.633333,...,3540.833333,2280.033333,2760.233333,3600.983333,3240.983333,3720.250000,2940.566667,3780.033333,1140.283333,2040.650000
4407,2040.583333,1500.116667,2640.766667,2700.366667,1320.416667,1140.950000,3120.750000,1920.866667,1980.883333,1080.616667,...,3540.716667,3900.083333,3600.083333,3840.383333,3600.616667,2220.200000,3600.466667,2820.583333,3900.233333,1560.916667
4408,4020.616667,3360.666667,1200.133333,3240.983333,4380.216667,2820.383333,2220.283333,2580.033333,3060.350000,2580.600000,...,1500.966667,4260.733333,2400.583333,3060.666667,4500.000000,3360.950000,4620.600000,4380.433333,3360.366667,3120.750000


In [10]:
t_durée = t_dép_seconds_filled - t_arr_seconds_filled
t_durée_mean= t_durée.mean(axis=1)
t_durée_mean

0       326.958434
1       369.942303
2       525.524264
3       353.988788
4       345.297523
           ...    
4405    594.468474
4406    318.061379
4407    409.854451
4408    703.031493
4409    537.044980
Length: 4410, dtype: float64

## Nettoyage du dataset des enquêtes sur les employés

In [11]:
emp_survey = emp_survey.fillna(emp_survey.median())

## Concatenation du dataset résultant « t_durée » avec gen_data et les data-set surveys

In [14]:
result = pd.concat([t_durée, gen_data, man_survey, emp_survey], axis=1)
print(result.head())  # Affichage des premières lignes pour vérifier

   2015-01-02 00:00:00  2015-01-05 00:00:00  2015-01-06 00:00:00  \
0          1199.500000          1139.383333         -1620.350000   
1           899.550000          2040.283333           479.950000   
2          2879.550000         -2160.066667          1800.316667   
3          1620.300000         -2040.483333           180.583333   
4           660.333333           479.283333          -660.050000   

   2015-01-07 00:00:00  2015-01-08 00:00:00  2015-01-09 00:00:00  \
0           420.400000         -2099.616667          2160.066667   
1         -1680.183333          1920.000000           960.050000   
2          -420.100000          1679.700000          3479.683333   
3           120.166667          3420.050000          1080.583333   
4          -239.633333          2819.733333          3059.300000   

   2015-01-12 00:00:00  2015-01-13 00:00:00  2015-01-15 00:00:00  \
0          1379.766667          -179.133333          1679.816667   
1          2580.683333         -2100.266667   