In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer


In [2]:

# Read the employee workbook into a DataFrame and preview its first rows.
file_path = "employes_dataset.xlsx"
df = pd.read_excel(io=file_path)
df.head(n=5)


Unnamed: 0,ID,Nom,Prénom,Âge,Sexe,Email,Pays,Ville,Salaire (€),Date d'embauche,Département,Télétravail (%),Performance (Note)
0,1,Richard,Stacey,62,Homme,lgonzalez@crawford.org,Trinidad and Tobago,New Devinview,54564.0,2023-09-30,Finance,0.0,2.0
1,2,Chang,Loretta,65,Homme,rebeccabrown@hotmail.com,Netherlands,Lake Rebecca,23393.0,2023-10-14,Informatique,0.0,2.0
2,3,Fisher,Kimberly,18,Homme,waynebarnes@schultz-sims.org,Oman,Alexanderfort,62711.0,2017-08-24,Finance,60.0,1.0
3,4,Green,Julie,21,Homme,morgandevon@burgess.com,Moldova,Alexanderville,52274.0,2022-06-09,RH,100.0,1.0
4,5,Dixon,Jacob,21,Homme,elliskaren@gmail.com,Dominican Republic,East Darren,108305.0,2018-08-02,Informatique,0.0,3.0


In [None]:

# Discard every row that has a missing value in any column other than
# "Télétravail (%)"; rows whose only gap is in that column are kept.
required_cols = df.columns[df.columns != "Télétravail (%)"].tolist()
df_clean = df.dropna(subset=required_cols)


In [4]:

# Partition the cleaned rows by whether the target column is filled in.
# Each part is an independent copy so columns can be added to it later.
target_known = df_clean["Télétravail (%)"].notna()
df_with_target = df_clean.loc[target_known].copy()
df_missing_target = df_clean.loc[~target_known].copy()


In [5]:

# Columns left out of the feature matrix: the ID, name and e-mail fields,
# the hire date, and the target itself.
drop_cols = ["ID", "Nom", "Prénom", "Email", "Date d'embauche", "Télétravail (%)"]
y_with = df_with_target["Télétravail (%)"]
X_with = df_with_target.drop(drop_cols, axis=1)
X_missing = df_missing_target.drop(drop_cols, axis=1)

# One-hot encode both subsets in a single pass so they end up with exactly the
# same dummy columns, then cut the result back into the two row groups by
# position (labelled rows first, unlabelled rows after).
n_known = len(X_with)
X_all = pd.concat([X_with, X_missing])
X_all_encoded = pd.get_dummies(X_all)
X_with_encoded = X_all_encoded.iloc[:n_known]
X_missing_encoded = X_all_encoded.iloc[n_known:]


In [6]:

# Hold out 20% of the labelled rows for evaluation. The split is done BEFORE
# any preprocessing is fitted so the held-out rows cannot influence it
# (fitting the imputer on all labelled rows first would leak test-fold
# statistics into training).
X_train, X_test, y_train, y_test = train_test_split(X_with_encoded, y_with, test_size=0.2, random_state=42)

# Learn the per-column means from the training rows only, then apply that same
# fitted imputer to every other matrix.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# These two names are read again by the prediction cell, so they are rebound
# to their imputed versions exactly as before.
X_with_encoded = imputer.transform(X_with_encoded)
X_missing_encoded = imputer.transform(X_missing_encoded)

# Ordinary least-squares fit on the training fold.
model = LinearRegression()
model.fit(X_train, y_train)

# Mean absolute error on the held-out fold, in the target's own units.
y_pred_test = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_test)
print(f"MAE: {mae}")


MAE: 30.311269937223702


In [7]:

# Run the fitted model over both feature matrices: the rows whose target is
# missing, and the rows whose target is known.
y_pred_missing = model.predict(X_missing_encoded)
y_pred_known = model.predict(X_with_encoded)

# Store the estimates under the same new column name in each frame.
df_missing_target["Télétravail (%) prédit"] = y_pred_missing
df_with_target["Télétravail (%) prédit"] = y_pred_known

# Preview the rows that had no target value.
df_missing_target.head(n=5)


Unnamed: 0,ID,Nom,Prénom,Âge,Sexe,Email,Pays,Ville,Salaire (€),Date d'embauche,Département,Télétravail (%),Performance (Note),Télétravail (%) prédit
7,8,Turner,Stephanie,37,Homme,uphillips@cohen.com,Palau,West Tiffany,25213.0,2024-06-07,Logistique,,4.0,13.796113
25,26,Garcia,Veronica,31,Homme,dunlaphector@yahoo.com,Korea,Douglasport,55990.0,2016-02-07,Logistique,,2.0,65.215745
55,56,Cantu,Christian,68,Femme,qthomas@herrera.com,Finland,Port Walterfort,72078.0,2019-08-11,Informatique,,5.0,79.452942
102,103,Sanchez,Leroy,39,Homme,justinpalmer@yahoo.com,Dominica,Zacharyport,49231.0,2022-07-08,RH,,2.0,17.608872
121,122,Myers,Julie,60,Femme,timothyhuang@guerrero.com,Tonga,East Shawn,72617.0,2018-04-15,Production,,3.0,71.30427
