<a href="https://colab.research.google.com/github/Snargol/projet-IA-CESI/blob/main/Projet_IA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import os
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import tarfile
from six.moves import urllib

# Base URL the CSV file names are appended to when downloading (see fetch_data).
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/Snargol/projet-IA-CESI/main/"
# Local directory (relative to the working directory) that receives the downloaded CSV files.
DATA_PATH = os.path.join("datasets", "data")

In [11]:
def fetch_data(data_url=DOWNLOAD_ROOT, data_path=DATA_PATH):
    """Download the three project CSV files into ``data_path``.

    Args:
        data_url: Base URL; each file name is appended to it as-is, so it
            must end with a ``/``.
        data_path: Local directory receiving the files. It is created
            (including parents) when it does not exist yet.

    Returns:
        None. The files are written to disk, overwriting any previous copy.
    """
    # exist_ok=True replaces the isdir()-then-makedirs() check, which could
    # fail if the directory appeared between the two calls.
    os.makedirs(data_path, exist_ok=True)

    # Same download order as before: employee survey, manager survey, general data.
    for file_name in (
        "employee_survey_data_full.csv",
        "manager_survey_data_full.csv",
        "general_data_full.csv",
    ):
        urllib.request.urlretrieve(
            data_url + file_name,
            os.path.join(data_path, file_name),
        )

# Download the CSV files with the default URL and target directory (requires network access).
fetch_data()

In [13]:
def load_final_data():
    """Load the three data sets and join them on ``EmployeeID``.

    The general data is merged with the employee survey first, then the
    manager survey is merged onto that intermediate result.

    Returns:
        The result of the two successive ``pd.merge`` calls.
    """
    # Load everything up front, in the same order as the merges consume it.
    general, employee_survey, manager_survey = [
        load_data(dataset_name)
        for dataset_name in (
            "general_data_full",
            "employee_survey_data_full",
            "manager_survey_data_full",
        )
    ]
    with_employee_survey = pd.merge(general, employee_survey, on='EmployeeID')
    return pd.merge(with_employee_survey, manager_survey, on='EmployeeID')

# Columns inspected for missing values by check_contains_nan and delete_nan_row.
columns_to_fill = ['NumCompaniesWorked','TotalWorkingYears','EnvironmentSatisfaction','JobSatisfaction','WorkLifeBalance',]
# Column groups by kind of categorical variable.
# NOTE(review): not used in this part of the notebook — presumably consumed by a later encoding step; confirm.
binary_columns = ['Attrition','Gender']
nominal_columns = ['Department','EducationField','JobRole','MaritalStatus']
# Each entry names an ordinal column ("label") and lists its categories ("order").
# NOTE(review): the list looks ordered from least to most travel — confirm against the encoder that reads it.
ordinal_columns =  [
    {
        "label":'BusinessTravel',
        "order":['Non-Travel','Travel_Rarely','Travel_Frequently']
    }
]


# Merged data set (general data + employee survey + manager survey).
data = load_final_data()

In [14]:
# Drop the columns whose standard deviation is 0 (a single value shared by every row of the column)
def del_std_of_0(_data):
    """Return ``_data`` without its zero-standard-deviation columns.

    Only the columns summarised by ``DataFrame.describe()`` with a ``std``
    row are candidates; all other columns are always kept. The names of the
    removed columns are printed.

    Args:
        _data: DataFrame to filter. It is not modified.

    Returns:
        A new DataFrame without the constant columns, or ``_data`` itself
        when there is nothing to remove.
    """
    constant_columns = []
    # describe() raises ValueError on a frame that has no columns at all.
    if len(_data.columns) > 0:
        # Computed once: dropping a column does not change the std of the
        # remaining ones, so there is no need to re-describe in the loop.
        summary = _data.describe()
        # There is no 'std' row when only non-numeric columns are summarised.
        if 'std' in summary.index:
            # Iterate over (label, value) pairs so that non-string column
            # labels work too (no str() conversion of the label).
            constant_columns = [
                label for label, std in summary.loc['std'].items() if std == 0
            ]

    if constant_columns:
        _data = _data.drop(columns=constant_columns)

    # Each name stays wrapped in its own list to keep the printed report
    # format unchanged.
    _deleted_columns = [[label] for label in constant_columns]
    print("Deleted Columns : ")
    print(_deleted_columns)
    return _data
  
# Remove the zero-standard-deviation columns; a copy is passed in and `data` is rebound to the result.
data = del_std_of_0(data.copy())

def check_contains_nan(_data):
    """Count the missing values found in the ``columns_to_fill`` columns.

    Args:
        _data: DataFrame containing every column listed in the module-level
            ``columns_to_fill``.

    Returns:
        The total number of NaN cells over those columns (0 when the list
        is empty).
    """
    return sum(_data[column].isna().sum() for column in columns_to_fill)

# Count the missing values in the columns_to_fill columns.
# NOTE(review): the returned count is neither stored nor printed here.
check_contains_nan(data)

def fill_data(_data_to_process):
    """Replace missing values with the median of their column.

    Args:
        _data_to_process: DataFrame to impute. Presumably numeric columns
            only, as the median strategy is meant for numeric data.

    Returns:
        A new DataFrame holding the imputed values, with the same column
        labels and the same index as ``_data_to_process``.
    """
    imputer = SimpleImputer(strategy="median")

    # Labels are taken from the argument itself, not from the global
    # `data_num`, so the function works for any frame passed in.
    _data_to_process_index = _data_to_process.index
    _data_to_process_labels = _data_to_process.columns

    # NOTE(review): SimpleImputer can drop columns that are entirely NaN, which
    # would make the column count below mismatch — confirm none exist.
    imputer.fit(_data_to_process)
    data_filled_temp = imputer.transform(_data_to_process)

    # Restoring the original index keeps the result row-aligned with the
    # source frame when it is later combined with other columns.
    data_filled = pd.DataFrame(
        data_filled_temp,
        columns=_data_to_process_labels,
        index=_data_to_process_index,
    )
    return data_filled


def delete_nan_row(_data_to_process):
    """Remove, in place, the rows with a NaN in any ``columns_to_fill`` column.

    Args:
        _data_to_process: DataFrame containing every column listed in the
            module-level ``columns_to_fill``. It is modified in place.

    Returns:
        The same DataFrame object, after the rows were removed.
    """
    # A single dropna() replaces one drop() per column. It also selects rows by
    # their NaN mask rather than by index label, so a complete row that merely
    # shares a duplicated index label with an incomplete row is kept.
    _data_to_process.dropna(subset=columns_to_fill, inplace=True)
    return _data_to_process

# Keep only the numeric columns, then fill their missing values through fill_data (median imputation).
data_num = data.select_dtypes(include=[np.number])
data_num = fill_data(data_num)

Deleted Columns : 
[['EmployeeCount'], ['StandardHours']]


In [25]:
# Put the imputed numeric columns and the object-dtype (categorical) columns back side by side.
# NOTE(review): pd.concat(axis=1) aligns rows on the index, so data_num and data_cat
# must share the same index for the rows to match — confirm.
data_cat = data.select_dtypes(object)
dataset = pd.concat([data_num, data_cat], axis=1)

# Preview the first rows of the combined data set.
dataset.head()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,...,JobInvolvement,PerformanceRating,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18
0,51.0,6.0,2.0,1.0,1.0,131160.0,1.0,11.0,0.0,1.0,...,3.0,3.0,No,Travel_Rarely,Sales,Life Sciences,Female,Healthcare Representative,Married,Y
1,31.0,10.0,1.0,2.0,1.0,41890.0,0.0,23.0,1.0,6.0,...,2.0,4.0,Yes,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Single,Y
2,32.0,17.0,4.0,3.0,4.0,193280.0,1.0,15.0,3.0,5.0,...,3.0,3.0,No,Travel_Frequently,Research & Development,Other,Male,Sales Executive,Married,Y
3,38.0,2.0,5.0,4.0,3.0,83210.0,3.0,11.0,3.0,13.0,...,2.0,3.0,No,Non-Travel,Research & Development,Life Sciences,Male,Human Resources,Married,Y
4,32.0,10.0,1.0,5.0,1.0,23420.0,4.0,12.0,2.0,9.0,...,3.0,3.0,No,Travel_Rarely,Research & Development,Medical,Male,Sales Executive,Single,Y
