In [235]:
import os
import pandas as pd
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def load_data(file):
    """Read a CSV into a pandas DataFrame.

    Parameters
    ----------
    file
        Path or buffer, passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``read_csv`` defaults.
    """
    frame = pd.read_csv(file)
    return frame

# Directory (relative to the notebook) that holds every input CSV.
path = "datas/"

# Load the five raw tables; each file name is appended to `path`.
general_data = load_data(f"{path}general_data.csv")
employee_survey_data = load_data(f"{path}employee_survey_data.csv")
manager_survey_data = load_data(f"{path}manager_survey_data.csv")
in_time = load_data(f"{path}in_time.csv")
out_time = load_data(f"{path}out_time.csv")

# Column names, non-null counts and dtypes of the main table.
general_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [161]:
def to_datetime(strdate):
    """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a POSIX timestamp.

    Despite the name, the result is a number of seconds, not a ``datetime``.

    Parameters
    ----------
    strdate
        Value to convert. Only objects whose type is exactly ``str`` are
        parsed.

    Returns
    -------
    float or int
        ``datetime.timestamp()`` of the parsed string, or ``0`` when
        ``strdate`` is not a ``str`` (for example a NaN cell).

    Raises
    ------
    ValueError
        If ``strdate`` is a string that does not match the expected format.
    """
    # Guard clause: anything that is not a plain string maps to the 0 sentinel.
    if type(strdate) != str:
        return 0
    parsed = datetime.strptime(strdate, "%Y-%m-%d %H:%M:%S")
    return parsed.timestamp()

def get_working_time(in_time_data, out_time_data):
    """Compute each employee's mean daily working time in hours.

    Both inputs are expected to share the same layout: the first column is
    skipped (it is not a day column) and every remaining column holds one
    day's punch time as a ``"%Y-%m-%d %H:%M:%S"`` string. ``in_time_data``
    must also contain an ``"EmployeeID"`` column.

    A day only contributes to an employee's mean when BOTH the arrival and
    the departure could be parsed. Missing or malformed punches are treated
    as missing data instead of being replaced by epoch 0, so absences and
    holidays no longer count as 0-hour working days and a one-sided missing
    punch no longer produces a nonsensical duration.

    Parameters
    ----------
    in_time_data : pandas.DataFrame
        Arrival times. It is NOT modified.
    out_time_data : pandas.DataFrame
        Departure times, aligned with ``in_time_data`` on index and columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``"EmployeeID"`` and
        ``"MeanWorkingHours"`` (NaN when an employee has no usable day),
        indexed like ``in_time_data``.
    """
    punch_format = "%Y-%m-%d %H:%M:%S"

    def _parse_days(frame):
        # Vectorised, column-wise parsing; errors="coerce" turns NaN cells and
        # anything that does not match the format into NaT.
        return frame.iloc[:, 1:].apply(
            lambda day: pd.to_datetime(day, format=punch_format, errors="coerce")
        )

    arrivals = _parse_days(in_time_data)
    departures = _parse_days(out_time_data)

    # Only days present in both tables can yield a duration.
    shared_days = arrivals.columns.intersection(departures.columns)

    # Naive datetimes are subtracted directly, so the result does not depend
    # on the machine's local timezone.
    worked = departures[shared_days] - arrivals[shared_days]
    worked_seconds = worked.apply(lambda day: day.dt.total_seconds())

    # mean() skips NaN, i.e. the days without a complete in/out pair.
    mean_hours = worked_seconds.mean(axis=1) / 3600

    # Build a fresh frame instead of adding a column to the caller's data.
    return in_time_data[["EmployeeID"]].assign(MeanWorkingHours=mean_hours)

In [162]:
class MergeDataFrame(BaseEstimator, TransformerMixin):
    """Transformer that enriches the main table with survey and time data.

    Parameters
    ----------
    employee_survey_data : pandas.DataFrame
        Table with an ``"EmployeeID"`` column, joined onto the input.
    manager_survey_data : pandas.DataFrame
        Table with an ``"EmployeeID"`` column, joined onto the input.
    in_time : pandas.DataFrame
        Arrival times, forwarded to ``get_working_time``.
    out_time : pandas.DataFrame
        Departure times, forwarded to ``get_working_time``.
    """

    def __init__(self, employee_survey_data, manager_survey_data, in_time, out_time):
        self.employee_survey_data = employee_survey_data
        self.manager_survey_data = manager_survey_data
        self.in_time = in_time
        self.out_time = out_time

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` merged with both surveys and the mean working hours.

        The surveys are joined on ``"EmployeeID"`` (rows of ``X`` are kept);
        the working-time table is merged with ``how="inner"``, so rows of
        ``X`` without a matching ``"EmployeeID"`` there are dropped.
        """
        X = X.join(self.employee_survey_data.set_index("EmployeeID"), on="EmployeeID")
        X = X.join(self.manager_survey_data.set_index("EmployeeID"), on="EmployeeID")

        # Use the frames given to the constructor rather than whatever
        # `in_time` / `out_time` globals happen to exist in the notebook.
        working_time = get_working_time(self.in_time, self.out_time)
        return pd.merge(X, working_time, on="EmployeeID", how="inner")

In [163]:
class DeleteUnEthicColumn(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed list of columns.

    Parameters
    ----------
    array : iterable of str
        Names of the columns to drop.
    """

    def __init__(self, array):
        self.array = array

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        ``X`` itself is left untouched, so the transformer can be applied to
        the same frame again (e.g. when a cell is re-run).

        Raises
        ------
        KeyError
            If one of the configured columns is not present in ``X``.
        """
        return X.drop(columns=list(self.array))


In [224]:
class OrgDataFrame(BaseEstimator, TransformerMixin):
    """Transformer that replaces two text columns with integer codes.

    * ``"Attrition"``: ``'No'`` -> -1, ``'Yes'`` -> 1
    * ``"BusinessTravel"``: ``'Non-Travel'`` -> 1, ``'Travel_Rarely'`` -> 2,
      ``'Travel_Frequently'`` -> 3

    Values outside these lists are left as they are.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the two columns recoded.

        ``assign`` builds a new DataFrame, so the caller's ``X`` (which may be
        a slice of a larger frame) is not written to. Column order is kept.

        Raises
        ------
        KeyError
            If ``"Attrition"`` or ``"BusinessTravel"`` is missing from ``X``.
        """
        return X.assign(
            Attrition=X["Attrition"].replace(to_replace=['No', 'Yes'], value=[-1, 1]),
            BusinessTravel=X["BusinessTravel"].replace(
                to_replace=['Non-Travel', 'Travel_Rarely', 'Travel_Frequently'],
                value=[1, 2, 3],
            ),
        )


In [219]:
# Disabled draft of a hand-written one-hot transformer. It is wrapped in a
# string literal, so no class is defined; the cell merely evaluates to (and
# displays) this text.
# NOTE(review): the draft would not run if re-enabled as is: the
# `np.hstack([` call is never closed, `fit_transform` is given the column
# names rather than the column data, and `transform` has no return statement.
'''class OneHotDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        one_hot_encoder = OneHotEncoder()
        object_attribs= X.select_dtypes(include=["object"]).keys()

        encoded_datas_array = one_hot_encoder.fit_transform(object_attribs).toarray()
        encoded_datas_labels = one_hot_encoder.categories_
        encoded_datas_labels = np.hstack([
        encoded_data = pd.DataFrame(encoded_datas_array, columns=encoded_datas_labels)
        #encoded_data["EmployeeID"] = datas["EmployeeID"]


        X = X.join(encoded_data)
        X.drop(['EducationField', 'Department', "JobRole"], axis=1, inplace=True, errors="ignore")'''




'class OneHotDataFrame(BaseEstimator, TransformerMixin):\n    def fit(self, X, y=None):\n        return self\n\n    def transform(self, X, y=None):\n        one_hot_encoder = OneHotEncoder()\n        object_attribs= X.select_dtypes(include=["object"]).keys()\n\n        encoded_datas_array = one_hot_encoder.fit_transform(object_attribs).toarray()\n        encoded_datas_labels = one_hot_encoder.categories_\n        encoded_datas_labels = np.hstack([\n        encoded_data = pd.DataFrame(encoded_datas_array, columns=encoded_datas_labels)\n        #encoded_data["EmployeeID"] = datas["EmployeeID"]\n\n\n        X = X.join(encoded_data)\n        X.drop([\'EducationField\', \'Department\', "JobRole"], axis=1, inplace=True, errors="ignore")'

In [220]:
class ScaleData(BaseEstimator, TransformerMixin):
    """Transformer that standardises every column of a DataFrame.

    Parameters
    ----------
    array
        Stored on the instance but not used by ``fit`` or ``transform``.
        NOTE(review): presumably meant as the list of columns to scale;
        confirm the intent before relying on it.

    Attributes
    ----------
    scaler_ : StandardScaler
        Scaler fitted in ``fit`` and reused by ``transform``.
    """

    def __init__(self, array):
        self.array = array

    def fit(self, X, y=None):
        """Learn the per-column scaling statistics from ``X``."""
        # StandardScaler.fit needs the data; fitting here (not in transform)
        # lets new data be scaled with the statistics learned at fit time.
        self.scaler_ = StandardScaler().fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` scaled with the fitted scaler.

        The result keeps the column names and the index of ``X``.
        """
        scaled = self.scaler_.transform(X)
        return pd.DataFrame(scaled, columns=X.columns, index=X.index)

In [236]:
def build_pipeline(X):
    """Prepare the HR table and return its scaled / one-hot encoded form.

    Steps:

    1. Merge ``X`` with the survey and working-time tables (re-read from
       ``path``) and drop the excluded columns.
    2. Standard-scale the numeric columns.
    3. Recode ``Attrition`` / ``BusinessTravel`` with ``OrgDataFrame`` and
       one-hot encode all object columns.

    A summary of the merged frame is printed and the pipeline diagram is
    displayed as side effects.

    Parameters
    ----------
    X : pandas.DataFrame
        Main table; must contain ``"EmployeeID"`` and the columns listed in
        the "delete" step.

    Returns
    -------
    Output of ``ColumnTransformer.fit_transform``: scaled numeric columns
    followed by the one-hot columns.
    """
    # Local imports, next to the one this function already had: the
    # notebook's import cell does not bring ColumnTransformer into scope, so
    # without this the function fails with NameError on a fresh kernel.
    from sklearn import set_config
    from sklearn.compose import ColumnTransformer

    prepa_pipeline = Pipeline([
        ("merge", MergeDataFrame(load_data(path + "employee_survey_data.csv"),
                                 load_data(path + "manager_survey_data.csv"),
                                 load_data(path + "in_time.csv"),
                                 load_data(path + "out_time.csv"))),
        ("delete", DeleteUnEthicColumn([
            "Age",
            "Over18",
            "Gender",
            "MaritalStatus",
            "EmployeeCount",
            "StandardHours",
            "EmployeeID",
        ])),
    ])

    data_pipeline = prepa_pipeline.fit_transform(X)
    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print() (that would add a stray "None" line).
    data_pipeline.info()

    object_attribs = list(data_pipeline.select_dtypes(include=["object"]).columns)
    num_attribs = list(data_pipeline.select_dtypes(include=[np.number]).columns)

    # NOTE(review): "Attrition" is an object column, so it is recoded and
    # one-hot encoded into the returned matrix together with the features.
    # Split the target off before using this output to train a model.
    onehot_pipeline = Pipeline(steps=[
        ('label', OrgDataFrame()),
        ('onehot', OneHotEncoder()),
    ])

    scale_pipeline = Pipeline(steps=[
        ('std_scaler', StandardScaler()),
    ])

    pipeline = ColumnTransformer(
        transformers=[
            ("numerisation", scale_pipeline, num_attribs),
            ("one", onehot_pipeline, object_attribs),
        ],
        remainder='drop',
        n_jobs=-1,
    )

    full_pipeline = Pipeline(steps=[
        ("pipeline", pipeline),
    ])

    set_config(display='diagram')
    display(full_pipeline)

    return full_pipeline.fit_transform(data_pipeline)

In [237]:
# Run the whole preparation on the main table; `data` receives the value
# returned by build_pipeline (the summary printed below comes from inside it).
data=build_pipeline(general_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Attrition                4410 non-null   object 
 1   BusinessTravel           4410 non-null   object 
 2   Department               4410 non-null   object 
 3   DistanceFromHome         4410 non-null   int64  
 4   Education                4410 non-null   int64  
 5   EducationField           4410 non-null   object 
 6   JobLevel                 4410 non-null   int64  
 7   JobRole                  4410 non-null   object 
 8   MonthlyIncome            4410 non-null   int64  
 9   NumCompaniesWorked       4391 non-null   float64
 10  PercentSalaryHike        4410 non-null   int64  
 11  StockOptionLevel         4410 non-null   int64  
 12  TotalWorkingYears        4401 non-null   float64
 13  TrainingTimesLastYear    4410 non-null   int64  
 14  YearsAtCompany          

In [234]:
# Display the prepared output of build_pipeline.
data

array([[-0.39393818, -0.89168825, -0.96148639, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.09963874, -1.86842575, -0.96148639, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.96339837,  1.06178675,  1.74961015, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.95055222, -0.89168825, -0.05778755, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.0867926 , -0.89168825, -0.96148639, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.32073492,  0.08504925, -0.05778755, ...,  0.        ,
         0.        ,  0.        ]])