In [1]:
# Load the raw HR employee dataset from CSV into a DataFrame.
import pandas as pd
import numpy as np

file_path = '../Data/HR_Employee.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [2]:
# Address class imbalance by oversampling with SMOTE.

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Features: every column except the target, with categorical columns
# one-hot encoded (drop_first=True omits the first level of each category).
X = pd.get_dummies(data.drop('Attrition', axis=1), drop_first=True)

# Target: Attrition labels mapped to integer codes.
le = LabelEncoder()
y = le.fit_transform(data['Attrition'])

# NOTE(review): resampling is applied to the full dataset before any
# train/test split, and X_resampled / y_resampled are not referenced by the
# later cells visible here - confirm where (or whether) they are used.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [3]:
# Display the SMOTE-resampled feature matrix.
X_resampled

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,False,False,False,False,False,True,False,False,True,True
1,49,279,8,1,1,2,3,61,2,2,...,False,False,False,False,True,False,False,True,False,False
2,37,1373,2,2,1,4,4,92,2,1,...,True,False,False,False,False,False,False,False,True,True
3,33,1392,3,4,1,5,4,56,3,1,...,False,False,False,False,True,False,False,True,False,True
4,27,591,2,1,1,7,1,40,3,1,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2461,31,759,10,3,1,1035,3,69,3,1,...,False,False,False,False,False,False,False,False,False,False
2462,40,1070,17,3,1,393,1,68,3,1,...,False,False,False,False,False,False,False,False,False,True
2463,30,608,17,3,1,398,1,66,1,1,...,False,False,False,False,False,False,False,True,False,True
2464,21,574,5,1,1,1630,2,35,2,1,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# Outlier handling for MonthlyIncome: compress the spread with log(1 + x)
# and store the result in a new column on `data`.
monthly_income = data['MonthlyIncome']
data['MonthlyIncome_Log'] = np.log1p(monthly_income)

In [5]:
# The log-transformed column replaces the raw income column, so drop the
# original. `data` itself is left untouched; the result is `data_cleaned`.
columns_to_drop = ['MonthlyIncome']
data_cleaned = data.drop(columns_to_drop, axis=1)

In [6]:
# Inspect the column labels after replacing MonthlyIncome with MonthlyIncome_Log.
data_cleaned.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'MonthlyIncome_Log'],
      dtype='object')

In [7]:
# Outlier handling for NumCompaniesWorked, TrainingTimesLastYear,
# YearsSinceLastPromotion and YearsWithCurrManager: keep only rows that fall
# inside the 1.5 * IQR fences of every listed column.
variables_to_process = ['NumCompaniesWorked', 'TrainingTimesLastYear', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

for feature in variables_to_process:
    # Fences are computed from the unfiltered `data`, so they do not depend
    # on the order in which the columns are processed.
    q1, q3 = data[feature].quantile(0.25), data[feature].quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # between() is inclusive on both ends.
    within_fences = data_cleaned[feature].between(lower_bound, upper_bound)
    data_cleaned = data_cleaned[within_fences]


In [8]:
# Inspect the column labels after the IQR row filtering.
data_cleaned.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'MonthlyIncome_Log'],
      dtype='object')

In [9]:
# Remove columns that are not wanted as model inputs.
data_cleaned = data_cleaned.drop(columns=['EmployeeNumber', 'EmployeeCount', 'StandardHours'])

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [11]:
# Separate column labels by dtype: numeric (int64 / float64) vs. categorical (object).
# Note: the object-typed target 'Attrition' is part of categorical_features here.
numerical_features = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data_cleaned.select_dtypes(include='object').columns

In [12]:
# Chi-square test of independence between each categorical feature and Attrition.
# The per-feature results are collected into a summary frame (previously each
# iteration overwrote the last and nothing was kept or displayed).
chi2_rows = []
for feature in categorical_features:
    # categorical_features includes the target itself; testing Attrition
    # against Attrition is meaningless, so skip it.
    if feature == 'Attrition':
        continue
    contingency_table = pd.crosstab(data_cleaned[feature], data_cleaned['Attrition'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_rows.append({'feature': feature, 'chi2': chi2, 'p_value': p, 'dof': dof})

# Explicit columns keep the frame well-formed even when no feature was tested.
chi2_summary = (
    pd.DataFrame(chi2_rows, columns=['feature', 'chi2', 'p_value', 'dof'])
    .sort_values('p_value')
    .reset_index(drop=True)
)
chi2_summary

In [13]:
# Feature engineering.
#
# assign() builds a new frame instead of writing columns into the filtered
# `data_cleaned` slice (avoids SettingWithCopyWarning), and clip(lower=0) is
# the vectorised equivalent of apply(lambda x: max(x, 0)).
data_cleaned = data_cleaned.assign(
    # YearsWithOtherCompanies: years worked at other companies
    #   = TotalWorkingYears - YearsAtCompany, negative values set to 0.
    YearsWithOtherCompanies=(
        data_cleaned['TotalWorkingYears'] - data_cleaned['YearsAtCompany']
    ).clip(lower=0),
    # AgeAtJoining: age when the employee joined the company
    #   = Age - YearsAtCompany, negative values set to 0.
    AgeAtJoining=(
        data_cleaned['Age'] - data_cleaned['YearsAtCompany']
    ).clip(lower=0),
    # IncomePerYearWorked: (log) income per year of experience
    #   = MonthlyIncome_Log / (TotalWorkingYears + 1); the +1 avoids division by zero.
    IncomePerYearWorked=(
        data_cleaned['MonthlyIncome_Log'] / (data_cleaned['TotalWorkingYears'] + 1)
    ),
)

In [14]:
# Outlier removal (1.5 * IQR fences) for the engineered features.
#
# The quartiles are recomputed from the already-filtered `data_cleaned` on
# every iteration, so each column's fences depend on the filters applied
# before it, and re-running this cell can remove additional rows.
new_features = ['YearsWithOtherCompanies', 'AgeAtJoining', 'IncomePerYearWorked']

for feature in new_features:
    q1, q3 = data_cleaned[feature].quantile(0.25), data_cleaned[feature].quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # between() is inclusive on both ends.
    within_fences = data_cleaned[feature].between(lower_bound, upper_bound)
    data_cleaned = data_cleaned[within_fences]


In [15]:
# Standardize the numeric (int64 / float64) columns of data_cleaned.
from sklearn.preprocessing import StandardScaler

# NOTE(review): this scaler is fit on every row, before the train/test split.
# A later cell fits a second StandardScaler on the training split only -
# confirm whether this earlier, full-data scaling is still needed.
numerical_features = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_cleaned[numerical_features])
data_cleaned[numerical_features] = scaled_values

In [16]:
# Feature selection: correlation filter for numeric columns,
# chi-square filter for categorical columns.

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder

# --- Numeric columns: find highly correlated pairs ---
numerical_features = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
data_numerical = data_cleaned[numerical_features]
corr_matrix = data_numerical.corr()

# Visit the strict lower triangle so each unordered pair is checked once;
# a pair is (later column, earlier column) when |r| > 0.9.
corr_columns = corr_matrix.columns
high_corr_pairs = [
    (corr_columns[i], corr_columns[j])
    for i in range(len(corr_columns))
    for j in range(i)
    if abs(corr_matrix.iloc[i, j]) > 0.9
]

print("Highly correlated pairs:")
print(high_corr_pairs)

# From every pair, mark the second member for removal (set removes duplicates).
columns_to_drop = list({pair[1] for pair in high_corr_pairs})

# --- Categorical columns: chi-square test against the encoded target ---
categorical_features = data_cleaned.select_dtypes(include=['object']).columns.drop('Attrition')

le = LabelEncoder()
y_encoded = le.fit_transform(data_cleaned['Attrition'])

chi2_insignificant_vars = []
for feature in categorical_features:
    contingency_table = pd.crosstab(data_cleaned[feature], y_encoded)
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    if not p > 0.05:
        continue  # significant at the 5% level: keep the feature
    chi2_insignificant_vars.append(feature)

print("Insignificant categorical variables based on chi-square test:")
print(chi2_insignificant_vars)

# Merge both removal lists, without duplicates.
columns_to_drop = list(set(columns_to_drop) | set(chi2_insignificant_vars))

# Final feature frame: drop the rejected columns and the target itself.
X_selected = data_cleaned.drop(columns=columns_to_drop + ['Attrition'])


Highly correlated pairs:
[('MonthlyIncome_Log', 'JobLevel')]
Insignificant categorical variables based on chi-square test:
['Department', 'EducationField', 'Gender', 'Over18']


In [23]:
# Inspect the selected feature columns.
# NOTE(review): this cell's execution count (23) is higher than that of the
# get_dummies cell further down (18), and the recorded output already lists
# one-hot columns - the notebook was run out of order; re-run top to bottom.
X_selected.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement',
       'JobSatisfaction', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'MonthlyIncome_Log',
       'YearsWithOtherCompanies', 'AgeAtJoining', 'IncomePerYearWorked',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_Yes'],
      dtype='object')

In [24]:
# Peek at the first five encoded target values.
y_encoded[:5]

array([0, 1, 0, 0, 0])

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [18]:
# One-hot encode the remaining categorical columns of the selected features;
# drop_first=True omits the first level of each category.
X_selected = pd.get_dummies(data=X_selected, drop_first=True)

In [19]:
# Train/test split (80% / 20%).
# The target is imbalanced (an earlier cell applies SMOTE for that reason), so
# the split is stratified on y_encoded to keep the class ratio the same in
# both partitions. Partition sizes are unchanged by stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y_encoded,
    test_size=0.2,
    random_state=0,
    stratify=y_encoded,
)

# Feature scaling: fit the scaler on the training split only and reuse its
# statistics for the test split, so no test information reaches the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Check the (rows, columns) shape of the scaled training matrix.
X_train_scaled.shape

(712, 38)

In [21]:
# Convert the scaled features and the labels to float32 PyTorch tensors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)