### Import & Data Load

In [None]:
# One-time setup: scikit-learn provides the LabelEncoder imported in the import cell.
# Kept commented out; uncomment to install (inside a notebook, `%pip install`
# targets the running kernel's environment).
# pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   ------- -------------------------------- 2.1/10.7 MB 11.8 MB/s eta 0:00:01
   ----------------- ---------------------- 4.7/10.7 MB 11.9 MB/s eta 0:00:01
   --------------------------- ------------ 7.3/10.7 MB 11.9 MB/s eta 0:00:01
   ------------------------------------ --- 9.7/10.7 MB 11.9 MB/s eta 0:00:01
   ---------------------------------------  10.5/10.7 MB 11.9 MB/s eta 0:00:01
   ---------------------------------------- 10.7/10.7 MB 10.1 MB/s eta 0:00:00
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
# Folder that holds the competition CSV files. The DACON_DATA_DIR environment
# variable overrides it so the notebook can run on another machine without
# editing code; when it is unset, the original local dataset folder is used.
DATA_DIR = Path(os.environ.get('DACON_DATA_DIR', r'D:\workspace_1\dacon_venv\test\dataset'))

# train.csv carries the features plus the stress_score target;
# test.csv carries the rows that predictions are produced for.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

In [None]:
# Number of non-missing entries in each column of the training frame
train.notna().sum()

ID                          3000
gender                      3000
age                         3000
height                      3000
weight                      3000
cholesterol                 3000
systolic_blood_pressure     3000
diastolic_blood_pressure    3000
glucose                     3000
bone_density                3000
activity                    3000
smoke_status                3000
medical_history             3000
family_medical_history      3000
sleep_pattern               3000
edu_level                   3000
mean_working                3000
stress_score                3000
dtype: int64

### Check Data

In [None]:
# Notes on the missing-value counts displayed by this cell:
# - Replacing 1486 out of 3000 rows with the mode is likely to cause problems.
# - Hypothesis 1: edu_level does not have that many missing values, so fill it with the mode.
# - Hypothesis 2: for family_medical_history and medical_history, fill nulls with 0 or some other value.
train.isna().sum()


ID                             0
gender                         0
age                            0
height                         0
weight                         0
cholesterol                    0
systolic_blood_pressure        0
diastolic_blood_pressure       0
glucose                        0
bone_density                   0
activity                       0
smoke_status                   0
medical_history             1289
family_medical_history      1486
sleep_pattern                  0
edu_level                    607
mean_working                1032
stress_score                   0
dtype: int64

In [None]:
# Identify the columns that contain at least one missing value
missing_columns_train = train.columns[train.isna().any()]
missing_columns_train

Index(['medical_history', 'family_medical_history', 'edu_level',
       'mean_working'],
      dtype='object')

In [None]:
# Non-null counts and dtypes for just the columns that have missing values
train.loc[:, missing_columns_train].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   medical_history         1711 non-null   object 
 1   family_medical_history  1514 non-null   object 
 2   edu_level               2393 non-null   object 
 3   mean_working            1968 non-null   float64
dtypes: float64(1), object(3)
memory usage: 93.9+ KB


In [None]:
# Column names of `train` grouped by dtype family. These are computed once up
# front: the result does not depend on the column being inspected, so there is
# no need to rerun select_dtypes for every column that has missing values.
categorical_dtype_cols = set(train.select_dtypes(include=['object', 'category']).columns)
numerical_dtype_cols = set(train.select_dtypes(include=['int', 'float']).columns)

# Split the columns with missing values by dtype family, preserving their order.
# A column that is neither object/category nor int/float lands in neither list.
categorical_na_cols = [col for col in missing_columns_train if col in categorical_dtype_cols]
numerical_na_cols = [col for col in missing_columns_train if col in numerical_dtype_cols]

# The printed labels are Korean for "categorical variables with missing values"
# and "numerical variables with missing values".
print("결측값이 있는 범주형 변수:", categorical_na_cols)
print("결측값이 있는 수치형 변수:", numerical_na_cols)

결측값이 있는 범주형 변수: ['medical_history', 'family_medical_history', 'edu_level']
결측값이 있는 수치형 변수: ['mean_working']


### Data Preprocessing

In [None]:
# Impute missing categorical values with the mode (most frequent value).
# NOTE(review): the notes next to the missing-value counts propose filling
# medical_history / family_medical_history with 0 or another value instead of
# the mode — confirm that mode imputation for those columns is intended.
for col in categorical_na_cols:
    # The mode always comes from the training data ...
    most_frequent = train[col].mode()[0]

    # ... and the same value fills the gaps in both train and test.
    for frame in (train, test):
        frame[col] = frame[col].fillna(most_frequent)

In [None]:
# Impute missing mean_working values with the median of the training column;
# the training median is reused for the test data as well.
median_value = train['mean_working'].median()

for frame in (train, test):
    frame['mean_working'] = frame['mean_working'].fillna(median_value)

In [None]:
# Columns to label-encode: every object-dtype column of train except ID
categorical_cols = train.select_dtypes(include='object').columns.drop('ID')

for feature in categorical_cols:
    # Learn the label -> integer mapping from the training values only
    le = LabelEncoder().fit(train[feature])
    train[feature] = le.transform(train[feature])

    # Labels that occur only in the test data get appended to the encoder's
    # classes (presumably so that transform() below accepts them); their codes
    # therefore lie beyond the range learned from train.
    unseen_labels = [label for label in np.unique(test[feature]) if label not in le.classes_]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)

    test[feature] = le.transform(test[feature])

In [None]:
# Training features and target: ID and the stress_score target are removed
# from the feature matrix, and stress_score alone becomes the target.
x_train = train.drop(columns=['ID', 'stress_score'])
y_train = train['stress_score']

# Keep exactly the training feature columns, in training order. This removes
# ID like the previous drop() did, but it is safe to re-run (drop() raised
# KeyError on the second execution because `test` is rebound here), it keeps
# the test columns aligned with the columns the model is fitted on, and it
# fails immediately with KeyError if a feature is missing from the test data.
test = test[x_train.columns]

### Train / Predict

In [None]:
# LightGBM regressor with default settings apart from the fixed random_state;
# fit() returns the fitted estimator, so construction and training are chained.
model = LGBMRegressor(random_state=42).fit(x_train, y_train)

# Predicted stress_score for every row of the prepared test features
pred = model.predict(test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1454
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 16
[LightGBM] [Info] Start training from score 0.482130


### Submission

In [None]:
# Read the sample submission from the dataset folder. The DACON_DATA_DIR
# environment variable overrides the folder so the notebook is not tied to one
# machine; when it is unset, the original local dataset folder is used.
sample_submission_path = (
    Path(os.environ.get('DACON_DATA_DIR', r'D:\workspace_1\dacon_venv\test\dataset'))
    / 'sample_submission.csv'
)
submission = pd.read_csv(sample_submission_path)

In [None]:
# Put the model predictions into the stress_score column and preview the first rows
submission = submission.assign(stress_score=pred)
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.412394
1,TEST_0001,0.689009
2,TEST_0002,0.183309
3,TEST_0003,0.522015
4,TEST_0004,0.572112


In [None]:
# Write the submission to submit.csv in the working directory, without the row index
submission.to_csv(path_or_buf='submit.csv', index=False)