In [2]:
!pip install tsfresh



PREPROCESSING

In [4]:
# 패키지 로드
import pandas as pd
import numpy as np
import random
import time
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from tsfresh import extract_features, select_features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight

# Seed configuration: one value shared by every stochastic step, and applied
# to the global `random` / NumPy generators so reruns start from the same state.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Start of the wall-clock measurement for the whole notebook
start = time.time()

# Load the data (paths are relative to the notebook's working directory)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Fill the missing values of a single time series using its periodic structure
def periodic_interpolation(series, period):
    """Replace each NaN with the mean of the observed values at the same phase.

    Two positions share a phase when they are equal modulo ``period``. A
    missing value is filled with the mean of all non-missing values found at
    its phase anywhere in the series; if no value is observed at that phase
    the NaN is left in place.

    Parameters
    ----------
    series : array-like of float
        One time series (NumPy array, list or pandas Series). It is read
        positionally and never modified.
    period : int
        Period length in samples; must be at least 1.

    Returns
    -------
    numpy.ndarray or pandas.Series
        A float copy of ``series`` with the fillable gaps filled. A pandas
        Series input yields a Series with the same index and name; any other
        input yields a NumPy array.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError(f"period must be a positive integer, got {period}")

    # Work on a positional float copy: this keeps the input untouched and makes
    # label-indexed inputs (a Series whose index is not 0..n-1) behave correctly.
    values = np.array(series, dtype=float)
    missing = np.isnan(values)

    # Each phase is processed once. Phases are disjoint sets of positions, so
    # filling one phase can never influence the mean computed for another.
    for phase in np.unique(np.flatnonzero(missing) % period):
        positions = np.arange(phase, len(values), period)
        phase_missing = missing[positions]
        observed = values[positions][~phase_missing]
        if observed.size:
            values[positions[phase_missing]] = observed.mean()

    if isinstance(series, pd.Series):
        return pd.Series(values, index=series.index, name=series.name)
    return values

# Apply period-based interpolation to every time series (row) of a DataFrame
def apply_periodic_interpolation(df, min_peak_distance=100):
    """Fill the gaps of each row of ``df`` with ``periodic_interpolation``.

    Every column of ``df`` is treated as one time step, so the caller must
    pass only the time-series columns. For each row the period is estimated
    as the median distance between the peaks found by
    ``scipy.signal.find_peaks``; rows with fewer than two detected peaks are
    left unchanged and a warning is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        One time series per row; all columns must be convertible to float.
    min_peak_distance : int, default 100
        Minimum number of samples between neighbouring peaks, forwarded to
        ``find_peaks`` as ``distance``.

    Returns
    -------
    pandas.DataFrame
        A new float DataFrame with the same index and columns as ``df``.
        ``df`` itself is not modified.
    """
    # A private float copy avoids writing into what may be a slice of the
    # caller's frame, and positional row access makes the function independent
    # of the index labels.
    values = df.to_numpy(dtype=float, copy=True)

    for position, label in enumerate(df.index):
        row = values[position]

        # Period detection.
        # NOTE(review): peaks are searched on the series before its gaps are
        # filled - confirm find_peaks behaves acceptably around NaNs.
        peaks, _ = find_peaks(row, distance=min_peak_distance)
        periods = np.diff(peaks)
        if len(periods) > 0:
            period = int(np.median(periods))  # period length = median peak spacing

            # Write the interpolated series back into the working copy
            values[position] = periodic_interpolation(row, period)
        else:
            print(f"Warning: No peaks detected for row {label}. Skipping interpolation.")

    return pd.DataFrame(values, index=df.index, columns=df.columns)

# Interpolate missing values. The slices hold only the time-series columns;
# .copy() detaches them from `train` / `test` so nothing is written back into
# the raw frames.
train_indexed = train.iloc[:, 1:501].copy()
test_indexed = test.iloc[:, 1:501].copy()
train_nomissing = apply_periodic_interpolation(train_indexed)
test_nomissing = apply_periodic_interpolation(test_indexed)


def _to_tsfresh_long(wide):
    """Reshape a wide frame (one series per row) into tsfresh's long format.

    Returns a DataFrame with columns ``id`` (the row's index label), ``time``
    (the integer position of the column) and ``value``. Using the column
    position rather than the column label for ``time`` matters because CSV
    headers are read as strings, so sorting on the labels would be
    lexicographic unless they happen to be zero-padded.
    """
    n_series, n_steps = wide.shape
    return pd.DataFrame({
        'id': np.repeat(wide.index.to_numpy(), n_steps),
        'time': np.tile(np.arange(n_steps), n_series),
        'value': wide.to_numpy().ravel(),
    })


# Feature extraction with tsfresh - training set
time_series_data = _to_tsfresh_long(train_nomissing)
features = extract_features(time_series_data, column_id='id', column_sort='time')
fe1 = features

# Feature extraction with tsfresh - test set (same preprocessing)
time_series_test = _to_tsfresh_long(test_nomissing)
features = extract_features(time_series_test, column_id='id', column_sort='time')
fe1t = features

# Drop features containing NaN, then keep only the features that survive in
# BOTH sets so every feature selected on train is guaranteed to exist for test.
fe2 = fe1.dropna(axis=1)
fe2t = fe1t.dropna(axis=1)
shared_columns = fe2.columns.intersection(fe2t.columns)
fe2 = fe2[shared_columns]
fe2t = fe2t[shared_columns]

# Labels
y = train['Label']

# Feature selection with tsfresh
relevant_features = select_features(fe2, y, multiclass=True, n_significant=3, ml_task='classification')
X = fe2[relevant_features.columns]

# Encode the categorical metadata columns
def encode_data(df):
    """Encode the ``Year``, ``Country`` and ``S/N`` columns as numeric codes.

    The input frame is not modified; the encodings are returned in a new
    DataFrame that shares ``df``'s index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Year``, ``Country`` and ``S/N``.

    Returns
    -------
    pandas.DataFrame
        Columns:

        * ``Year_encoded`` - decade bucket: -1 for years before 1990, 0 for
          1990-1999, 1 for 2000-2009, 2 for 2010-2019, 3 for 2020 and later.
        * ``Country_encoded`` - 0 for KOR, 1 for CHN, 2 for USA after the
          spelling variants listed below are normalised; NaN for any other
          value.
        * ``S/N_encoded`` - 0 when the first four characters of ``S/N`` are
          ``PSCG``, 1 when they are ``PSFT``, -1 otherwise.
    """
    year_bins = [1990, 2000, 2010, 2020]
    country_aliases = {
        '중국': 'CHN', '美国': 'USA', 'china': 'CHN', '中国': 'CHN', 'Korea': 'KOR',
        'america': 'USA', '미국': 'USA', 'U.S.': 'USA', '대한민국': 'KOR', '韩国': 'KOR',
        'South Korea': 'KOR', '한국': 'KOR'
    }
    country_mapping = {'KOR': 0, 'CHN': 1, 'USA': 2}
    serial_prefix_codes = {'PSCG': 0, 'PSFT': 1}

    # np.digitize returns the 1-based bin index, hence the "- 1"
    year_encoded = np.digitize(df['Year'], bins=year_bins) - 1
    country_encoded = df['Country'].replace(country_aliases).map(country_mapping)
    serial_encoded = df['S/N'].str[:4].apply(lambda prefix: serial_prefix_codes.get(prefix, -1))

    # Plain arrays are passed so the columns are placed positionally onto
    # df.index instead of being re-aligned label by label.
    return pd.DataFrame(
        {
            'Year_encoded': year_encoded,
            'Country_encoded': country_encoded.to_numpy(),
            'S/N_encoded': serial_encoded.to_numpy(),
        },
        index=df.index,
    )

# Encode the categorical metadata of both sets
train_encoded = encode_data(train)
test_encoded = encode_data(test)

# Build X from the selected tsfresh features rather than from X itself, so that
# re-running this cell does not append the encoded columns a second time.
X = pd.concat([fe2[relevant_features.columns], train_encoded], axis=1)

# Split the columns into categorical and continuous features
def identify_feature_types(df, threshold=3):
    """Classify each column of ``df`` by its number of distinct values.

    A column with at most ``threshold`` distinct values (as counted by
    ``Series.nunique``) is reported as categorical; every other column is
    reported as continuous. Column order is preserved within each list.

    Returns
    -------
    tuple of (list, list)
        ``(categorical_features, continuous_features)``.
    """
    categorical_features, continuous_features = [], []
    for name in df.columns:
        few_distinct = df[name].nunique() <= threshold
        bucket = categorical_features if few_distinct else continuous_features
        bucket.append(name)
    return categorical_features, continuous_features

# Partition the columns of X into categorical and continuous feature lists
# (default threshold: at most 3 distinct values counts as categorical).
categorical_features, continuous_features = identify_feature_types(X)

Feature Extraction: 100%|██████████| 13000/13000 [1:34:35<00:00,  2.29it/s]
Feature Extraction: 100%|██████████| 2000/2000 [14:12<00:00,  2.34it/s]


KeyError: 'Year'

MODELING (OPTUNA)

In [13]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.2 colorlog-6.8.2 optuna-3.6.1


In [24]:
# Hold out 20% of the training data as a validation split
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit the RobustScaler on the training split only and reuse it for validation
scaler = RobustScaler()
scaler.fit(x_train[continuous_features])
x_train_scale = scaler.transform(x_train[continuous_features])
x_val_scale = scaler.transform(x_val[continuous_features])

# Wrap the scaled arrays back into DataFrames that keep the original row labels
X_continuous_df = pd.DataFrame(
    x_train_scale, index=x_train.index, columns=continuous_features
)
x_val_continuous_df = pd.DataFrame(
    x_val_scale, index=x_val.index, columns=continuous_features
)

# Final datasets: untouched categorical columns followed by scaled continuous ones.
# Note that `test_set` holds the validation split in this cell.
train_parts = [x_train[categorical_features], X_continuous_df]
val_parts = [x_val[categorical_features], x_val_continuous_df]
train_set = pd.concat(train_parts, axis=1)
test_set = pd.concat(val_parts, axis=1)

# Model tuning
import optuna
from xgboost import XGBClassifier
def objective(trial):
    """Optuna objective: validation accuracy of one XGBoost configuration.

    Samples a set of hyperparameters from ``trial``, fits an ``XGBClassifier``
    on the module-level ``train_set`` / ``y_train`` with balanced sample
    weights, early-stops on the module-level ``test_set`` / ``y_val`` and
    returns the fraction of correct predictions on that same split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the validation split (higher is better).
    """
    param = {
        'verbosity': 1,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'alpha': trial.suggest_float('alpha', 0, 10),
        'lambda': trial.suggest_float('lambda', 0, 10)
    }

    # Weight the samples so every class contributes equally to the loss
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # early_stopping_rounds is configured on the estimator: recent XGBoost
    # releases no longer accept it as a fit() argument.
    model = XGBClassifier(**param, early_stopping_rounds=10, random_state=SEED)
    model.fit(train_set, y_train, eval_set=[(test_set, y_val)], sample_weight=sample_weights, verbose=False)

    preds = model.predict(test_set)
    acc = np.mean(preds == y_val)

    return acc

# Run the Optuna study. The TPE sampler (Optuna's default algorithm) is seeded
# explicitly so the sequence of sampled hyperparameters is reproducible.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=30)

print("Best trial:")
trial = study.best_trial

print(f"  Accuracy: {trial.value}")
print("  Best hyperparameters: ", trial.params)

# Best hyperparameters found by the study
best_params = trial.params
model = XGBClassifier(**best_params, eval_metric='mlogloss', random_state=SEED)
# Timestamp taken once preprocessing and tuning are finished
middle = time.time()

[I 2024-07-19 11:31:43,765] A new study created in memory with name: no-name-41e2398b-a21b-456a-95c8-6bfdee066b09
[I 2024-07-19 11:33:19,746] Trial 0 finished with value: 0.9834615384615385 and parameters: {'n_estimators': 416, 'learning_rate': 0.08437782175702015, 'max_depth': 7, 'colsample_bytree': 0.9383018053724173, 'subsample': 0.9083475006942705, 'alpha': 0.8424939450365576, 'lambda': 5.2405600831397905}. Best is trial 0 with value: 0.9834615384615385.
[I 2024-07-19 11:33:54,968] Trial 1 finished with value: 0.9746153846153847 and parameters: {'n_estimators': 260, 'learning_rate': 0.1293825381073609, 'max_depth': 4, 'colsample_bytree': 0.9131327278088507, 'subsample': 0.550548939744332, 'alpha': 7.358254749087157, 'lambda': 8.75715031805688}. Best is trial 0 with value: 0.9834615384615385.
[I 2024-07-19 11:34:52,316] Trial 2 finished with value: 0.9773076923076923 and parameters: {'n_estimators': 980, 'learning_rate': 0.12293378279989135, 'max_depth': 10, 'colsample_bytree': 0.93

Best trial:
  Accuracy: 0.9876923076923076
  Best hyperparameters:  {'n_estimators': 433, 'learning_rate': 0.17544298336548947, 'max_depth': 3, 'colsample_bytree': 0.7979479822008998, 'subsample': 0.6814118250794118, 'alpha': 0.07158768442748387, 'lambda': 2.8854317762003943}


In [25]:
# Display the best hyperparameters found by the study
best_params

{'n_estimators': 433,
 'learning_rate': 0.17544298336548947,
 'max_depth': 3,
 'colsample_bytree': 0.7979479822008998,
 'subsample': 0.6814118250794118,
 'alpha': 0.07158768442748387,
 'lambda': 2.8854317762003943}

MODELING WITH BEST PARAMETERS

In [26]:
# Best hyperparameters, pinned from the tuning run so this cell can be
# executed without repeating the search.
best_params = {'n_estimators': 433,
               'learning_rate': 0.17544298336548947,
               'max_depth': 3,
               'colsample_bytree': 0.7979479822008998,
               'subsample': 0.6814118250794118,
               'alpha': 0.07158768442748387,
               'lambda': 2.8854317762003943}

model = XGBClassifier(**best_params, eval_metric='mlogloss', random_state=SEED)

# Take the selected features from the un-pruned test features (fe1t). A feature
# selected on train may have been dropped from fe2t because it contains NaN for
# some test rows; indexing fe2t would then raise a KeyError. When no selected
# feature was dropped, this yields exactly the same frame as fe2t would.
# RobustScaler keeps NaNs in transform and XGBoost treats NaN as missing.
fe2t_t = fe1t[relevant_features.columns]
t = pd.concat([fe2t_t, test_encoded], axis=1)

# Refit the scaler on the full training set, then apply it to the test set
scaler = RobustScaler()
x_train_scale = scaler.fit_transform(X[continuous_features])
x_test_scale = scaler.transform(t[continuous_features])

X_continuous_df = pd.DataFrame(x_train_scale, index=X.index, columns=continuous_features)
x_test_continuous_df = pd.DataFrame(x_test_scale, index=t.index, columns=continuous_features)

# From here on `train_set` / `test_set` hold the full training data and the
# real test data (they held the train/validation split in the tuning cell).
train_set = pd.concat([X[categorical_features], X_continuous_df], axis=1)
test_set = pd.concat([t[categorical_features], x_test_continuous_df], axis=1)

# Train on all labelled data with balanced class weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y)
model.fit(train_set, y, sample_weight=sample_weights)

PREDICT

In [27]:
# Predict
y_test_pred = model.predict(test_set)
y_test_pred_series = pd.Series(y_test_pred)

# Build the submission file
submission_df = sample_submission.copy()
if len(y_test_pred) != len(submission_df):
    raise ValueError(
        f"Got {len(y_test_pred)} predictions for {len(submission_df)} submission rows"
    )
# Assign the raw array so labels are placed positionally; assigning a Series
# aligns on the index and would silently write NaN wherever the labels differ.
submission_df['Label'] = y_test_pred
submission_df.to_csv('submission_result_xgb_fffinal_optuna.csv', index=False)

# End of the wall-clock measurement
end = time.time()

# Report timings. `middle` is set at the end of the tuning cell, so the second
# figure covers everything from notebook start up to that point.
print(f"전체 실행 시간: {end - start} 초")
print(f"모델 학습 시간: {middle - start} 초")
print(f"예측 및 파일 생성 시간: {end - middle} 초")

전체 실행 시간: 8914.377143144608 초
모델 학습 시간: 8685.92897939682 초
예측 및 파일 생성 시간: 228.44816374778748 초
