In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide pandas/scikit-learn deprecation or numerical warnings. Consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Load the data.
# NOTE(review): machine-specific absolute path; the notebook only runs where this
# file exists. Point DATA_PATH at a project-relative location when sharing.
DATA_PATH = r"C:\Users\speec\OneDrive\Desktop\딥테크팁스\tips\output\merged_df_info1.xlsx"
df = pd.read_excel(DATA_PATH)

print("===== 데이터 로딩 =====")
print("First few rows of the data:")
display(df.head())
print("\nDataframe info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()


First few rows of the data:


Unnamed: 0.1,Unnamed: 0,time_block,D_score,final_negative\n_emotion_score,L_influence,issue_\n대중이슈,issue_\n민감이슈,issue_\n불쏘시개이슈,issue_\n진입이슈,delta_log_M\n_norm,W_score\n_norm,log_M\n_norm,Cause,Entity,Event,Impact,Reaction
0,0,2025-04-24 06:00:00,0.50527,0.80673,0.371399,1,1,0,1,0.676837,0.654986,0.274076,1,4,9,0,0
1,1,2025-04-25 06:00:00,0.333333,0.792353,0.356866,1,0,0,1,0.60528,0.583985,0.177319,1,2,4,0,0
2,2,2025-04-25 18:00:00,0.426424,0.999876,0.371176,1,0,0,0,0.52776,0.508918,0.036465,0,1,2,0,0
3,3,2025-04-26 00:00:00,0.413081,0.840789,0.433978,1,0,0,0,0.661289,0.642264,0.233604,0,2,4,0,0
4,4,2025-04-26 06:00:00,0.467193,0.824074,0.503796,1,0,0,0,0.421334,0.402809,0.08886,0,0,2,0,0



Dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Unnamed: 0                     28 non-null     int64         
 1   time_block                     28 non-null     datetime64[ns]
 2   D_score                        28 non-null     float64       
 3   final_negative
_emotion_score  28 non-null     float64       
 4   L_influence                    28 non-null     float64       
 5   issue_
대중이슈                    28 non-null     int64         
 6   issue_
민감이슈                    28 non-null     int64         
 7   issue_
불쏘시개이슈                  28 non-null     int64         
 8   issue_
진입이슈                    28 non-null     int64         
 9   delta_log_M
_norm              28 non-null     float64       
 10  W_score
_norm                  28 non-null     float64       
 11  log_

None

In [None]:
print("\n===== 데이터 전처리 =====")

# Short ASCII names for the issue-flag columns, whose original headers contain
# a line break followed by a Korean label.
issue_column_names = {
    'issue_\n대중이슈': 'issue_public',
    'issue_\n민감이슈': 'issue_sensitive',
    'issue_\n불쏘시개이슈': 'issue_trigger',
    'issue_\n진입이슈': 'issue_entry',
}

# Rename the issue columns, then discard the 'Unnamed: 0' column when it exists
# (presumably an index that was written out by an earlier export — TODO confirm).
# errors='ignore' makes the drop a no-op if the column is absent.
df = (
    df.rename(columns=issue_column_names)
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

display(df.head())

Unnamed: 0,time_block,D_score,final_negative\n_emotion_score,L_influence,issue_public,issue_sensitive,issue_trigger,issue_entry,delta_log_M\n_norm,W_score\n_norm,log_M\n_norm,Cause,Entity,Event,Impact,Reaction
0,2025-04-24 06:00:00,0.50527,0.80673,0.371399,1,1,0,1,0.676837,0.654986,0.274076,1,4,9,0,0
1,2025-04-25 06:00:00,0.333333,0.792353,0.356866,1,0,0,1,0.60528,0.583985,0.177319,1,2,4,0,0
2,2025-04-25 18:00:00,0.426424,0.999876,0.371176,1,0,0,0,0.52776,0.508918,0.036465,0,1,2,0,0
3,2025-04-26 00:00:00,0.413081,0.840789,0.433978,1,0,0,0,0.661289,0.642264,0.233604,0,2,4,0,0
4,2025-04-26 06:00:00,0.467193,0.824074,0.503796,1,0,0,0,0.421334,0.402809,0.08886,0,0,2,0,0


In [None]:
print("\n===== 라벨 생성 =====")
# Locate the log_M_norm column. Its header contains a line break
# ('log_M\n_norm'), so compare names with all whitespace removed.
# A substring test ('log_M' in c and 'norm' in c) is not enough: it also matches
# 'delta_log_M\n_norm', which comes earlier in the frame and would therefore be
# picked by [0], silently building the label from the wrong series.
log_m_matches = [c for c in df.columns if ''.join(str(c).split()) == 'log_M_norm']
if len(log_m_matches) != 1:
    raise ValueError(
        f"Expected exactly one log_M_norm column, found {log_m_matches!r} "
        f"among {list(df.columns)!r}"
    )
log_m_col = log_m_matches[0]
print(f"라벨 컬럼: {log_m_col}")

# Label: use the features at row t to predict log_M_norm at the next row (t+1).
# shift(-1) pulls the next row's value up onto the current row.
# NOTE(review): "next" means the next row, not a fixed time step — confirm that
# the rows are sorted by time_block and that irregular gaps are acceptable.
df['label'] = df[log_m_col].shift(-1)

# The last row has no successor (its label is NaN), so drop it.
# NOTE: this cell is not idempotent — re-running it without reloading df shifts
# again and removes one more row. Re-run from the data-loading cell instead.
df = df.iloc[:-1].reset_index(drop=True)

# Input features are every column except the timestamp, the label source
# column, and the label itself.
n_input_features = sum(
    1 for c in df.columns if c not in ['time_block', log_m_col, 'label']
)

print(f"\n예측 구조:")
print(f"  입력: t시점의 모든 특성들")
print(f"  출력: t+1시점의 {log_m_col}")
print(f"\n데이터 정보:")
print(f"  샘플 수: {len(df)}")
print(f"  입력 특성: {n_input_features}")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   time_block                     28 non-null     datetime64[ns]
 1   D_score                        28 non-null     float64       
 2   final_negative
_emotion_score  28 non-null     float64       
 3   L_influence                    28 non-null     float64       
 4   issue_public                   28 non-null     int64         
 5   issue_sensitive                28 non-null     int64         
 6   issue_trigger                  28 non-null     int64         
 7   issue_entry                    28 non-null     int64         
 8   delta_log_M
_norm              28 non-null     float64       
 9   W_score
_norm                  28 non-null     float64       
 10  log_M
_norm                    28 non-null     float64       
 11  Cause                