In [1]:
import pandas as pd
import utils
# Read the CSV file from Google Drive
filename = 'Data//train_data.csv'
df = pd.read_csv(filename)

In [2]:
# Print the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156459 entries, 0 to 3156458
Data columns (total 16 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   fog_train.year      object 
 2   fog_train.month     int64  
 3   fog_train.day       int64  
 4   fog_train.time      int64  
 5   fog_train.minute    int64  
 6   fog_train.stn_id    object 
 7   fog_train.ws10_deg  float64
 8   fog_train.ws10_ms   float64
 9   fog_train.ta        float64
 10  fog_train.re        float64
 11  fog_train.hm        float64
 12  fog_train.sun10     float64
 13  fog_train.ts        float64
 14  fog_train.vis1      int64  
 15  fog_train.class     int64  
dtypes: float64(7), int64(7), object(2)
memory usage: 385.3+ MB
None


In [3]:
# 각 클래스별로 10% 샘플링
sampled_df = df.groupby('fog_train.class', group_keys=False).apply(lambda x: x.sample(frac=0.1))

# 샘플링된 데이터 저장
sampled_df.to_csv('Data//sampled_train.csv', index=False)

In [4]:
# 결과 확인
print(sampled_df)

         Unnamed: 0 fog_train.year  fog_train.month  fog_train.day  \
2416057     2416058              K               11             30   
973500       973501              I                6             21   
1496797     1496798              J                6              4   
1508825     1508826              J                8             27   
2921209     2921210              K                7             11   
...             ...            ...              ...            ...   
764084       764085              I                7              1   
1086465     1086466              J                8             14   
3101402     3101403              K               12             14   
2491981     2491982              K                5             11   
2151049     2151050              K               11             14   

         fog_train.time  fog_train.minute fog_train.stn_id  \
2416057               7                40               AF   
973500               13            

In [5]:
# 데이터 프레임 정보 확인
sampled_df_info = sampled_df.info()

# 데이터 통계 요약
sampled_df_description = sampled_df.describe()

print(sampled_df_info, sampled_df_description)

<class 'pandas.core.frame.DataFrame'>
Index: 315647 entries, 2416057 to 2151049
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          315647 non-null  int64  
 1   fog_train.year      315647 non-null  object 
 2   fog_train.month     315647 non-null  int64  
 3   fog_train.day       315647 non-null  int64  
 4   fog_train.time      315647 non-null  int64  
 5   fog_train.minute    315647 non-null  int64  
 6   fog_train.stn_id    315647 non-null  object 
 7   fog_train.ws10_deg  315647 non-null  float64
 8   fog_train.ws10_ms   315647 non-null  float64
 9   fog_train.ta        315647 non-null  float64
 10  fog_train.re        315647 non-null  float64
 11  fog_train.hm        315647 non-null  float64
 12  fog_train.sun10     315647 non-null  float64
 13  fog_train.ts        315647 non-null  float64
 14  fog_train.vis1      315647 non-null  int64  
 15  fog_train.class     315647 non-n

In [6]:
# 'Unnamed: 0' 열과 'fog_train.vis1' 열 삭제 -> vis1을 예측하는 class 문제로 바꾸기 -> 오히려 class 데이터를 없애야한다.
sampled_df_cleaned = sampled_df.drop(columns=['Unnamed: 0','fog_train.vis1','fog_train.ws10_deg'])
# fog_train.ws10_deg 너무 낮아서 일단 없애보기

# 결과 확인
# print(sampled_df_cleaned)

In [7]:
# 이상치 처리
cleaned_data = utils.handle_outliers(sampled_df_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column].replace(-99.9, np.nan, inplace=True)


In [8]:
# 결과 확인
cleaned_data_info = cleaned_data.info()
cleaned_data_description = cleaned_data.describe()

print(cleaned_data_info, cleaned_data_description)

<class 'pandas.core.frame.DataFrame'>
Index: 299517 entries, 1678781 to 2151049
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fog_train.year      299517 non-null  object 
 1   fog_train.month     299517 non-null  int64  
 2   fog_train.day       299517 non-null  int64  
 3   fog_train.time      299517 non-null  int64  
 4   fog_train.minute    299517 non-null  int64  
 5   fog_train.stn_id    299517 non-null  object 
 6   fog_train.ws10_deg  299517 non-null  float64
 7   fog_train.ws10_ms   299517 non-null  float64
 8   fog_train.ta        299517 non-null  float64
 9   fog_train.re        299517 non-null  float64
 10  fog_train.hm        299517 non-null  float64
 11  fog_train.sun10     299517 non-null  float64
 12  fog_train.ts        299517 non-null  float64
 13  fog_train.class     299517 non-null  int64  
dtypes: float64(7), int64(5), object(2)
memory usage: 34.3+ MB
None        fog_train.mo

In [9]:
# 'fog_train.year' 열의 값을 연도 값으로 변환
cleaned_data['fog_train.year'] = utils.convert_year(cleaned_data['fog_train.year'])
cleaned_data['fog_train.year'] = cleaned_data['fog_train.year'].astype(int)

In [10]:
# 'fog_train.month'를 기반으로 계절 열 추가
cleaned_data['season'] = cleaned_data['fog_train.month'].apply(utils.get_season)

In [11]:
# 'fog_train.day'를 카테고리로 변환하여 새로운 열 추가
cleaned_data['day_category'] = cleaned_data['fog_train.day'].apply(utils.categorize_day)

In [12]:
# 측정소 ID의 앞자리와 뒷자리를 분리하여 새로운 열 추가
cleaned_data['region'] = cleaned_data['fog_train.stn_id'].str[0]
# cleaned_data['station'] = cleaned_data['fog_train.stn_id'].str[1]

In [13]:
# 'fog_train.year', 'fog_train.month', 'fog_train.day', 'fog_train.stn_id' 열 삭제
cleaned_data.drop(columns=['fog_train.day', 'fog_train.stn_id','fog_train.minute'], inplace=True)

In [14]:
# 결과 확인
cleaned_data.info()
cleaned_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 299517 entries, 1678781 to 2151049
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fog_train.year      299517 non-null  int32  
 1   fog_train.month     299517 non-null  int64  
 2   fog_train.time      299517 non-null  int64  
 3   fog_train.ws10_deg  299517 non-null  float64
 4   fog_train.ws10_ms   299517 non-null  float64
 5   fog_train.ta        299517 non-null  float64
 6   fog_train.re        299517 non-null  float64
 7   fog_train.hm        299517 non-null  float64
 8   fog_train.sun10     299517 non-null  float64
 9   fog_train.ts        299517 non-null  float64
 10  fog_train.class     299517 non-null  int64  
 11  season              299517 non-null  object 
 12  day_category        299517 non-null  object 
 13  region              299517 non-null  object 
dtypes: float64(7), int32(1), int64(3), object(3)
memory usage: 33.1+ MB


Unnamed: 0,fog_train.year,fog_train.month,fog_train.time,fog_train.ws10_deg,fog_train.ws10_ms,fog_train.ta,fog_train.re,fog_train.hm,fog_train.sun10,fog_train.ts,fog_train.class,season,day_category,region
1678781,2021,11,8,338.3,1.1,3.2,0.0,96.3,0.02,4.0,1,Fall,Mid,B
1472243,2021,12,1,238.05,0.0,3.3,0.0,93.0,0.0,1.8,1,Winter,Mid,A
775022,2020,9,4,137.8,1.3,20.2,0.0,88.7,0.0,18.8,1,Fall,Mid,D
757448,2020,5,3,248.5,1.9,14.8,0.0,96.8,0.0,14.9,1,Spring,Mid,D
789218,2020,12,18,195.6,1.8,4.4,0.0,58.2,0.0,-0.8,1,Winter,Late,D


In [15]:
# 원-핫 인코딩
X_ohe_encoded = cleaned_data.copy()
X_ohe_encoded = pd.get_dummies(X_ohe_encoded, columns=['fog_train.year','region', 'season', 'day_category','fog_train.re'])

X_ohe_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299517 entries, 1678781 to 2151049
Data columns (total 26 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fog_train.month      299517 non-null  int64  
 1   fog_train.time       299517 non-null  int64  
 2   fog_train.ws10_deg   299517 non-null  float64
 3   fog_train.ws10_ms    299517 non-null  float64
 4   fog_train.ta         299517 non-null  float64
 5   fog_train.hm         299517 non-null  float64
 6   fog_train.sun10      299517 non-null  float64
 7   fog_train.ts         299517 non-null  float64
 8   fog_train.class      299517 non-null  int64  
 9   fog_train.year_2020  299517 non-null  bool   
 10  fog_train.year_2021  299517 non-null  bool   
 11  fog_train.year_2022  299517 non-null  bool   
 12  region_A             299517 non-null  bool   
 13  region_B             299517 non-null  bool   
 14  region_C             299517 non-null  bool   
 15  region_D       

In [16]:
from sklearn.preprocessing import LabelEncoder

X_label_encoded= cleaned_data.copy()

# 레이블 인코딩
label_encoders = {}
for column in ['fog_train.year','region', 'season', 'day_category']:
    le = LabelEncoder()
    X_label_encoded[column] = le.fit_transform(X_label_encoded[column])
    label_encoders[column] = le

X_label_encoded.head()

Unnamed: 0,fog_train.year,fog_train.month,fog_train.time,fog_train.ws10_deg,fog_train.ws10_ms,fog_train.ta,fog_train.re,fog_train.hm,fog_train.sun10,fog_train.ts,fog_train.class,season,day_category,region
1678781,1,11,8,338.3,1.1,3.2,0.0,96.3,0.02,4.0,1,0,2,1
1472243,1,12,1,238.05,0.0,3.3,0.0,93.0,0.0,1.8,1,3,2,0
775022,0,9,4,137.8,1.3,20.2,0.0,88.7,0.0,18.8,1,0,2,3
757448,0,5,3,248.5,1.9,14.8,0.0,96.8,0.0,14.9,1,1,2,3
789218,0,12,18,195.6,1.8,4.4,0.0,58.2,0.0,-0.8,1,3,1,3


# PCA 적용 

In [None]:
# 레이블 데이터로 설정
X = X_label_encoded.drop(columns=['fog_train.class'])
y = X_label_encoded['fog_train.class']-1

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# PCA 모델 초기화 (설명 분산 비율의 95%를 유지하도록)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca = pd.DataFrame(X_pca, index=X_label_encoded.index)

# PCA 결과
print(f"Original number of features: {X_label_encoded.shape[1]}")
print(f"Reduced number of features: {X_pca.shape[1]}")

# 주성분이 설명하는 분산 비율
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by each component: {explained_variance}")
print(f"Cumulative explained variance: {np.cumsum(explained_variance)}")

In [None]:
import matplotlib.pyplot as plt

# 주성분이 설명하는 분산 비율 시각화
plt.figure(figsize=(10, 5))
plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1, len(np.cumsum(explained_variance)) + 1), np.cumsum(explained_variance), where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [17]:
# 샘플링된 데이터 저장
X_ohe_encoded.to_csv('Data//X_ohe_encoded.csv', index=False)
X_label_encoded.to_csv('Data//X_label_encoded.csv', index=False)
X_pca.to_csv('Data//X_pca.csv', index=False)