In [1]:
import pandas as pd

# Read the train.csv file
train_df = pd.read_csv('train.csv')

# Display first 5 rows
train_df.head()

Unnamed: 0,일자,기온,강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),현지기압(hPa),일조(hr),일사(MJ/m2),지면온도(°C),오 존,이산화질소,일산화탄소,아황산가스,PM10,PM2.5
0,2020-01-01-00,-6.5,0.0,0.0,0.0,38,1.4,1022.6,,,-2.8,,,,,,
1,2020-01-01-01,-5.9,,1.7,50.0,40,1.6,1021.8,,,-2.4,0.002,0.036,0.6,0.003,30.0,16.0
2,2020-01-01-02,-5.7,,0.1,0.0,42,1.7,1021.6,,,-2.4,0.001,0.036,0.6,0.003,34.0,18.0
3,2020-01-01-03,-5.6,0.0,0.0,0.0,46,1.8,1021.5,,,-2.7,0.002,0.038,0.8,0.003,35.0,22.0
4,2020-01-01-04,-5.4,,0.0,0.0,50,2.0,1021.6,,,-2.5,0.001,0.034,0.6,0.003,34.0,19.0


In [2]:
# Count missing values for each column
missing_values = train_df.isnull().sum()

# Display the count of missing values
print("Missing values in each column:")
print(missing_values)

# Calculate percentage of missing values
total_rows = len(train_df)
missing_percentage = (missing_values / total_rows) * 100

# Display percentage of missing values
print("\nPercentage of missing values in each column:")
print(missing_percentage.round(2))

Missing values in each column:
일자               0
기온               1
강수량(mm)      31028
풍속(m/s)         39
풍향(16방위)        39
습도(%)            0
증기압(hPa)         2
현지기압(hPa)       13
일조(hr)       15899
일사(MJ/m2)    15899
지면온도(°C)        15
오 존            423
이산화질소         1603
일산화탄소          579
아황산가스          460
PM10           564
PM2.5          506
dtype: int64

Percentage of missing values in each column:
일자            0.00
기온            0.00
강수량(mm)      88.49
풍속(m/s)       0.11
풍향(16방위)      0.11
습도(%)         0.00
증기압(hPa)      0.01
현지기압(hPa)     0.04
일조(hr)       45.34
일사(MJ/m2)    45.34
지면온도(°C)      0.04
오 존           1.21
이산화질소         4.57
일산화탄소         1.65
아황산가스         1.31
PM10          1.61
PM2.5         1.44
dtype: float64


In [4]:
# Drop the 강수량(mm) column
train_df = train_df.drop('강수량(mm)', axis=1)

# Convert '일자' to datetime for better handling of time series data
train_df['일자'] = pd.to_datetime(train_df['일자'])

# Set '일자' as the index to make interpolation more accurate for time series data
train_df_indexed = train_df.set_index('일자')

# Interpolate missing values in '일조(hr)' and '일사(MJ/m2)'
train_df_indexed['일조(hr)'] = train_df_indexed['일조(hr)'].interpolate(method='time')
train_df_indexed['일사(MJ/m2)'] = train_df_indexed['일사(MJ/m2)'].interpolate(method='time')

# Reset index to get back the '일자' column
train_df = train_df_indexed.reset_index()

# Check the missing values after imputation
missing_after = train_df.isnull().sum()
print("Missing values after processing:")
print(missing_after)

# Display the first few rows to verify changes
train_df.head()

KeyError: "['강수량(mm)'] not found in axis"

In [5]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

# Convert PM2.5 to numeric
train_df['PM2.5'] = pd.to_numeric(train_df['PM2.5'], errors='coerce')

# Fill remaining NaN values
for column in train_df.columns:
    if train_df[column].dtype != 'datetime64[ns]':
        train_df[column] = train_df[column].fillna(train_df[column].mean())

# Select features for prediction
features = ['기온', '풍속(m/s)', '풍향(16방위)', '습도(%)', '증기압(hPa)', '현지기압(hPa)', 
           '일조(hr)', '일사(MJ/m2)', '지면온도(°C)', 'PM10']
target = 'PM2.5'

# Prepare data for LSTM (create sequences)
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length, :-1])
        y.append(data[i+seq_length, -1])
    return np.array(X), np.array(y)

# Select data and normalize
data = train_df[features + [target]].values
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)

# Create sequences
seq_length = 24  # 24 hours (assuming hourly data)
X, y = create_sequences(data_scaled, seq_length)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile model
model.compile(optimizer='adam', loss='mse')

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train model
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate model
loss = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")

# Make predictions
y_pred = model.predict(X_test)

# Plot training history
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

# Plot predictions vs actual
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.title('Prediction vs Actual')
plt.xlabel('Actual PM2.5')
plt.ylabel('Predicted PM2.5')
plt.tight_layout()
plt.show()

TypeError: unsupported operand type(s) for +: 'int' and 'str'