In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [3]:
# 데이터 로드 및 전처리
air_quality_df = pd.read_csv('서울교통공사_지하역사_공기질_측정_정보_20221231.csv')
congestion_df = pd.read_csv('서울교통공사_지하철혼잡도정보_20230331.csv')

# 데이터 병합
merged_df = pd.merge(air_quality_df, congestion_df, on=['역명', '날짜', '시간'])

KeyError: '역명'

In [None]:
# 가중치 설정
def assign_weight(row):
    weight = 1
    if row['미세먼지'] > 100:
        weight *= 1.5  # 기준치 초과
    elif row['미세먼지'] > 85:
        weight *= 1.2  # 위험
    if row['혼잡도'] > 80:
        weight *= 1.3  # 혼잡도가 높은 시간
    return weight

merged_df['가중치'] = merged_df.apply(assign_weight, axis=1)

# 독립 변수와 종속 변수 설정
X = merged_df[['역명', '시간', '혼잡도']]
y = merged_df['미세먼지']

# 데이터 분할
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(X, y, merged_df['가중치'], test_size=0.2, random_state=42)

# 회귀 모델 구축
model = LinearRegression()
model.fit(X_train, y_train, sample_weight=weights_train)

# 예측 및 평가
y_pred = model.predict(X_test)
print('R2 Score:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

# 예측 결과 시각화
plt.scatter(y_test, y_pred)
plt.xlabel('Actual PM10')
plt.ylabel('Predicted PM10')
plt.title('Actual vs Predicted PM10')
plt.show()