In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [4]:
# CSV 파일 읽기
passenger_df = pd.read_csv('서울교통공사_승하차_인원수20221231.csv', encoding='euc-kr')

# "승차총승객수"와 "하차총승객수"를 더하여 "총 승객수" 컬럼 생성
passenger_df['총 승객수'] = passenger_df['승차총승객수'] + passenger_df['하차총승객수']

# "역명"별로 "총 승객수" 집계
total_passengers_by_station = passenger_df.groupby('역명')['총 승객수'].sum().reset_index()

# 결과 출력
total_passengers_by_station

Unnamed: 0,역명,총 승객수
0,4.19민주묘지,3988
1,가능,9038
2,가락시장,22177
3,가산디지털단지,37006
4,가양,24940
...,...,...
516,회기,34879
517,회룡,17056
518,회현(남대문시장),39353
519,효창공원앞,10751


In [6]:
import pandas as pd

# CSV 파일 읽기
passenger_df = pd.read_csv('서울교통공사_승하차_인원수20221231.csv', encoding='euc-kr')
air_quality_df = pd.read_csv('서울교통공사_지하역사_공기질_측정_정보_20221231_cleaned_v2', encoding='euc-kr')

# "승차총승객수"와 "하차총승객수"를 더하여 "총 승객수" 컬럼 생성
passenger_df['총 승객수'] = passenger_df['승차총승객수'] + passenger_df['하차총승객수']

# "역명"별로 "총 승객수" 집계
total_passengers_by_station = passenger_df.groupby('역명')['총 승객수'].sum().reset_index()

# 두 데이터프레임에서 "역명" 값 추출
passenger_stations = set(total_passengers_by_station['역명'])
air_quality_stations = set(air_quality_df['역명'])

# 일치하지 않는 "역명" 값 확인
non_matching_stations = passenger_stations.symmetric_difference(air_quality_stations)

# 결과 출력
print("일치하지 않는 역명 값:", non_matching_stations)

FileNotFoundError: [Errno 2] No such file or directory: '서울교통공사_지하역사_공기질_측정_정보_20221231_cleaned_v2'

In [3]:
# 데이터 로드 및 전처리
air_quality_df = pd.read_csv('서울교통공사_지하역사_공기질_측정_정보_20221231.csv', encoding='euc-kr')
congestion_df = pd.read_csv('서울교통공사_지하철혼잡도정보_20230331.csv', encoding='euc-kr')

# 데이터 병합
merged_df = pd.merge(air_quality_df, congestion_df, on=['역명', '날짜', '시간'])

KeyError: '역명'

In [None]:
# 가중치 설정
def assign_weight(row):
    weight = 1
    if row['미세먼지'] > 100:
        weight *= 1.5  # 기준치 초과
    elif row['미세먼지'] > 85:
        weight *= 1.2  # 위험
    if row['혼잡도'] > 80:
        weight *= 1.3  # 혼잡도가 높은 시간
    return weight

merged_df['가중치'] = merged_df.apply(assign_weight, axis=1)

# 독립 변수와 종속 변수 설정
X = merged_df[['역명', '시간', '혼잡도']]
y = merged_df['미세먼지']

# 데이터 분할
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(X, y, merged_df['가중치'], test_size=0.2, random_state=42)

# 회귀 모델 구축
model = LinearRegression()
model.fit(X_train, y_train, sample_weight=weights_train)

# 예측 및 평가
y_pred = model.predict(X_test)
print('R2 Score:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

# 예측 결과 시각화
plt.scatter(y_test, y_pred)
plt.xlabel('Actual PM10')
plt.ylabel('Predicted PM10')
plt.title('Actual vs Predicted PM10')
plt.show()