# k- means 모델링

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import warnings

warnings.filterwarnings('ignore')

from datetime import datetime

In [2]:
X_=pd.read_csv('./pro_weather.csv')

X_.shape

(69213, 7)

In [3]:
X_['label']

Unnamed: 0,area,row_tmp,top_tmp,av_wsd,ap_tmp,sin,dif
0,속초,15.8,24.8,1.9,22.692741,0.623490,-9.0
1,속초,11.8,20.3,2.8,16.811214,0.609902,-8.5
2,속초,13.7,26.2,2.1,22.103602,0.596132,-12.5
3,속초,13.0,22.4,1.4,17.760007,0.582185,-9.4
4,속초,13.0,21.1,2.5,16.812161,0.568065,-8.1
...,...,...,...,...,...,...,...
69208,남해,15.7,21.9,1.9,20.252709,0.688563,-6.2
69209,남해,14.6,24.1,1.1,20.725819,0.675944,-9.5
69210,남해,14.9,26.7,1.3,22.894020,0.663123,-11.8
69211,남해,14.5,20.8,1.0,19.565000,0.650104,-6.3


In [4]:
X=X_.drop(['area', 'top_tmp', 'row_tmp', 'av_wsd'], axis=1)

In [5]:
X.columns

Index(['ap_tmp', 'sin', 'dif'], dtype='object')

In [6]:
X

Unnamed: 0,ap_tmp,sin,dif
0,22.692741,0.623490,-9.0
1,16.811214,0.609902,-8.5
2,22.103602,0.596132,-12.5
3,17.760007,0.582185,-9.4
4,16.812161,0.568065,-8.1
...,...,...,...
69208,20.252709,0.688563,-6.2
69209,20.725819,0.675944,-9.5
69210,22.894020,0.663123,-11.8
69211,19.565000,0.650104,-6.3


In [8]:
# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = X_scaled

# 데이터
data = X  # 데이터를 로드하거나 생성

# 가중치 설정
weights = [0.8 , 0.1, 0.1]  # 각 열에 대한 가중치 설정

# KMeans 모델 초기화
km_model = KMeans(n_clusters=15, random_state=0)

# 데이터셋에 가중치 적용
weighted_data = data * weights

# 클러스터링 모델 훈련
kmeans_model= km_model.fit(weighted_data)

# 클러스터링 결과 확인
labels = km_model.labels_
centroids = km_model.cluster_centers_

# 결과 출력 또는 저장 등 추가 작업 수행
print("Labels:", labels)
print("Centroids:", centroids)

Labels: [ 0  2  0 ...  0 10 10]
Centroids: [[ 6.12848793e-01  7.07297048e-02 -3.00442963e-02]
 [-8.40972506e-01  3.84998972e-02 -1.39747718e-02]
 [ 1.50929801e-02  1.28370885e-01 -6.23584503e-02]
 [-1.84530260e+00  2.03172748e-02  1.05503565e-02]
 [ 9.32736688e-01 -5.30154577e-02  5.62906907e-02]
 [ 1.16780305e+00 -5.44229113e-02  4.58093829e-02]
 [-2.87225314e-01  1.30481997e-01 -3.42777344e-02]
 [ 2.97107687e-02 -1.20705908e-01 -6.33782816e-03]
 [ 4.23064957e-01 -1.31361035e-01  2.36186477e-02]
 [-5.78294097e-01  3.67032772e-02 -2.99157745e-02]
 [ 3.17372160e-01  1.04152614e-01 -1.59518898e-02]
 [ 7.11572902e-01 -1.03178395e-01  6.27314049e-02]
 [-1.10507323e+00  3.45176480e-02 -5.06624106e-03]
 [-1.41158289e+00  2.81850690e-02  1.80461983e-03]
 [-2.86481273e-01 -1.06775829e-01 -5.07297404e-02]]


In [9]:
import pickle

# K-means 모델 저장
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(km_model, f)

# 스케일러 저장
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


# 새로운 데이터 예측

In [10]:
def getWeather(input_date):
    import requests
    import json
    from urllib.request import urlopen
    import pandas as pd
    from datetime import date, timedelta, datetime
    date = str(input_date)
    domain = "https://apis.data.go.kr/1360000/VilageFcstInfoService_2.0/getVilageFcst?"
    servicekey = "serviceKey=YlwyrgH2GoTFdpRhqVV1GWd7fsuSnU%2BUuUXMzkrGFZ8O9pfGNUXLgIxVuQ4LiEavT5CsUmjBcDZXTjSO0m0GVA%3D%3D&"
    pageno = "pageNo=1&"
    numofrows = "numOfRows=1000&"
    datatype = "dataType=JSON&"
    base_date = "base_date="+date+"&"
    base_time = "base_time=0500&"
    nx = "nx=55&"
    ny = "ny=127&"
    url = domain + servicekey + pageno + numofrows + datatype + base_date + base_time + nx + ny
    res = requests.get(url)
    items = res.json().get('response').get('body').get('items').get('item')
    tmp = []
    avgt = ['0000', '0300', '0600', '0900', '1200', '1500', '1800', '2100']
    cate = ['TMP', 'WSD']
    for i in range(0, len(items), 12):
        td = dict()
        td['발효날짜'] = items[i].get('fcstDate')
        td['발효시각'] = items[i].get('fcstTime')
        for j in range(i, i+11):
            try:
                if items[j].get('category') == 'TMP':
                    td['기온'] = items[j].get('fcstValue')
                elif items[j].get('category') == 'WSD':
                    td['풍속'] = items[j].get('fcstValue')
            except:
                pass
        tmp.append(td)
    df = pd.DataFrame(tmp)
    df = df.astype({'발효날짜':'datetime64' ,'기온':'float', '풍속':'float'})
    df2 = pd.DataFrame([df.groupby('발효날짜')['기온'].min(), df.groupby('발효날짜')['기온'].max()]).T
    df2.columns = ['최저기온', '최고기온']
    df3 = df[(df['발효시각'] == '0000') | (df['발효시각'] == '0300') | (df['발효시각'] == '0600') | (df['발효시각'] == '0900') |
       (df['발효시각'] == '1200') | (df['발효시각'] == '1500') | (df['발효시각'] == '1800') | (df['발효시각'] == '2100')].groupby('발효날짜')['기온', '풍속'].mean()
    df3.columns = ['평균기온', '평균풍속']
    df4 = pd.concat([df2, df3], axis=1)
    df2 = pd.DataFrame([df.groupby('발효날짜')['기온'].min(), df.groupby('발효날짜')['기온'].max()]).T
    df2.columns = ['최저기온', '최고기온']
    df3 = df[(df['발효시각'] == '0000') | (df['발효시각'] == '0300') | (df['발효시각'] == '0600') | (df['발효시각'] == '0900') |
   (df['발효시각'] == '1200') | (df['발효시각'] == '1500') | (df['발효시각'] == '1800') | (df['발효시각'] == '2100')].groupby('발효날짜')['기온', '풍속'].mean()
    df3.columns = ['평균기온', '평균풍속']
    df4 = pd.concat([df2, df3], axis=1).reset_index()
    
    # 
    X_new= df4[:3]
    X_new['ap_tmp'] = 13.12 + 0.6215 * X_new['평균기온'] - 11.37 * X_new['평균풍속']**0.16 + 0.3965 * X_new['평균기온'] * X_new['평균풍속']**0.16
    
    import pandas as pd
    from datetime import datetime
    import numpy as np
    
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 12, 31)
    
    # 날짜 범위 생성
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    # x 값 범위 설정
    x = np.linspace(0, len(date_range), len(date_range))
    # y 값 계산
    y = np.sin((2 * np.pi * x) / len(date_range))
    # 데이터프레임 생성
    df = pd.DataFrame({'x': date_range, 'y': y})
    
    X_new['sin'] = None
    
    # 각 행을 순회하며 조건과 비교하여 열 추가
    for index, row in X_new.iterrows():
        mask = df['x'] == row['발효날짜']
        if any(mask):  # 조건에 맞는 행이 하나 이상 있는지 확인
            X_new.loc[index, 'sin'] = df.loc[mask, 'y'].values[0]
        else:
            pass
        
    X_new['dif'] = X_new['최저기온'] - X_new['최고기온']
    X_new.drop(['발효날짜', '최저기온', '최고기온', '평균기온', '평균풍속'], axis=1,inplace=True)
    return X_new

In [11]:
# 오늘 데이터 넣어야함
X_now = getWeather(20230608)

In [12]:
X_now

Unnamed: 0,ap_tmp,sin,dif
0,21.753116,0.402527,-5.0
1,22.594508,0.386667,-8.0
2,22.992714,0.370691,-9.0


In [13]:
# K-means 모델 로드
with open('kmeans_model.pkl', 'rb') as f:
    km_model = pickle.load(f)

# 스케일러 로드
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [16]:
# 예측에 사용할 새로운 데이터 준비 (입력 데이터에 따라 적절하게 구성)
new_data = X_now

# 데이터 스케일링
scaled_data = scaler.transform(new_data)

# K-means 모델을 사용하여 예측 수행
cluster_labels = km_model.predict(scaled_data)

# 예측된 클러스터 레이블 출력
print(cluster_labels)

[11  0  0]
