In [1]:
import xarray as xr
import pathlib
import sys
import datetime as dt
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Project root: the parent of the (resolved, absolute) current working
# directory. NOTE(review): presumably the notebook lives one level below the
# project root -- confirm.
ROOT_PATH = pathlib.Path(".").resolve().parent

# Put the project root on the import path so the local `modules` package
# imported right after this can be found.
sys.path += [str(ROOT_PATH)]

from modules.date_range import date_range

In [2]:
# Open one NetCDF file per day (<ROOT_PATH>/data/YYYY/MM/YYYY_MMDD.nc), keep only
# the variables used in this notebook and drop every time step that contains a
# missing value.
# NOTE(review): whether `date_range` includes `end` is not visible here -- confirm.
start, end = dt.date(2022, 1, 1), dt.date(2023, 1, 1)
datasets = [
    xr.open_dataset(day.strftime(f"{ROOT_PATH}/data/%Y/%m/%Y_%m%d.nc"))[
        ['psea', 'sp', 'u', 'v', 'temp', 'rh', 'r1h', 'dswrf', 'ncld']
    ].dropna(dim="time")
    for day in date_range(start, end)
]
# The period bounds are not needed any more.
del start, end

In [3]:
# Join the per-day datasets into a single dataset along the "time" dimension.
combined_dataset = xr.concat(objs=datasets, dim="time")

# Drop the reference to the list of per-day datasets.
del datasets

In [4]:
# Split the combined dataset into the target variable (`ncld`) and the
# feature variables (everything else that was loaded).
y = combined_dataset['ncld']
X = combined_dataset[
    ['psea', 'sp', 'u', 'v', 'temp', 'rh', 'r1h', 'dswrf']
]

In [5]:
# Flatten the feature dataset into a table: name the index levels
# time / lat / lon and turn them into ordinary columns.
# NOTE(review): assumes the dataset index levels come out in (time, lat, lon)
# order -- confirm.
X_df = (
    X.to_dataframe()
    .rename_axis(['time', 'lat', 'lon'])
    .reset_index()
)
# The xarray object is no longer referenced after conversion.
del X

In [6]:
# Flatten the target variable into a table: name the index levels
# time / lat / lon and turn them into ordinary columns.
# NOTE(review): assumes the index levels come out in (time, lat, lon)
# order -- confirm.
y_df = (
    y.to_dataframe()
    .rename_axis(['time', 'lat', 'lon'])
    .reset_index()
)
# The xarray object is no longer referenced after conversion.
del y

In [7]:
# Load the station master table and keep one row per station number
# ("観測所番号") among the rows whose "種類" column equals "官".
amemaster = pd.read_csv(ROOT_PATH/"ame_master_20230323.csv")
amemaster_kan = amemaster[(amemaster["種類"] == "官")].drop_duplicates(subset="観測所番号")

# The rounded grid coordinates do not depend on the station, so compute them
# once instead of once per station.
X_lon_rounded, X_lat_rounded = X_df["lon"].round(2), X_df["lat"].round(2)
y_lon_rounded, y_lat_rounded = y_df["lon"].round(2), y_df["lat"].round(2)

# Collect the per-station slices in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
X_parts = []
y_parts = []

for _, station in amemaster_kan.iterrows():
    # Station position in decimal degrees (degrees + minutes / 60).
    lon = station["経度(度)"] + station["経度(分)"] / 60
    lat = station["緯度(度)"] + station["緯度(分)"] / 60

    # Snap the station down to a multiple of the grid step (0.0625 in
    # longitude, 0.04999977 in latitude), compared at 2-decimal precision.
    grid_lon = round(int(lon / 0.0625) * 0.0625, 2)
    grid_lat = round(int(lat / 0.04999977) * 0.04999977, 2)

    X_parts.append(X_df[(X_lon_rounded == grid_lon) & (X_lat_rounded == grid_lat)])
    y_parts.append(y_df[(y_lon_rounded == grid_lon) & (y_lat_rounded == grid_lat)])

# The result index is a clean 0..N-1 range. With no stations at all the result
# is an empty DataFrame rather than an error from pd.concat([]).
X_df_ = pd.concat(X_parts, ignore_index=True) if X_parts else pd.DataFrame()
y_df_ = pd.concat(y_parts, ignore_index=True) if y_parts else pd.DataFrame()

# Release the helpers and the full-grid tables.
del X_parts, y_parts
del X_lon_rounded, X_lat_rounded, y_lon_rounded, y_lat_rounded
del X_df,y_df

: 

: 

In [48]:
# Display the station-filtered feature table `X_df_`.
X_df_

Unnamed: 0,time,lat,lon,psea,sp,u,v,temp,rh,r1h,dswrf
0,2022-01-01 00:00:00,45.400002,141.6250,100777.064114,100564.219806,4.483181,-1.828746,266.750890,55.862386,200.000000,665.0000
1,2022-01-01 01:00:00,45.400002,141.6250,100679.816409,100466.054670,5.981652,2.648318,267.025307,91.713302,0.740055,121.1350
2,2022-01-01 02:00:00,45.400002,141.6250,100576.146687,100360.550085,1.113150,-14.477065,270.252969,60.800459,1.272165,102.2340
3,2022-01-01 03:00:00,45.400002,141.6250,100761.009068,100545.871182,-0.036697,-13.749236,269.241548,62.678900,200.000000,665.0000
4,2022-01-01 04:00:00,45.400002,141.6250,100885.320993,100665.137235,-2.048930,-12.079511,268.828616,64.220184,-0.000007,334.1710
...,...,...,...,...,...,...,...,...,...,...,...
19,2022-01-01 19:00:00,24.350000,124.1875,102166.972345,101967.889505,3.107034,1.082569,293.468613,78.325688,-0.000007,0.0005
20,2022-01-01 20:00:00,24.350000,124.1875,102115.137484,101911.008772,3.449541,0.831804,293.544404,76.435780,-0.000007,0.0005
21,2022-01-01 21:00:00,24.350000,124.1875,102097.706292,101900.917029,4.140673,-1.076453,293.784845,75.683486,200.000000,665.0000
22,2022-01-01 22:00:00,24.350000,124.1875,102138.990694,101943.118863,3.418960,-3.443425,293.986084,78.692660,0.006110,0.0005


In [8]:
# Split the station-filtered data into a training set and a test set
# (30% of the rows go to the test set).
# `X_df` / `y_df` were deleted once the station rows had been extracted; the
# filtered tables are `X_df_` / `y_df_`.
X_train, X_test, y_train, y_test = train_test_split(X_df_, y_df_, test_size=0.3)

# The unsplit tables are no longer needed.
del X_df_,y_df_

In [None]:
# Convert the training data into a LightGBM dataset.
# - The label must be one-dimensional: `y_train` also carries the time / lat /
#   lon columns, so only the target column `ncld` is passed.
# - LightGBM accepts only int / float / bool columns, so the datetime `time`
#   column is dropped from the features.
# NOTE(review): `lat` / `lon` are still passed as features -- confirm this is
# intended.
lgb_train = lgb.Dataset(X_train.drop(columns=["time"]), label=y_train["ncld"])

# The pandas training tables are no longer needed once wrapped.
del X_train,y_train

In [19]:
# Training configuration: regression objective, mean squared error as metric.
params = dict(objective='regression', metric='mse')

# Train the model on the LightGBM training dataset.
model = lgb.train(params=params, train_set=lgb_train)

: 

: 

In [None]:
# Serialize the trained model into the project root.
# The file has to be opened in binary write mode ("wb" is an argument of
# open(), not of pickle.dump()), and the `with` block closes the handle.
filename = "model_using_msm.pkl"
with open(ROOT_PATH/filename, "wb") as f:
    pickle.dump(model, f)

In [None]:
import xarray as xr
import pathlib
import sys
import datetime as dt
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Locate the project root.
try:
    # Script execution: the root is two levels above this file. `resolve()`
    # makes the path absolute first, so a relative `__file__` cannot make
    # `parents[1]` raise IndexError.
    ROOT_PATH = pathlib.Path(__file__).resolve().parents[1]
except NameError:
    # `__file__` is not defined inside a notebook kernel; fall back to the
    # parent of the current working directory.
    ROOT_PATH = pathlib.Path().resolve().parent

# Put the project root on the import path so the local `modules` package
# imported right after this can be found.
sys.path.append(str(ROOT_PATH))

from modules.date_range import date_range

print("start")
start, end = dt.date(2022, 1, 1), dt.date(2023, 1, 1)

# Station master table: keep one row per station number ("観測所番号") among
# the rows whose "種類" column equals "官".
amemaster = pd.read_csv(ROOT_PATH/"ame_master_20230323.csv")
amemaster_kan = amemaster.loc[amemaster["種類"] == "官"].drop_duplicates(subset="観測所番号")

# Open one NetCDF file per day (<ROOT_PATH>/data/YYYY/MM/YYYY_MMDD.nc), keep only
# the variables used below and drop every time step with a missing value.
# NOTE(review): whether `date_range` includes `end` is not visible here -- confirm.
datasets = [
    xr.open_dataset(day.strftime(f"{ROOT_PATH}/data/%Y/%m/%Y_%m%d.nc"))[
        ['psea', 'sp', 'u', 'v', 'temp', 'rh', 'r1h', 'dswrf', 'ncld']
    ].dropna(dim="time")
    for day in date_range(start, end)
]
del start, end
print("Complete import datasets")

# Join the per-day datasets into a single dataset along the "time" dimension.
combined_dataset = xr.concat(objs=datasets, dim="time")
del datasets
print("complete combine datasets")


# Split the combined dataset into the target variable (`ncld`) and the
# feature variables, then drop the reference to the combined dataset.
y = combined_dataset['ncld']
X = combined_dataset[
    ['psea', 'sp', 'u', 'v', 'temp', 'rh', 'r1h', 'dswrf']
]
del combined_dataset

# Flatten the features into a table: name the index levels time / lat / lon
# and turn them into ordinary columns.
# NOTE(review): assumes the index levels come out in (time, lat, lon)
# order -- confirm.
X_df = (
    X.to_dataframe()
    .rename_axis(['time', 'lat', 'lon'])
    .reset_index()
)
del X
print("Complete to dataframe X")

# Same flattening for the target variable.
y_df = (
    y.to_dataframe()
    .rename_axis(['time', 'lat', 'lon'])
    .reset_index()
)
del y
print("complete to dataframe y")

# The rounded grid coordinates do not depend on the station, so compute them
# once instead of once per station.
X_lon_rounded, X_lat_rounded = X_df["lon"].round(2), X_df["lat"].round(2)
y_lon_rounded, y_lat_rounded = y_df["lon"].round(2), y_df["lat"].round(2)

# Collect the per-station slices in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
X_parts = []
y_parts = []

for _, station in amemaster_kan.iterrows():
    # Station position in decimal degrees (degrees + minutes / 60).
    lon = station["経度(度)"] + station["経度(分)"] / 60
    lat = station["緯度(度)"] + station["緯度(分)"] / 60

    # Snap the station down to a multiple of the grid step (0.0625 in
    # longitude, 0.04999977 in latitude), compared at 2-decimal precision.
    grid_lon = round(int(lon / 0.0625) * 0.0625, 2)
    grid_lat = round(int(lat / 0.04999977) * 0.04999977, 2)

    X_parts.append(X_df[(X_lon_rounded == grid_lon) & (X_lat_rounded == grid_lat)])
    y_parts.append(y_df[(y_lon_rounded == grid_lon) & (y_lat_rounded == grid_lat)])

# The result index is a clean 0..N-1 range. With no stations at all the result
# is an empty DataFrame rather than an error from pd.concat([]).
X_df_ = pd.concat(X_parts, ignore_index=True) if X_parts else pd.DataFrame()
y_df_ = pd.concat(y_parts, ignore_index=True) if y_parts else pd.DataFrame()

# Release the helpers and the full-grid tables.
del X_parts, y_parts
del X_lon_rounded, X_lat_rounded, y_lon_rounded, y_lat_rounded
del X_df,y_df
print("Complete extract df")

# Split the station-filtered data into a training set and a test set
# (30% of the rows go to the test set).
X_train, X_test, y_train, y_test = train_test_split(X_df_, y_df_, test_size=0.3)

del X_df_,y_df_
print("Complete split data")

# Convert the training data into a LightGBM dataset.
# - The label must be one-dimensional: `y_train` also carries the time / lat /
#   lon columns, so only the target column `ncld` is passed.
# - LightGBM accepts only int / float / bool columns, so the datetime `time`
#   column is dropped from the features.
# NOTE(review): `lat` / `lon` are still passed as features -- confirm this is
# intended.
lgb_train = lgb.Dataset(X_train.drop(columns=["time"]), label=y_train["ncld"])

del X_train,y_train


# Training configuration: regression objective, mean squared error as metric.
params = dict(objective='regression', metric='mse')

# Train the model on the LightGBM training dataset.
model = lgb.train(params=params, train_set=lgb_train)

# Serialize the trained model into the project root.
filename = "model_using_msm1.pkl"
with (ROOT_PATH/filename).open("wb") as f:
    pickle.dump(model, f)