### 天气预测
LightGBM回归模型

In [2]:
# import based lib
import pandas as pd
import numpy as np

In [3]:
# read csv and delete RainTomorrow rows with empty
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

print("train size:",train.shape)
print("test size:", test.shape)

train size: (101822, 24)
test size: (43638, 23)


In [4]:
import pandas as pd

def missing_value_percent(train):
    missing_values_count = train.isnull().sum()
    total_values = train.shape[0]
    missing_values_percentage = (missing_values_count / total_values) * 100
    print(missing_values_percentage)

missing_value_percent(train)

id                0.000000
Date              0.000000
Location          0.000000
MinTemp           1.014516
MaxTemp           0.872110
Rainfall          2.203846
Evaporation      43.002495
Sunshine         47.851152
WindGustDir       7.054468
WindGustSpeed     7.011255
WindDir9am        7.280352
WindDir3pm        2.898195
WindSpeed9am      1.203080
WindSpeed3pm      2.107600
Humidity9am       1.830646
Humidity3pm       3.132918
Pressure9am      10.340594
Pressure3pm      10.321934
Cloud9am         38.320795
Cloud3pm         40.732848
Temp9am           1.222722
Temp3pm           2.516156
RainToday         2.203846
RainTomorrow      2.250005
dtype: float64


In [5]:
# missing number value 
columns_miss = [
    "WindGustSpeed",
    "WindSpeed9am",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure9am",
    "Pressure3pm",
    "Temp9am",
    "Temp3pm",
    "MinTemp",
    "MaxTemp",
    "Rainfall"
]

for column in columns_miss:
    train[column] = train[column].interpolate(method="linear")

missing_value_percent(train)

id                0.000000
Date              0.000000
Location          0.000000
MinTemp           0.000000
MaxTemp           0.000000
Rainfall          0.000000
Evaporation      43.002495
Sunshine         47.851152
WindGustDir       7.054468
WindGustSpeed     0.000000
WindDir9am        7.280352
WindDir3pm        2.898195
WindSpeed9am      0.000000
WindSpeed3pm      0.000000
Humidity9am       0.000000
Humidity3pm       0.000000
Pressure9am       0.000000
Pressure3pm       0.000000
Cloud9am         38.320795
Cloud3pm         40.732848
Temp9am           0.000000
Temp3pm           0.000000
RainToday         2.203846
RainTomorrow      2.250005
dtype: float64


In [6]:
columns_miss_object = ["RainToday", "RainTomorrow"]

for column in columns_miss_object:
    train[column] = train[column].ffill().bfill()

missing_value_percent(train)

id                0.000000
Date              0.000000
Location          0.000000
MinTemp           0.000000
MaxTemp           0.000000
Rainfall          0.000000
Evaporation      43.002495
Sunshine         47.851152
WindGustDir       7.054468
WindGustSpeed     0.000000
WindDir9am        7.280352
WindDir3pm        2.898195
WindSpeed9am      0.000000
WindSpeed3pm      0.000000
Humidity9am       0.000000
Humidity3pm       0.000000
Pressure9am       0.000000
Pressure3pm       0.000000
Cloud9am         38.320795
Cloud3pm         40.732848
Temp9am           0.000000
Temp3pm           0.000000
RainToday         0.000000
RainTomorrow      0.000000
dtype: float64


In [7]:
# union dataset, cherk object and null data
xy_all = pd.concat([train, test], axis=0)
cat_features = [
    "Date",
    "Location",
    
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",

    "RainToday",
    "RainTomorrow",

    "Evaporation",
    "Sunshine",
    "Cloud9am",
    "Cloud3pm",
]

print("features:",cat_features, "\n length:",len(cat_features))
print("xy_all size:", xy_all.shape)

features: ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'] 
 length: 11
xy_all size: (145460, 24)


In [8]:
# processing value
# object(string) type to in32, NAN and missing value to -1
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(
    dtype=np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1
).set_output(transform="pandas")

xy_all[cat_features] = ordinal_encoder.fit_transform(xy_all[cat_features])
print("xy_all size:", xy_all.shape)

xy_all size: (145460, 24)


In [9]:
# No = 0 Yes = 1 y_train = -1
xy_all

Unnamed: 0,id,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,1,164,2,7.4,25.1,0.0,-1,-1,14,44.0,...,44.0,25.0,1010.6,1007.8,-1,-1,17.2,24.3,0,0
1,2,175,2,12.9,25.7,0.0,-1,-1,15,46.0,...,38.0,30.0,1007.6,1008.7,-1,2,21.0,23.2,0,0
2,3,178,2,9.2,28.0,0.0,-1,-1,4,24.0,...,45.0,16.0,1017.6,1012.8,-1,-1,18.1,26.5,0,0
3,4,179,2,17.5,32.3,1.0,-1,-1,13,41.0,...,82.0,33.0,1010.8,1006.0,7,8,17.8,29.7,0,0
4,5,180,2,14.6,29.7,0.2,-1,-1,14,56.0,...,55.0,23.0,1009.2,1005.4,-1,-1,20.6,28.9,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43633,43634,1079,21,1.0,17.8,0.0,22,102,11,35.0,...,74.0,27.0,1028.4,1026.3,0,1,9.9,17.2,0,-1
43634,43635,1655,3,18.4,38.7,0.0,98,113,3,37.0,...,20.0,14.0,1016.9,1013.2,0,4,30.4,38.4,0,-1
43635,43636,1777,8,13.1,25.6,0.0,64,109,9,37.0,...,59.0,46.0,1020.3,1016.5,2,2,19.4,24.2,0,-1
43636,43637,2385,29,10.1,22.4,,-1,19,15,37.0,...,93.0,66.0,1019.6,1017.7,8,3,15.2,20.6,-1,-1


In [10]:
# cherk type
print(xy_all.dtypes)
# xy_all

id                 int64
Date               int64
Location           int64
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation        int64
Sunshine           int64
WindGustDir        int64
WindGustSpeed    float64
WindDir9am         int64
WindDir3pm         int64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am           int64
Cloud3pm           int64
Temp9am          float64
Temp3pm          float64
RainToday          int64
RainTomorrow       int64
dtype: object


In [11]:
# Split
# RainTomorrow with -1 is x_test
# RainTomorrow without -1 is xy_train

xy_train = xy_all[xy_all["RainTomorrow"] != -1]
x_train = xy_train.drop(columns=["RainTomorrow"])
y_train = xy_train["RainTomorrow"]
x_test = xy_all[xy_all["RainTomorrow"] == -1].drop(columns="RainTomorrow")
print("xy_all size:", xy_all.shape)
print("x_train size:", x_train.shape)
print("y_train size:", y_train.shape)
print("x_test size:", x_test.shape)

xy_all size: (145460, 24)
x_train size: (101822, 23)
y_train size: (101822,)
x_test size: (43638, 23)


In [12]:
# cherk data is right

# x_train
# x_test
# y_train

In [13]:
# import lightgbm model
import lightgbm as lgb
model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    num_leaves=74,
    max_depth=-1,
    learning_rate=0.09794344673376235,
    n_estimators=380,
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=67,
    subsample=0.8447134830762376,
    subsample_freq=0,
    colsample_bytree=0.9337489793588183,
    reg_alpha=0.8447134830762376,
    reg_lambda=0.8447134830762376,
    random_state=42,
    n_jobs=None,
    importance_type="split",
)

In [14]:
# train
model.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3787
[LightGBM] [Info] Number of data points in the train set: 101822, number of used features: 23
[LightGBM] [Info] Start training from score 0.225973


In [15]:
# predict
y_pred = model.predict(x_test)
y_pred_str = np.where(y_pred > 0.5, "Yes", "No")

In [16]:
# save csv
pd.DataFrame({
    "id": x_test["id"],
    "RainTomorrow": y_pred_str
}).to_csv("./output/LGBM.csv", index=False)