In [1]:
# import lib
import pandas as pd
import numpy as np

# read csv and delete RainTomorrow rows with empty
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [2]:
import pandas as pd

def missing_value_percent(train):
    missing_values_count = train.isnull().sum()
    total_values = train.shape[0]
    missing_values_percentage = (missing_values_count / total_values) * 100
    print(missing_values_percentage)

missing_value_percent(train)

id                0.000000
Date              0.000000
Location          0.000000
MinTemp           1.014516
MaxTemp           0.872110
Rainfall          2.203846
Evaporation      43.002495
Sunshine         47.851152
WindGustDir       7.054468
WindGustSpeed     7.011255
WindDir9am        7.280352
WindDir3pm        2.898195
WindSpeed9am      1.203080
WindSpeed3pm      2.107600
Humidity9am       1.830646
Humidity3pm       3.132918
Pressure9am      10.340594
Pressure3pm      10.321934
Cloud9am         38.320795
Cloud3pm         40.732848
Temp9am           1.222722
Temp3pm           2.516156
RainToday         2.203846
RainTomorrow      2.250005
dtype: float64


In [3]:
# columns_miss = [
#     "WindGustSpeed",
#     "WindSpeed9am",
#     "WindSpeed3pm",
#     "Humidity9am",
#     "Humidity3pm",
#     "Pressure9am",
#     "Pressure3pm",
#     "Temp9am",
#     "Temp3pm",
#     "MinTemp",
#     "MaxTemp",
#     "Rainfall"
# ]

# for column in columns_miss:
#     train[column] = train[column].interpolate(method="linear")

In [4]:
columns_miss_object = ["RainToday", "RainTomorrow"]

for column in columns_miss_object:
    train[column] = train[column].ffill().bfill()

missing_value_percent(train)

id                0.000000
Date              0.000000
Location          0.000000
MinTemp           1.014516
MaxTemp           0.872110
Rainfall          2.203846
Evaporation      43.002495
Sunshine         47.851152
WindGustDir       7.054468
WindGustSpeed     7.011255
WindDir9am        7.280352
WindDir3pm        2.898195
WindSpeed9am      1.203080
WindSpeed3pm      2.107600
Humidity9am       1.830646
Humidity3pm       3.132918
Pressure9am      10.340594
Pressure3pm      10.321934
Cloud9am         38.320795
Cloud3pm         40.732848
Temp9am           1.222722
Temp3pm           2.516156
RainToday         0.000000
RainTomorrow      0.000000
dtype: float64


In [5]:
# union and cherk object and null data
xy_all = pd.concat([train, test], axis=0)
cat_features = [
    "Date",
    "Location",
    
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",

    "RainToday",
    "RainTomorrow",

    "Evaporation",
    "Sunshine",
    "Cloud9am",
    "Cloud3pm",
]

In [6]:
# processing value
# object(string) type to in32, NAN and missing value to -1
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
).set_output(transform="pandas")

xy_all[cat_features] = ordinal_encoder.fit_transform(xy_all[cat_features])
print("xy_all size:", xy_all.shape)

xy_all size: (145460, 24)


In [7]:
# Split
# RainTomorrow with -1 is x_test
# RainTomorrow without -1 is xy_train

xy_train = xy_all[xy_all["RainTomorrow"] != -1]
x_train = xy_train.drop(columns=["RainTomorrow"])
y_train = xy_train["RainTomorrow"]
x_test = xy_all[xy_all["RainTomorrow"] == -1].drop(columns="RainTomorrow")

In [8]:
param = {
    "subsample": 0.9631662121183732,
    "reg_lambda": 0.06993883748486984,
    "reg_alpha": 0.42445602287772566,
    "n_estimators": 495,
    "max_depth": 8,
    "learning_rate": 0.03984212824100648,
    "colsample_bytree": 0.6368624805550515,
}

In [9]:
# set model
import xgboost as xgb

model = xgb.XGBRegressor(**param)

# train
model.fit(x_train, y_train)

# pred
y_xgb_pred = model.predict(x_test)

In [10]:
y_pred_str = np.where(y_xgb_pred > 0.5, "Yes", "No")
# save .csv
pd.DataFrame({"id": x_test["id"], "RainTomorrow": y_pred_str}).to_csv(
    "./output/XGB.csv", index=False
)