In [8]:
# import lib
from pathlib import Path

import numpy as np
import pandas as pd

# Load the training data and discard rows whose RainTomorrow label is missing;
# the test data is loaded unchanged.
train = pd.read_csv("./data/train.csv").dropna(subset=["RainTomorrow"])
test = pd.read_csv("./data/test.csv")

# Columns that get replaced by integer codes in the encoding step.
# Note that the list includes the target column, RainTomorrow.
cat_features = [
    "Date",
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
    "RainTomorrow",
    "Evaporation",
    "Sunshine",
    "Cloud9am",
    "Cloud3pm",
]

# Stack train on top of test (row-wise) so that object-typed and null values
# in both sets are handled together by a single encoder fit.
xy_all = pd.concat([train, test], axis=0)

# Encode the cat_features columns as int32 codes.
# Missing values (NaN) and categories unknown to the fitted encoder become -1.
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(
    encoded_missing_value=-1,
    unknown_value=-1,
    handle_unknown="use_encoded_value",
    dtype=np.int32,
)
# Ask for DataFrame output so the encoded columns can be assigned back by name.
# set_output returns the estimator itself, so no re-assignment is needed.
ordinal_encoder.set_output(transform="pandas")

encoded_columns = ordinal_encoder.fit_transform(xy_all[cat_features])
xy_all[cat_features] = encoded_columns
print("xy_all size:", xy_all.shape)

# Split the combined frame back apart using the encoded target:
#   RainTomorrow == -1 (label was missing) -> rows to predict (x_test)
#   RainTomorrow != -1                     -> labeled training rows (xy_train)
# NOTE(review): only RainTomorrow is dropped here, so every other column of
# xy_all (including the `id` column) remains in x_train / x_test as a model
# feature -- confirm that is intended.
has_label = xy_all["RainTomorrow"] != -1

xy_train = xy_all[has_label]
y_train = xy_train["RainTomorrow"]
x_train = xy_train.drop(columns=["RainTomorrow"])
x_test = xy_all[~has_label].drop(columns="RainTomorrow")

xy_all size: (143169, 24)


Unnamed: 0,id,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,1,164,2,7.4,25.1,0.0,-1,-1,14,44.0,...,22.0,44.0,25.0,1010.6,1007.8,-1,-1,17.2,24.3,0
1,2,175,2,12.9,25.7,0.0,-1,-1,15,46.0,...,26.0,38.0,30.0,1007.6,1008.7,-1,2,21.0,23.2,0
2,3,178,2,9.2,28.0,0.0,-1,-1,4,24.0,...,9.0,45.0,16.0,1017.6,1012.8,-1,-1,18.1,26.5,0
3,4,179,2,17.5,32.3,1.0,-1,-1,13,41.0,...,20.0,82.0,33.0,1010.8,1006.0,7,8,17.8,29.7,0
4,5,180,2,14.6,29.7,0.2,-1,-1,14,56.0,...,24.0,55.0,23.0,1009.2,1005.4,-1,-1,20.6,28.9,0
5,6,181,2,14.3,25.0,0.0,-1,-1,13,50.0,...,24.0,49.0,19.0,1009.6,1008.2,1,-1,18.1,24.6,0
6,7,182,2,7.7,26.7,0.0,-1,-1,13,35.0,...,17.0,48.0,19.0,1013.4,1010.1,-1,-1,16.3,25.5,0
7,8,183,2,9.7,31.9,0.0,-1,-1,6,80.0,...,28.0,42.0,9.0,1008.9,1003.6,-1,-1,18.3,30.2,0
8,9,154,2,13.1,30.1,1.4,-1,-1,13,28.0,...,11.0,58.0,27.0,1007.0,1005.7,-1,-1,20.1,28.2,1
9,10,155,2,13.4,30.4,0.0,-1,-1,3,30.0,...,6.0,48.0,22.0,1011.8,1008.7,-1,-1,20.4,28.8,0


In [5]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the encoded training data, with a fixed
# random_state so repeated runs use the same seed.
# NOTE(review): y_train holds encoded RainTomorrow labels and the predictions
# are thresholded at 0.5 afterwards, so this regressor is being used as a
# classifier -- consider whether RandomForestClassifier is the better fit.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)

model.fit(x_train, y_train)

In [6]:
# predict
# Score the test rows, then map each numeric prediction to a label:
# strictly above 0.5 -> "Yes", otherwise "No".
y_pred = model.predict(x_test)

rain_expected = y_pred > 0.5
y_pred_str = np.where(rain_expected, "Yes", "No")

In [7]:
# save .csv
# Write the predictions: one row per test record, holding its id and the
# predicted "Yes"/"No" RainTomorrow label. The DataFrame index is not written.
submission_path = Path("./output/RF.csv")

# DataFrame.to_csv does not create missing directories. Without this, a fresh
# checkout with no ./output folder would fail here, after the model has
# already been trained.
submission_path.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame({"id": x_test["id"], "RainTomorrow": y_pred_str}).to_csv(
    submission_path, index=False
)