In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training and test tables from the CSV files in the working directory.
db_train, db_test = map(pd.read_csv, ("train.csv", "test.csv"))

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [4]:
# Show every column name of the training frame as a plain Python list.
db_train.columns.tolist()

['row_id',
 'mintemp',
 'maxtemp',
 'rainfall',
 'evaporation',
 'sunshine',
 'windgustdir',
 'windgustspeed',
 'winddir9am',
 'winddir3pm',
 'windspeed9am',
 'windspeed3pm',
 'humidity9am',
 'humidity3pm',
 'pressure9am',
 'pressure3pm',
 'cloud9am',
 'cloud3pm',
 'temp9am',
 'temp3pm',
 'raintoday',
 'raintomorrow']

In [5]:
# Column roles used by the cells below.
id_cols = ["row_id"]           # identifier column(s): excluded from the features
target_col = "raintomorrow"    # label column to predict

In [6]:
# Split the label off the training frame so that `db_train` holds only the
# id and feature columns from here on.
#
# The guard makes this cell idempotent: the previous in-place drop raised a
# KeyError on `db_train[target_col]` when the cell was executed a second time.
# On a fresh run the target column is present and the result is the same as
# before; on a re-run the already-extracted `Y_train` is left untouched.
if target_col in db_train.columns:
    Y_train = db_train[target_col].values
    # Rebind instead of `inplace=True` so the loaded frame is not mutated.
    db_train = db_train.drop(columns=[target_col])

In [7]:
# Numeric feature columns: every numeric-dtype column except the identifiers.
numeric_cols = [
    col
    for col in db_train.select_dtypes(include=np.number).columns
    if col not in id_cols
]
# Remaining columns: neither numeric features nor identifiers.
cat_cols = [
    col
    for col in db_train.columns
    if not (col in numeric_cols or col in id_cols)
]

In [8]:
# Build the model matrices from the numeric columns only; missing values are
# replaced with the sentinel -9999 before converting to NumPy arrays.
# NOTE(review): presumably -9999 is meant to lie outside every real feature
# range so the trees can isolate missing values - confirm against the data.
X_train, X_test = (
    frame[numeric_cols].fillna(-9999).values
    for frame in (db_train, db_test)
)

In [9]:
# Baseline model: a shallow random forest.
# `random_state` is fixed so that the cross-validated score below and the
# predictions produced by the later fit are reproducible from run to run;
# without it every execution grows a different forest.
clf = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=42)

# 5-fold cross-validated ROC AUC on the training data. cross_val_score works
# on clones of `clf`, so `clf` itself is still unfitted after this call.
cv_auc = cross_val_score(clf, X_train, Y_train, cv=5, scoring='roc_auc', n_jobs=-1)
print(np.mean(cv_auc))

0.8308867848823125


In [10]:
# Train on the full training set, then score the test set.
# `fit` returns the estimator itself, so the calls can be chained.
# Column 1 of predict_proba is kept as the prediction; presumably this is the
# positive ("rain tomorrow") class - confirm against clf.classes_.
preds = clf.fit(X_train, Y_train).predict_proba(X_test)[:, 1]

In [11]:
# Assemble the submission: the test identifiers plus the predicted probability,
# using the shared `id_cols` / `target_col` names instead of repeating the
# literal column names.
submit_data = db_test[id_cols].copy()
submit_data[target_col] = preds

# index=False is the documented way to omit the row index from the CSV
# (the previous `index=None` only worked because None is falsy).
submit_data.to_csv("public_baseline_submission.csv", index=False)