In [122]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [123]:
# Load the raw train/test CSVs and build one combined feature table `df`.
data_folder = Path("../data/")
train = pd.read_csv(data_folder / "train.csv")
test = pd.read_csv(data_folder / "test.csv")

# Stack train on top of test so both get identical preprocessing.
# ignore_index=True gives the combined frame unique row labels; without it the
# labels of `test` would repeat those of `train`.
df = pd.concat([train, test], axis=0, ignore_index=True)

# Fill NaNs.
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a column selection: the in-place form acts on an intermediate object, raises
# a FutureWarning on recent pandas, and does not modify `df` at all once
# Copy-on-Write is enabled.
# NOTE: the Age median and Fare mean are computed over train and test rows
# together.
df["Embarked"] = df["Embarked"].fillna("S")
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

# Handle categorical data: one indicator column per category.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns).
sex_dummies = pd.get_dummies(df["Sex"], dtype=int)
embarked_dummies = pd.get_dummies(df["Embarked"], prefix="embarked", dtype=int)
df = pd.concat([df, sex_dummies, embarked_dummies], axis=1)

# New columns.
# FamilySize counts the passenger plus their Parch and SibSp relatives.
df["FamilySize"] = df["Parch"] + df["SibSp"] + 1
df["IsAlone"] = np.where(df["FamilySize"] == 1, 1, 0)
# Passengers aged 10 or younger are flagged as children.
df["IsChild"] = np.where(df["Age"] <= 10, 1, 0)

# Drop columns that are not used as features, including the raw columns that
# were replaced by the encoded/derived ones above.
df = df.drop(
    columns=["Name", "Ticket", "Sex", "Cabin", "Embarked", "SibSp", "Parch"]
)
df.head(15)


Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,female,male,embarked_C,embarked_Q,embarked_S,FamilySize,IsAlone,IsChild
0,1,0.0,3,22.0,7.25,0,1,0,0,1,2,0,0
1,2,1.0,1,38.0,71.2833,1,0,1,0,0,2,0,0
2,3,1.0,3,26.0,7.925,1,0,0,0,1,1,1,0
3,4,1.0,1,35.0,53.1,1,0,0,0,1,2,0,0
4,5,0.0,3,35.0,8.05,0,1,0,0,1,1,1,0
5,6,0.0,3,28.0,8.4583,0,1,0,1,0,1,1,0
6,7,0.0,1,54.0,51.8625,0,1,0,0,1,1,1,0
7,8,0.0,3,2.0,21.075,0,1,0,0,1,5,0,1
8,9,1.0,3,27.0,11.1333,1,0,0,0,1,3,0,0
9,10,1.0,2,14.0,30.0708,1,0,1,0,0,2,0,0


In [124]:
# Split the combined frame back into its train and test parts by position:
# the first len(train) rows of `df` are the training rows, the rest are test.
n_train_rows = len(train)
train = df.iloc[:n_train_rows]
test = df.iloc[n_train_rows:]

# 'Survived' is the target and 'PassengerId' is only an identifier, so neither
# is passed to the model as a feature.
non_feature_columns = ["Survived", "PassengerId"]
y_train = train["Survived"]
X_train = train.drop(columns=non_feature_columns)
X_test = test.drop(columns=non_feature_columns)

In [125]:
# Enumerate the hyperparameter values to try; the grid search evaluates every
# (n_estimators, max_depth) combination.
params = dict(
    n_estimators=[10, 30, 50, 100],
    max_depth=[5, 7, 9, 11],
)

# Fixed random_state so the forests are reproducible between runs.
model = RandomForestClassifier(random_state=46)

# 5-fold cross-validated grid search scored by accuracy. refit=True retrains
# the best configuration so `clf` can be used directly for prediction.
clf = GridSearchCV(
    model,
    params,
    scoring="accuracy",
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,  # run the fits in parallel
    verbose=3,
)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
# Display the estimator that the grid search refit with the best parameters.
clf.best_estimator_

In [None]:
# Display the mean cross-validated accuracy of the best parameter combination.
clf.best_score_

In [126]:
# Predict survival for the test passengers and store the labels as integers.
predict = clf.predict(X_test).astype(int)

# Pair every test PassengerId with its predicted label and write the result
# to ans.csv in the data folder, without the DataFrame index column.
ans_dict = dict(
    PassengerId=test["PassengerId"].to_numpy(),
    Survived=predict,
)
ans_df = pd.DataFrame(data=ans_dict)
ans_df.to_csv(data_folder / "ans.csv", index=False)

In [127]:
# Result note: without the FamilySize ("familycount") and IsAlone features, the score was 0.765 (highest).