In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer


In [2]:
# Load the raw training data; the relative path is resolved against the
# notebook's working directory.
titanic = pd.read_csv("train.csv")

In [3]:
# pop() removes PassengerId from `titanic` in place and returns it, so the
# identifier is kept aside and is no longer a column of the frame.
titanic_pid = titanic.pop("PassengerId")

### Let's create some useful functions

In [10]:
def _fill_with_group_mode(values, fallback):
    """Fill the NaNs of one group with its most frequent value, or `fallback` if the group has none."""
    modes = values.mode()
    return values.fillna(modes.iloc[0] if not modes.empty else fallback)


def prepareData(titanic):
    """Return an engineered copy of a Titanic passenger frame.

    The input frame is not modified. All statistics used for filling missing
    values (mean age, most frequent cabin letter/number, most frequent port)
    are computed from the frame that is passed in.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Must contain the columns Name, Age, Cabin, Embarked, Sex, Ticket,
        SibSp and Parch; any other columns are passed through untouched.

    Returns
    -------
    pandas.DataFrame
        The frame without Name and Cabin, with Age/Embarked filled, Sex
        encoded as 1 (male) / 0 (anything else), Ticket reduced to a number,
        and the added columns prefix, Cletter, СNumber, parenthesis, isAlone.
    """
    titanic = titanic.copy()

    # Title of the passenger: the first run of letters followed by a dot in Name.
    # Raw strings avoid invalid-escape warnings; expand=False yields a Series.
    titanic["prefix"] = titanic["Name"].str.extract(r"([a-zA-Z]+\.)", expand=False)

    # Missing ages are replaced by the mean age of passengers with the same
    # prefix; a prefix with no known age leaves the value missing.
    mean_age = titanic.groupby("prefix")["Age"].mean()
    titanic["Age"] = titanic["Age"].fillna(titanic["prefix"].map(mean_age))

    # Split Cabin into its first run of letters and its first run of digits.
    # NOTE(review): the first character of the number column's name appears to
    # be a Cyrillic "С", not a Latin "C"; it is kept byte-identical so existing
    # consumers of the column keep working - confirm before renaming.
    titanic["Cletter"] = titanic["Cabin"].str.extract(r"([A-Za-z]+)", expand=False)
    titanic["СNumber"] = pd.to_numeric(
        titanic["Cabin"].str.extract(r"(\d+)", expand=False)
    )

    # Fill missing cabin parts with the most frequent value within the same
    # prefix group, falling back to a sentinel when the group has no value.
    by_prefix = titanic.groupby("prefix")
    titanic["Cletter"] = by_prefix["Cletter"].transform(
        lambda group: _fill_with_group_mode(group, "N_A")
    )
    titanic["СNumber"] = by_prefix["СNumber"].transform(
        lambda group: _fill_with_group_mode(group, 0)
    )
    titanic = titanic.drop(columns=["Cabin"])

    # Assign the filled column back instead of a chained inplace fill, which
    # does not update the frame under pandas Copy-on-Write. An all-missing
    # column is left as is rather than raising on an empty mode.
    embarked_modes = titanic["Embarked"].mode()
    if not embarked_modes.empty:
        titanic["Embarked"] = titanic["Embarked"].fillna(embarked_modes.iloc[0])

    # 1 for "male", 0 for every other value (including missing).
    titanic["Sex"] = (titanic["Sex"] == "male").astype("int64")

    # Tickets without any digit become NaN.
    # NOTE(review): this captures the FIRST run of digits, so a ticket such as
    # "A/5 21171" yields 5 - confirm whether the trailing number was intended.
    titanic["Ticket"] = pd.to_numeric(
        titanic["Ticket"].str.extract(r"(\d+)", expand=False)
    )

    # Flag names containing an opening parenthesis, and passengers travelling
    # with no siblings/spouse and no parents/children.
    titanic["parenthesis"] = titanic["Name"].str.contains(r"\(")
    titanic["isAlone"] = (titanic["SibSp"] == 0) & (titanic["Parch"] == 0)

    return titanic.drop(columns=["Name"])

In [11]:
def makeAtest(file, model, result="res.csv"):
    """Predict survival for the passengers in `file` and save the predictions.

    The CSV at `file` is read, PassengerId is set aside, the remaining columns
    go through prepareData, and `model.predict` is called on the outcome.
    A CSV with the columns PassengerId and Survived (no index) is written
    to `result`.
    """
    raw = pd.read_csv(file)
    passenger_ids = raw.pop("PassengerId")
    features = prepareData(raw)
    predictions = model.predict(features)
    # Attach ids and predictions to a copy, then keep just the two columns.
    submission = features.assign(PassengerId=passenger_ids, Survived=predictions)
    submission[["PassengerId", "Survived"]].to_csv(result, index=False)

In [12]:
# Rebinds `titanic` to the engineered frame. This cell is not re-runnable on
# its own result: prepareData reads the Name and Cabin columns and drops them.
titanic = prepareData(titanic)

In [13]:
# Count the missing values left in each column after feature engineering.
titanic.isna().sum()

Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         4
Fare           0
Embarked       0
prefix         0
Cletter        0
СNumber        0
parenthesis    0
isAlone        0
dtype: int64

### Time to train a random forest

In [15]:
# Separate the target; pop() also removes Survived from `titanic` in place,
# leaving only feature columns in the frame.
y = titanic.pop("Survived")

In [16]:
# Hold out 20% of the rows; the fixed random_state keeps the split identical
# between runs.
X_train, X_test, y_train, y_test = train_test_split(titanic, y,test_size=.2,random_state = 42)

In [17]:
# Split the feature columns by dtype; only the resulting column labels are
# used further down, to route columns to the matching preprocessing pipeline.
# NOTE(review): boolean columns such as parenthesis/isAlone presumably end up
# in X_cat because bool is not a "number" dtype - confirm this is intended.
X_cat = titanic.select_dtypes(exclude="number").copy()
X_num = titanic.select_dtypes(include="number").copy()

In [18]:
# Numeric columns: fill remaining missing values with SimpleImputer's
# default settings.
numeric_pipe = make_pipeline(
    SimpleImputer())

# Non-numeric columns: replace missing values with the constant "N_A", then
# one-hot encode; handle_unknown="ignore" keeps transform from raising on
# categories that were not present when the encoder was fitted.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)

# Apply each pipeline to its own set of columns, selected by label.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [19]:
# Chain preprocessing and the classifier. make_pipeline names the final step
# "randomforestclassifier", which the hyper-parameter grid keys rely on.
# A fixed random_state makes the forest, and therefore the whole search,
# reproducible from one run to the next.
full_pipeline = make_pipeline(preprocessor,
                              RandomForestClassifier(random_state=42))


In [20]:
# Hyper-parameter grid for the forest. Every key is prefixed with the pipeline
# step name so GridSearchCV routes the value to the classifier step.
param_grid = {
    f"randomforestclassifier__{setting}": candidates
    for setting, candidates in {
        "criterion": ["gini"],
        "min_samples_split": range(2, 10),
        "min_samples_leaf": range(1, 10),
        "n_estimators": range(150, 300, 20),
    }.items()
}


In [21]:
# 10-fold cross-validated exhaustive search over param_grid.
# One core is left free for the rest of the system, but at least one worker is
# always requested: on a single-core machine cpu_count() - 1 would be 0, which
# is not a valid n_jobs value.
searchRF = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1,
                      n_jobs=max(1, multiprocessing.cpu_count() - 1))

In [None]:
# Run the search on the training split only. This is the expensive cell:
# 576 parameter combinations x 10 folds = 5760 pipeline fits.
searchRF.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


### Saving result

In [None]:
# Predict on test.csv with the fitted search object and write the
# PassengerId/Survived table to RF_CV_5760.csv.
makeAtest("test.csv", searchRF, "RF_CV_5760.csv")