随机森林

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('./train.csv')

In [4]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [6]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [7]:
data.fillna(0, inplace=True)

In [8]:
data['Sex'] = [1 if x == 'male' else 0 for x in data.Sex]

In [9]:
# 独热编码 防止数据产生线性关系
data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)
data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)
data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)

In [10]:
del data['Pclass']

In [11]:
data.Embarked.unique()

array(['S', 'C', 'Q', 0], dtype=object)

In [12]:
data['e1'] = np.array(data['Embarked'] == 'S').astype(np.int32)
data['e2'] = np.array(data['Embarked'] == 'C').astype(np.int32)
data['e3'] = np.array(data['Embarked'] == 'Q').astype(np.int32)

In [13]:
del data['Embarked']

In [14]:
data.values.dtype

dtype('float64')

In [15]:
data_train = data[[x for x in data.columns if x != 'Survived']].values

In [16]:
data_target = data['Survived'].values.reshape(len(data), 1)

In [17]:
np.shape(data_train), np.shape(data_target)

((891, 11), (891, 1))

In [18]:
x_train, x_test, y_train, y_test = train_test_split(data_train, data_target, test_size=0.2)

In [19]:
x_train.shape, x_test.shape

((712, 11), (179, 11))

In [21]:
model = RandomForestClassifier(n_estimators=100, n_jobs=4)

In [22]:
model.fit(x_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
model.score(x_test, y_test)

0.8268156424581006

In [24]:
model.feature_importances_

array([0.25085681, 0.26322448, 0.04893817, 0.04738321, 0.26293827,
       0.02615327, 0.01683334, 0.04873034, 0.01343788, 0.01306713,
       0.00843709])

In [25]:
model.predict(x_test[0].reshape(1, -1))

array([0], dtype=int64)

In [26]:
model.estimators_

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1972093618, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=367474177, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, m

交叉验证

In [34]:
n_estimators = range(80, 130)

In [35]:
param_grid = {'n_estimators':n_estimators}

In [36]:
model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

In [37]:
model.fit(data_train, data_target.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [38]:
model.best_params_

{'n_estimators': 96}

In [39]:
model.best_score_

0.8215680120519743