In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV

In [45]:
# 1)获取数据
titanic = pd.read_csv('/home/data/t120455/workspace/git_distrib/ML_DL_playground/datasets/titanic/train.csv')

In [46]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [47]:
# 查看数据集基本特征
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [48]:
# 查看数据是否有缺失
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [49]:
# 2)筛选特征值和目标值
x = titanic[['Pclass','Age','Sex']]
y = titanic['Survived']

In [50]:
# 3)数据处理
#     缺失值处理
x['Age'].fillna(x['Age'].mean(), inplace=True)  # 填平均值

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'].fillna(x['Age'].mean(), inplace=True)  # 填平均值


In [51]:
x.isnull().sum()

Pclass    0
Age       0
Sex       0
dtype: int64

In [52]:
x.head()

Unnamed: 0,Pclass,Age,Sex
0,3,22.0,male
1,1,38.0,female
2,3,26.0,female
3,1,35.0,female
4,3,35.0,male


In [63]:
# 4)划分数据集
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=123)

In [64]:
x_train

Unnamed: 0,Pclass,Age,Sex
329,1,16.0,female
749,3,31.0,male
203,3,45.5,male
421,3,21.0,male
97,1,23.0,male
...,...,...,...
98,2,34.0,female
322,2,30.0,female
382,3,32.0,male
365,3,30.0,male


In [65]:
#     多类别变量(特征) -转换成one-hot编码 -若有多个特征先转换成字典 类sklearn自带数据集继承自字典格式
#     特征 -> 字典类型
x.to_dict(orient='records')  # 指定格式

[{'Pclass': 3, 'Age': 22.0, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 38.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 26.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 35.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 35.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 54.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 2.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 27.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 14.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 4.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 58.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 20.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 39.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 14.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 55.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 2.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 31.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 35.0, 'Sex': 'male'},
 {'Pclass': 2, 'Ag

In [66]:
x_train.to_dict(orient='records')

[{'Pclass': 1, 'Age': 16.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 31.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 45.5, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 21.0, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 23.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 52.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 3.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 16.0, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 30.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 21.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 16.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 19.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 25.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 32.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 30.5, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 21.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 25.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 35.0, 'Sex': 'female'},
 {'Pclass': 2, 'Age': 17.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 16.0, 'Sex': 

In [58]:
# 5)特征工程 字典特征抽取   决策树不需标准化
# 将字典对象向量化
# https://blog.csdn.net/Jon_Sheng/article/details/79693971
transfer = DictVectorizer()  # eg:Pclass:1/2/3 - 三个特征而非一个
x_train = transfer.fit_transform(x_train.to_dict(orient='records'))
x_test = transfer.fit_transform(x_test.to_dict(orient='records'))

In [59]:
# x_train
x_test

<179x4 sparse matrix of type '<class 'numpy.float64'>'
	with 537 stored elements in Compressed Sparse Row format>

In [67]:
# 或者 sparse=False意思是不产生稀疏矩阵

# 4)划分数据集
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=123)
# 5)特征工程 字典特征抽取   决策树不需标准化
# 将字典对象向量化
transfer = DictVectorizer(sparse=False)  # eg:Pclass:1/2/3 - 三个特征而非一个
x_train = transfer.fit_transform(x_train.to_dict(orient='records'))
x_test = transfer.fit_transform(x_test.to_dict(orient='records'))                       
print(transfer.feature_names_)#查看转换后的列名
print(x_test)

['Age', 'Pclass', 'Sex=female', 'Sex=male']
[[ 1.          3.          1.          0.        ]
 [29.69911765  3.          0.          1.        ]
 [30.          1.          0.          1.        ]
 [61.          1.          0.          1.        ]
 [27.          3.          0.          1.        ]
 [46.          2.          0.          1.        ]
 [40.          2.          1.          0.        ]
 [27.          2.          1.          0.        ]
 [18.          3.          1.          0.        ]
 [29.          3.          1.          0.        ]
 [ 9.          3.          1.          0.        ]
 [28.          3.          1.          0.        ]
 [52.          1.          1.          0.        ]
 [24.5         3.          0.          1.        ]
 [50.          1.          1.          0.        ]
 [ 0.75        3.          1.          0.        ]
 [58.          1.          1.          0.        ]
 [29.69911765  1.          0.          1.        ]
 [45.          3.          1.         

发现DictVectorizer只识别了Sex没识别Pclass

In [68]:
# 6)决策树预估流程
estimator = DecisionTreeClassifier(criterion='entropy')
# +加入 网格搜索 和 交叉验证
param_dic = {'max_depth':[5,7,9,11,13,15,17,19]}
es_gscv = GridSearchCV(estimator, param_grid = param_dic, cv=10)
es_gscv.fit(x_train, y_train)
# 仅仅在训练集上交叉验证 选择最佳参数

交叉验证 (cross-validation) 是用于评估模型性能的方法，它将数据集分为 k 个部分，每次使用其中一部分作为训练数据，剩余的部分作为测试数据。这样可以将模型的性能分散在所有训练集上，从而更准确地评估模型的效果。

在这个例子中，我们使用了 GridSearchCV 类来进行交叉验证。它首先对参数进行网格搜索 (grid search)，然后使用交叉验证来评估每个参数组合的性能。具体来说，GridSearchCV 会创建一个包含预设参数的网格，然后使用交叉验证来测试每个参数组合。最后，它会选择性能最好的参数组合作为最终的预估器。

在这个例子中，我们使用了 10-fold 交叉验证，这意味着数据集将被分成 10 个部分，每次使用其中一部分作为训练数据，剩余的部分作为测试数据。这样我们可以获得 10 个不同的测试结果，然后计算平均值作为模型的最终性能。

In [69]:
# 7)模型评估
score = es_gscv.score(x_test, y_test)
print(score)

0.8379888268156425


sklearn.model_selection.GridSearchCV 返回的 score 是交叉验证的评分指标。在 GridSearchCV 中，通过交叉验证对每组参数的模型进行评分，然后将这些评分进行平均，最终得到一个综合的评分指标。这个评分指标的具体类型取决于你在 GridSearchCV 中设置的 scoring 参数，默认情况下是分类任务中的准确率（accuracy），回归任务中的 R 平方值（R-squared）。你可以通过设置 scoring 参数来指定不同的评分指标，比如 F1 分数、均方误差（MSE）等。

In [38]:
es_gscv.predict_proba(x_test)

array([[0.        , 1.        ],
       [0.90361446, 0.09638554],
       [0.78571429, 0.21428571],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.5       , 0.5       ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.85882353, 0.14117647],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.78571429, 0.21428571],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.38888889, 0.61111111],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [0.85882353, 0.14117647],
       [0.85882353, 0.14117647],
       [0.        , 1.        ],
       [0.85882353, 0.14117647],
       [0.90361446, 0.09638554],
       [0.

In [37]:
from sklearn.metrics import roc_auc_score
# 计算AUC
y_true = y_test
y_pred = es_gscv.predict_proba(x_test)[:, 1]
# https://blog.csdn.net/pearl8899/article/details/109829306
auc_value = roc_auc_score(y_true, y_pred)
print("AUC:", auc_value)

AUC: 0.8416329284750338


In [None]:
# 可视化决策树
# export_graphviz(es_gscv, out_file='titanic_tree.dot',feature_names=transfer.get_feature_names_out())

In [39]:
# +结果分析 best_params_ best_score_ best_estimator_ cv_results_
print("最佳参数:\n", es_gscv.best_params_)
print("最佳结果:\n", es_gscv.best_score_)
print("最佳估计器:\n", es_gscv.best_estimator_)
print("交叉验证结果:\n", es_gscv.cv_results_)

最佳参数:
 {'max_depth': 11}
最佳结果:
 0.8046948356807512
最佳估计器:
 DecisionTreeClassifier(criterion='entropy', max_depth=11)
交叉验证结果:
 {'mean_fit_time': array([0.00184577, 0.00172818, 0.00188425, 0.00191445, 0.00201602,
       0.00207746, 0.00202241, 0.00205185]), 'std_fit_time': array([3.59936685e-04, 5.10831766e-05, 5.49170117e-05, 2.91375988e-05,
       6.79904244e-05, 5.99922215e-05, 4.28594384e-05, 6.72785336e-05]), 'mean_score_time': array([0.00084486, 0.00066404, 0.00067563, 0.00064042, 0.00066726,
       0.00067604, 0.00066624, 0.00068848]), 'std_score_time': array([3.22777571e-04, 4.71497440e-05, 3.39193144e-05, 1.80309558e-05,
       2.68019070e-05, 4.27127733e-05, 3.23654780e-05, 5.36451393e-05]), 'param_max_depth': masked_array(data=[5, 7, 9, 11, 13, 15, 17, 19],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 5}, {'max_depth': 7}, {'max_depth': 9}, {'max_depth': 11}, {'max_depth':

前面发现DictVectorizer只识别了Sex没识别Pclass

用其它方法实现看看是否对结果有影响

In [70]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Age     891 non-null    float64
 2   Sex     891 non-null    object 
dtypes: float64(1), int64(1), object(1)
memory usage: 21.0+ KB


In [72]:
# dummy
tmp1 = pd.DataFrame()
tmp1 = pd.get_dummies(x)
tmp1.head()
# pd.get_dummies也没识别出来

Unnamed: 0,Pclass,Age,Sex_female,Sex_male
0,3,22.0,False,True
1,1,38.0,True,False
2,3,26.0,True,False
3,1,35.0,True,False
4,3,35.0,False,True


发现原因：Pclass类型为int64 而非object或者category dtype 所以识别不出来

In [73]:
# 使用 astype() 方法将类型从 int 更改为 object
x['Pclass'] = x['Pclass'].astype('object')
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    object 
 1   Age     891 non-null    float64
 2   Sex     891 non-null    object 
dtypes: float64(1), object(2)
memory usage: 21.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Pclass'] = x['Pclass'].astype('object')


In [77]:
# dummy
x_dummy = pd.get_dummies(x)
x_dummy.head()

Unnamed: 0,Age,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,22.0,False,False,True,False,True
1,38.0,True,False,False,True,False
2,26.0,False,False,True,True,False
3,35.0,True,False,False,True,False
4,35.0,False,False,True,False,True


In [78]:
# 划分数据集
x_train, x_test, y_train, y_test = train_test_split(x_dummy,y,test_size=0.2,random_state=123)

In [79]:
x_train

Unnamed: 0,Age,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
329,16.0,True,False,False,True,False
749,31.0,False,False,True,False,True
203,45.5,False,False,True,False,True
421,21.0,False,False,True,False,True
97,23.0,True,False,False,False,True
...,...,...,...,...,...,...
98,34.0,False,True,False,True,False
322,30.0,False,True,False,True,False
382,32.0,False,False,True,False,True
365,30.0,False,False,True,False,True


In [80]:
# 6)决策树预估流程
estimator = DecisionTreeClassifier(criterion='entropy')
# +加入 网格搜索 和 交叉验证
param_dic = {'max_depth':[5,7,9,11,13,15,17,19]}
es_gscv = GridSearchCV(estimator, param_grid = param_dic, cv=10)
es_gscv.fit(x_train, y_train)

# 7)模型评估
score = es_gscv.score(x_test, y_test)
print(score)

0.8379888268156425


发现其实和之前一样，我觉得这可能和决策树的学习方式有关