In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# 1.获取数据
data = pd.read_csv('https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic.txt')

In [3]:
data.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [4]:
# 2.数据基本处理
# 2.1 确定特征值,目标值
x = data[['pclass', 'age', 'sex']]
y = data[['survived']]
# 2.2 缺失值处理
x['age'].fillna(x['age'].mean(), inplace=True)
# 2.3 数据集划分
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22, test_size=0.2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['age'].fillna(x['age'].mean(), inplace=True)


In [5]:
# 3.特征工程(字典特征抽取)
# 因为特征中出现类别符号(pclass)，需要进行one-hot编码处理(DictVectorizer)
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient="records"))
x_test = transfer.fit_transform(x_test.to_dict(orient="records"))
transfer.get_feature_names_out()

array(['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female',
       'sex=male'], dtype=object)

In [14]:
# 4.机器学习(随机深林)
estimator = RandomForestClassifier()
param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}
estimator = GridSearchCV(estimator, param_grid=param, cv=5, n_jobs=-1)  # 因为存在超参数，所以可以使用网格搜索
estimator.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [15]:
# 5.模型评估
"准确率:", estimator.score(x_test, y_test)

('准确率:', 0.7908745247148289)

In [16]:
"最好模型:", estimator.best_estimator_

('最好模型:', RandomForestClassifier(max_depth=5, n_estimators=120))