# 建模

In [None]:

import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
train = pd.read_csv('../input/train_final.csv')
test = pd.read_csv('../input/test_final.csv')

X = train.drop(['Survived'], axis=1)
y = train['Survived']


def save_model(model, name):
    '''保存模型'''
    joblib.dump(model, f'../models/{name}.pkl')
    print(f'{name} is successfully saved!')
    return True


def submit(model_name : str,
           test : pd.DataFrame):
    ''' 
    保存提交（预测）的数据\n
    model_name: 模型的名称（只传入点号之前的名称）\n
    test: 需要预测的数据集
    '''
    # 载入模型
    model = joblib.load(f'../models/{model_name}.pkl')
    # 使用模型预测
    y_pred = model.predict(test)
    # 保存提交
    submission = pd.read_csv('../submission/submission.csv')
    submission['Survived'] = y_pred.astype(int)
    submission.to_csv(f'../submission/{model_name}.csv', index=None)
    print(f'{model_name} is successfully used to test!')
    return True

In [None]:
names = [
    "Nearest Neighbors",
    "Gaussian Process",
    "Neural Net",
    "AdaBoost",
    "",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# RandomForest

In [30]:

def best_randomforest_clf(X : pd.DataFrame,
                          y : pd.DataFrame, 
                          scoring = 'accuracy',
                          n_estimators = 100,  # 树不断增加，一般认为就不会过拟合，尽量增加树的棵树
                          min_weight_fraction_leaf = 0,
                          min_impurity_decrease=[0.0],
                          bootstrap = True,
                          oob_score = False,
                          class_weight = None):
    '''
    ***随机森林*** 分类器模型寻优\n
    X：输入模型的特征\n
    y：输入模型的标签\n
    scoring：模型的评价标准，可取的值为 ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']\n
    min_weight_fraction_leaf: 叶子节点的最小样本数\n
    min_impurity_decrease: 最小不纯度减少量\n
    bootstrap：是否使用有放回抽样\n
    oob_score: 袋外样本评估模型\n
    class_weight: 类别权重默认为None，可选 ['balanced']\n
    return: 输出最佳的随机森林模型
    '''
    # 参数网格
    param_grid = {"criterion": ["gini", "entropy", 'log_loss'],
                  'max_depth': [None, 10, 20, 30],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': [None, "sqrt", "log2", 0.5],
                  "max_leaf_nodes": [None, 10, 20],
                  "min_impurity_decrease": min_impurity_decrease,}
    # 模型
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   min_weight_fraction_leaf=min_weight_fraction_leaf,
                                   bootstrap=bootstrap,
                                   oob_score=oob_score,
                                   n_jobs=-1,
                                   random_state=42,
                                   class_weight=class_weight)
    
    # 使用GridSearchCV进行超参数调优
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, 
                               n_jobs=-1, cv=5, verbose=2)
    grid_search.fit(X, y)
    
    # 打印最佳参数和得分
    print(f'RandomForest Best Params: ',grid_search.best_params_)
    print(f'RandomForest Best Score: ', grid_search.best_score_)
    
    return grid_search.best_estimator_

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
RandomForest Best Params:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 10}
RandomForest Best Score:  0.8451133011110414


True

In [None]:
randomforest = best_randomforest_clf(X, y, scoring='accuracy')
save_model(randomforest, 'randomforest')
submit('randomforest', test)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


# DecisionTree

In [None]:

def best_decisiontree_clf(X : pd.DataFrame,
                          y : pd.DataFrame, 
                          scoring='accuracy'):
    '''
    ***决策树*** 分类器模型寻优\n
    X：输入模型的特征\n
    y：输入模型的标签\n
    scoring：模型的评价标准，可取的值为 ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']\n
    return: 输出最佳的决策树模型
    '''
    model = RandomForestClassifier()
    # 参数网格
    param_grid = {"criterion": ["gini", "entropy", 'log_loss'],  # 分裂标准
                  "splitter": ["best", "random"],    # 分裂策略
                  "max_depth": [None, 3, 5, 10],     # 树的最大深度
                  "min_samples_split": [2, 5, 10],   # 节点分裂的最小样本数
                  "min_samples_leaf": [1, 2, 5],     # 叶子节点的最小样本数
                  "min_weight_fraction_leaf": [0.0],  # 叶子节点的最小权重比例 (样本不平衡时用)
                  "max_features": [None, 'auto',"sqrt", "log2", 0.5],  # 分裂时考虑的最大特征数
                  "max_leaf_nodes": [None, 10, 20],  # 最大叶子节点数
                  "min_impurity_decrease": [0.0, 0.01, 0.1],  # 最小不纯度减少量
                  "class_weight": [None, "balanced"],  # 类别权重
                  "ccp_alpha": [0.0, 0.01, 0.1]}     # 代价复杂度剪枝参数
    # 模型
    model = DecisionTreeClassifier(random_state=42)
    
    # 使用GridSearchCV进行超参数调优
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, 
                               n_jobs=-1, cv=5, verbose=2)
    grid_search.fit(X, y)
    
    # 打印最佳参数和得分
    print(f'DecisionTree Best Params: ',grid_search.best_params_)
    print(f'DecisionTree Best Score: ', grid_search.best_score_)
    
    return grid_search.best_estimator_

Fitting 5 folds for each of 58320 candidates, totalling 291600 fits
DecisionTree Best Params:  {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 0.5, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'random'}
DecisionTree Best Score:  0.8383654510074697


True

In [None]:
decisiontree = best_decisiontree_clf(X, y, scoring='accuracy')
save_model(decisiontree, 'decissiontree')

submit('decissiontree', test)

# SVM

In [None]:
def best_svm_clf(X : pd.DataFrame,
             y : pd.DataFrame, 
             scoring='accuracy',
             kernel = 'rbf'):
    '''
    SVC参数寻优\n
    X：输入模型的特征\n
    y：输入模型的标签\n
    kernel：选择使用哪种和函数，取值为 ['rbf', 'linear', 'poly', 'sigmoid']\n
    scoring：模型的评价标准，可取的值为 ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    return: 输出最佳的模型
    '''
    # 定义参数网格
    param_grid = {'C': [0.025, 0.05, 0.1, 0.5, 1, 10],
                'kernel': [kernel],
                'degree': [2, 3, 4],
                'gamma': ['scale', 'auto'],
                'coef0': [0, 0.1, 0.5, 1]}
    model = SVC()

    # 使用GridSearchCV进行超参数调优
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, 
                               n_jobs=-1, cv=5)
    grid_search.fit(X, y)
    print(f'svc_{kernel} Best Params: ',grid_search.best_params_)
    print(f'svc_{kernel} Best Score: ', grid_search.best_score_)
    return grid_search.best_estimator_

# SVC
svc_rbf = best_svm_clf(X, y, scoring='accuracy', kernel='rbf')
svc_linear = best_svm_clf(X, y, scoring='accuracy', kernel='linear')
svc_poly = best_svm_clf(X, y, scoring='accuracy', kernel='poly')
svc_sigmoid = best_svm_clf(X, y, scoring='accuracy', kernel='sigmoid')

# 保存模型
joblib.dump(svc_rbf, '../models/svc_rbf.pkl')
joblib.dump(svc_linear, '../models/svc_linear.pkl')
joblib.dump(svc_poly, '../models/svc_poly.pkl')
joblib.dump(svc_sigmoid, '../models/svc_sigmoid.pkl')

svc_rbf Best Params:  {'C': 0.5, 'coef0': 0, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
svc_rbf Best Score:  0.8338773460548616
svc_linear Best Params:  {'C': 0.05, 'coef0': 0, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
svc_linear Best Score:  0.8282656455966355
svc_poly Best Params:  {'C': 0.025, 'coef0': 1, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}
svc_poly Best Score:  0.8372418555018518
svc_sigmoid Best Params:  {'C': 0.1, 'coef0': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'sigmoid'}
svc_sigmoid Best Score:  0.7654321762601218


['../models/svc_sigmoid.pkl']

In [None]:

    
submit('svc_linear', test)
submit('svc_poly', test)
submit('svc_rbf', test)
submit('svc_sigmoid', test)

# Naive Bayes

In [None]:
# Naive Bayes