# 建模

In [2]:

import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier # 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Read the training and test tables from the input directory.
train = pd.read_csv('../input/train_final.csv')
test = pd.read_csv('../input/test_final.csv')

# Split the training table into the 'Survived' label column and the
# remaining columns, which serve as model features.
y = train['Survived']
X = train.drop(columns=['Survived'])


def save_model(model, name):
    '''
    Persist a model object to disk with joblib.

    model: the object to serialise.
    name: base file name; the model is written to ``../models/{name}.pkl``.

    return: always True once the dump and the confirmation message are done.
    '''
    target_path = f'../models/{name}.pkl'
    joblib.dump(model, target_path)
    print(f'{name} is successfully saved!')
    return True


def submit(model_name : str,
           test : pd.DataFrame):
    '''
    Predict with a saved model and write the predictions as a submission CSV.

    model_name: base name of the model file, without the ``.pkl`` extension.
        The model is read from ``../models/{model_name}.pkl`` and the result
        is written to ``../submission/{model_name}.csv``.
    test: feature table handed to the model's ``predict`` method.

    return: always True; failures surface as the exceptions raised by
        joblib, the model or pandas.
    '''
    # Load the persisted model. joblib.load unpickles the file, so only point
    # this at model files you produced yourself or otherwise trust.
    model = joblib.load(f'../models/{model_name}.pkl')
    # Predict labels for the supplied features.
    y_pred = model.predict(test)
    # Read the existing submission file and overwrite only its 'Survived'
    # column, so every other column is carried over unchanged.
    submission = pd.read_csv('../submission/submission.csv')
    submission['Survived'] = y_pred.astype(int)
    # index=False is the documented boolean for leaving the row index out of
    # the CSV (the previous index=None only worked because None is falsy).
    submission.to_csv(f'../submission/{model_name}.csv', index=False)
    print(f'{model_name} is successfully used to test!')
    return True

In [None]:
# Each display name is written next to the estimator it labels; zip() then
# transposes the pairs into the two parallel lists `names` and `classifiers`.
names, classifiers = map(list, zip(
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("QDA", QuadraticDiscriminantAnalysis()),
))

# Naive Bayes

In [11]:
def best_bayes_clf(X : pd.DataFrame,
                   y : pd.Series,
                   scoring = 'accuracy',
                   model_name = 'GaussianNB'):
    '''
    Grid-search the hyper-parameters of a naive Bayes classifier.

    X: feature table used to fit the model.
    y: labels for the rows of X.
    scoring: metric name forwarded to GridSearchCV, e.g. one of
        ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'].
    model_name: which naive Bayes variant to tune:
        'GaussianNB': assumes the features follow a Gaussian (normal) distribution.
        'MultinomialNB': suited to discrete features (e.g. word counts or
            TF-IDF values in text classification).
        'BernoulliNB': suited to binary features (e.g. whether a word occurs
            in a document).

    return: the ``best_estimator_`` of the fitted grid search.
    raises: ValueError if model_name is not one of the three names above.
    '''
    # Pick the estimator and its search grid for the requested variant.
    if model_name == 'GaussianNB':
        model = GaussianNB()
        param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

    elif model_name == 'MultinomialNB':
        model = MultinomialNB()
        param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # smoothing parameter
                      'fit_prior': [True, False]}          # whether to learn class priors

    elif model_name == 'BernoulliNB':
        model = BernoulliNB()
        param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # smoothing parameter
                      'fit_prior': [True, False],          # whether to learn class priors
                      'binarize': [0.0, 0.5, 1.0]}         # threshold for binarising features

    else:
        # Fail early with a clear message instead of hitting an unbound
        # `model` / `param_grid` further down.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of "
            "'GaussianNB', 'MultinomialNB', 'BernoulliNB'."
        )

    # Exhaustive 5-fold cross-validated search over the grid, on all CPU cores.
    grid_search = GridSearchCV(model, param_grid, scoring=scoring,
                               n_jobs=-1, cv=5, verbose=2)
    grid_search.fit(X, y)
    # Report the winning parameters and score under the name of the variant
    # that was actually tuned.
    print(f'{model_name} Best Params: ', grid_search.best_params_)
    print(f'{model_name} Best Score: ', grid_search.best_score_)
    return grid_search.best_estimator_

In [12]:
# Tune the Gaussian variant, scored by cross-validated accuracy.
bayes_gaussian = best_bayes_clf(
    X,
    y,
    model_name='GaussianNB',
    scoring='accuracy',
)
# Tune the Bernoulli variant with the same data and scoring.
bayes_bernoulliNB = best_bayes_clf(
    X,
    y,
    model_name='BernoulliNB',
    scoring='accuracy',
)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
GaussianNB Best Params:  {'var_smoothing': 1e-09}
GaussianNB Best Score:  0.817048521750047
Fitting 5 folds for each of 30 candidates, totalling 150 fits
GaussianNB Best Params:  {'alpha': 0.1, 'binarize': 0.5, 'fit_prior': True}
GaussianNB Best Score:  0.8047015253279769
