# 建模

In [50]:
import pandas as pd
import joblib
from tools import data_overview
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
train = pd.read_csv('../input/train_final.csv')
test = pd.read_csv('../input/test_final.csv')

X = train.drop(['Survived'], axis=1)
y = train['Survived']

# KNN

# SVC

In [None]:
def best_svc(X : pd.DataFrame,
             y : pd.DataFrame, 
             kernel = 'rbf',
             scoring='accuracy'):
    '''
    SVC参数寻优\n
    X：输入模型的特征\n
    y：输入模型的标签\n
    kernel：选择使用哪种和函数，取值为 ['rbf', 'linear', 'poly', 'sigmoid']\n
    scoring：模型的评价标准，可取的值为 ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    '''
    # 定义参数网格
    param_grid = {'C': [0.025, 0.05, 0.1, 0.5, 1, 10],
                'kernel': [kernel],
                'degree': [2, 3, 4],
                'gamma': ['scale', 'auto'],
                'coef0': [0, 0.1, 0.5, 1]}
    model = SVC()

    # 使用GridSearchCV进行超参数调优
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, 
                               n_jobs=-1, cv=5)
    grid_search.fit(X, y)
    print(f'SVC_{kernel} Best Params: ',grid_search.best_params_)
    print(f'SVC_{kernel} Best Score: ', grid_search.best_score_)
    return grid_search.best_estimator_

# SVC
svc_rbf = best_svc(X, y, kernel='rbf' ,scoring='accuracy')
svc_linear = best_svc(X, y, kernel='linear' ,scoring='accuracy')
svc_poly = best_svc(X, y, kernel='poly' ,scoring='accuracy')
svc_sigmoid = best_svc(X, y, kernel='sigmoid' ,scoring='accuracy')

# 保存模型
joblib.dump(svc_rbf, '../models/svc_rbf.pkl')
joblib.dump(svc_linear, '../models/svc_linear.pkl')
joblib.dump(svc_poly, '../models/svc_poly.pkl')
joblib.dump(svc_sigmoid, '../models/svc_sigmoid.pkl')

SVCrbf Best Params:  {'C': 0.5, 'coef0': 0, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
SVCrbf Best Score:  0.8338773460548616
SVClinear Best Params:  {'C': 0.05, 'coef0': 0, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
SVClinear Best Score:  0.8282656455966355
SVCpoly Best Params:  {'C': 0.025, 'coef0': 1, 'degree': 4, 'gamma': 'auto', 'kernel': 'poly'}
SVCpoly Best Score:  0.8372418555018518
SVCsigmoid Best Params:  {'C': 0.1, 'coef0': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'sigmoid'}
SVCsigmoid Best Score:  0.7654321762601218


In [82]:
# 载入模型
svc = joblib.load('../models/svc.pkl')

# 使用模型预测
y_pred = svc.predict(test)

# 保存提交
submission = pd.read_csv('../submission/submission.csv')
submission['Survived'] = y_pred.astype(int)
submission.to_csv('../submission/submission_svc.csv', index=None)

In [None]:
def my_function(param1, param2, param3):
    allowed_values_param1 = [value1, value2, value3]
    allowed_values_param2 = [value4, value5, value6]
    allowed_values_param3 = [value7, value8, value9]

    if param1 not in allowed_values_param1:
        raise ValueError(f"param1 must be one of {allowed_values_param1}")
    if param2 not in allowed_values_param2:
        raise ValueError(f"param2 must be one of {allowed_values_param2}")
    if param3 not in allowed_values_param3:
        raise ValueError(f"param3 must be one of {allowed_values_param3}")

    # 函数体
    pass