# 建模

In [1]:
from pathlib import Path

import joblib
import pandas as pd
import psutil
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


In [None]:
# Read the prepared training and test tables from the input directory.
train = pd.read_csv('../input/train_final.csv')
test = pd.read_csv('../input/test_final.csv')

# Split the training table into the label column and the remaining feature columns.
y = train['Survived']
X = train.drop(columns=['Survived'])


def save_model(model, name):
    '''
    Serialize `model` to ../models/{name}.pkl with joblib.

    model: the object to dump
    name: file name without the .pkl extension
    return: True once the dump has completed
    '''
    model_file = f'../models/{name}.pkl'
    # Create the target directory first so a fresh checkout without a
    # models folder does not fail with FileNotFoundError.
    Path(model_file).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)
    print(f'{name} is successfully saved!')
    return True


def submit(model_name : str,
           test : pd.DataFrame):
    '''
    Predict with a saved model and write the predictions to a submission CSV.

    model_name: model file name without the extension; the model is read from
        ../models/{model_name}.pkl and the result is written to
        ../submission/{model_name}.csv
    test: the data set to predict on, passed unchanged to model.predict
    return: True once the CSV has been written
    '''
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that were produced by this project.
    model = joblib.load(f'../models/{model_name}.pkl')
    # Predict with the loaded model.
    y_pred = model.predict(test)
    # Start from the existing submission file and set its Survived column.
    submission = pd.read_csv('../submission/submission.csv')
    submission['Survived'] = y_pred.astype(int)
    # index=False is the documented way to omit the row index (the parameter is a bool).
    submission.to_csv(f'../submission/{model_name}.csv', index=False)
    print(f'{model_name} is successfully used to test!')
    return True

def memory():
    '''
    Print two fields of psutil.virtual_memory(): `available` divided by
    1024 twice (labelled MB in the output) and `percent`.
    '''
    stats = psutil.virtual_memory()
    available_mb = stats.available / 1024 / 1024
    usage_percent = stats.percent
    print(f"可用内存: {available_mb:.2f} MB")
    print(f"内存使用率: {usage_percent}%")

In [None]:
def best_xgboost_clf(X : pd.DataFrame,
                     y : pd.Series,
                     scoring='accuracy',
                     objective='binary:logistic',
                     eval_metric='logloss',
                     param_grid=None,
                     cv=5,
                     n_jobs=-1,
                     verbosity=2):
    '''
    Tune an XGBoost classifier with an exhaustive grid search.

    X: features passed to the model
    y: labels passed to the model
    scoring: model selection metric, e.g. one of
        ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    objective: forwarded to xgb.XGBClassifier
    eval_metric: forwarded to xgb.XGBClassifier
    param_grid: optional grid forwarded to GridSearchCV; when None the
        built-in grid below is used
    cv: forwarded to GridSearchCV (default 5)
    n_jobs: forwarded to GridSearchCV (default -1)
    verbosity: forwarded to xgb.XGBClassifier; the default of 2 prints
        per-tree INFO lines, pass a lower value for quieter training
    return: grid_search.best_estimator_
    See the project documentation for details.
    '''
    if param_grid is None:
        # Default grid: 3*3*3*1*3*2*3*3 = 1458 combinations, i.e. 7290 fits
        # with the default cv=5.
        param_grid = {
            'eta': [0.05, 0.1, 0.2],             # learning rate
            'max_leaves': [31, 47, 63],          # maximum number of leaves
            'max_depth': [5, 7, 10],             # maximum tree depth
            'subsample': [0.9],                  # fraction of rows used per tree
            'colsample_bytree': [0.7, 0.8, 1],   # fraction of features used per tree
            'min_child_weight': [5, 10],         # minimum sum of instance weight in a leaf (`1-10`)
            'alpha': [0, 0.5, 1],                # L1 regularization weight, default 0
            'lambda': [0, 0.5, 1],               # L2 regularization weight, default 1
        }

    model = xgb.XGBClassifier(objective=objective, eval_metric=eval_metric,
                              verbosity=verbosity)

    # Hyper-parameter tuning with GridSearchCV.
    grid_search = GridSearchCV(model, param_grid, scoring=scoring,
                               n_jobs=n_jobs, cv=cv)
    grid_search.fit(X, y)
    print('XGBoost Best Params: ', grid_search.best_params_)
    print('XGBoost Best Score: ', grid_search.best_score_)
    return grid_search.best_estimator_

In [58]:
# Run the grid search scored by accuracy and keep the estimator it returns.
xgboost = best_xgboost_clf(X=X, y=y, scoring='accuracy')

[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 22 extra nodes, 0 pruned nodes, max_depth=5
[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=5
[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 30 extra nodes, 0 pruned nodes, max_depth=5
[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=5
[16:16:31] INFO: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\tree\updater_prune.cc:98: tree pruning end, 28 extra nodes, 0 pruned nodes, max_

In [59]:
# Dump the tuned model under the name 'xgboost', then have submit() load it
# by that name and predict on the test frame.
save_model(model=xgboost, name='xgboost')
submit(model_name='xgboost', test=test)

xgboost is successfully saved!
xgboost is successfully used to test!


True