# 建模

In [3]:
import pandas as pd
import joblib
import psutil
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [4]:
train = pd.read_csv('../input/train_final.csv')
test = pd.read_csv('../input/test_final.csv')

X = train.drop(['Survived'], axis=1)
y = train['Survived']


def save_model(model, name):
    '''保存模型'''
    joblib.dump(model, f'../models/{name}.pkl')
    print(f'{name} is successfully saved!')
    return True


def submit(model_name : str,
           test : pd.DataFrame):
    ''' 
    保存提交（预测）的数据\n
    model_name: 模型的名称（只传入点号之前的名称）\n
    test: 需要预测的数据集
    '''
    # 载入模型
    model = joblib.load(f'../models/{model_name}.pkl')
    # 使用模型预测
    y_pred = model.predict(test)
    # 保存提交
    submission = pd.read_csv('../submission/submission.csv')
    submission['Survived'] = y_pred.astype(int)
    submission.to_csv(f'../submission/{model_name}.csv', index=None)
    print(f'{model_name} is successfully used to test!')
    return True

def memory():
    mem = psutil.virtual_memory()
    print(f"可用内存: {mem.available / 1024 / 1024:.2f} MB")
    print(f"内存使用率: {mem.percent}%")

In [None]:
def best_lightgbm_clf(X : pd.DataFrame,
                      y : pd.DataFrame, 
                      scoring='accuracy',
                      boosting_type = 'gbdt'):
    '''
    lightgbm参数寻优\n
    X：输入模型的特征\n
    y：输入模型的标签\n
    scoring：模型的评价标准，可取的值为 ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']\n
    boosting_type:
        gbdt: 训练速度快，适用于大多数场景
        dart: 数据量小且容易过拟合的场景
        rf: 高维数据或需要强正则化的场景，防止过拟合的能力强
        
    return: 输出最佳的模型
    '''
    # 定义参数网格
    param_grid = {'num_leaves': [31, 63],  # [31, 63]
                  'max_depth': [3, 5, 7, -1],  # [3, 5, 7, -1]
                  'min_split_gain': [0, 0.1, 0.2],  # [0, 0.1, 0.2]
                  'min_child_weight': [0.001, 0.01, 0.1, 1],  # [0.001, 0.01, 0.1, 1]
                  'min_child_samples': [10, 20, 50],  # [10, 20, 50]
                  'subsample_freq': [5, 10],  # [5, 10]
                  'colsample_bytree' : [0.8, 1],  # [0.8, 1]
                  'reg_alpha': [0, 0.1],  # [0, 0.1]
                  'reg_lambda': [0, 0.1]}  # [0, 0.1]
    
    model = lgb.LGBMClassifier(boosting_type=boosting_type, 
                               learning_rate=0.01,
                               n_estimators=200,  # 1000
                               n_jobs=-1)
    
    # 使用GridSearchCV进行超参数调优
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, 
                               n_jobs=-1, cv=5)
    grid_search.fit(X, y)
    print(f'LightGBM Best Params: ',grid_search.best_params_)
    print(f'LightGBM Best Score: ', grid_search.best_score_)
    return grid_search.best_estimator_

In [11]:
lightgbm = best_lightgbm_clf(X, y, scoring='accuracy')

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288
AdaBoost Best Params:  {'num_leaves': 31, 'reg_lambda': 0.1}
AdaBoost Best Score:  0.8181846713953925


In [12]:
save_model(lightgbm, 'lightgbm')
submit('lightgbm', test)

lightgbm is successfully saved!
lightgbm is successfully used to test!


True