# Version

In [None]:
"""
Author: Zhang Lu
Version: 1.0.0
Date:2025-02-14
Description: Happy Valentine's Day.
"""

# Import package & define library

In [None]:
import os
from lightgbm import LGBMClassifier
import json
import importlib
import pandas as pd
import gc

## import manual functions and config

In [None]:
# Project-local helper modules. Each module is reloaded before its star-import
# so that edits made to the module file are picked up without restarting the
# kernel.
# NOTE(review): several names this notebook uses without defining them (e.g.
# `np`, `f`, `data_f`, `y`, `yearmonth`, `ext_list`) presumably arrive through
# these star-imports — confirm which module provides each.
import Complex_Utility
importlib.reload(Complex_Utility)
from Complex_Utility import *

import Simple_Utility
importlib.reload(Simple_Utility)
from Simple_Utility import *

import config
importlib.reload(config)
from config import *

In [None]:
# Switch the working directory to `f` so that the relative paths used further
# down ('data/ds_all.pkl', 'json/best_params.json') resolve against it, then
# echo the resulting directory.
# NOTE(review): `f` is not defined in this notebook — presumably it is set in
# `config`; confirm.
os.chdir(f)
os.getcwd()

In [2]:
# Memory monitoring: report how much of the system RAM is currently in use.
import psutil

mem_usage = psutil.virtual_memory()

# One report line per figure: used percentage, used bytes, total bytes.
for report_line in (
    f"已使用内存百分比：{mem_usage.percent}%",
    f"已使用内存：{mem_usage.used} bytes",
    f"总内存：{mem_usage.total} bytes",
):
    print(report_line)

已使用内存百分比：62.7%
已使用内存：9227943936 bytes
总内存：17179869184 bytes


# Data input, classify Vars

## input data

In [None]:
# Load the full input dataset from the pickle file at `data_f`.
# NOTE(review): `data_f` is not defined in this notebook — presumably it is
# set in `config`; confirm.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle(data_f)

## define Subgroup and make tag

In [None]:
# Subgroup definitions.
#   sub0: rows with y > -1
#   sub1: rows of sub0 whose tag1 == 1
#   sub2: rows of sub0 whose tag2 == 1
sub0 = df['y'] > -1
sub1 = sub0 & (df['tag1'] == 1)
sub2 = sub0 & (df['tag2'] == 1)

# Single-column labelling of the subgroups.
# np.select returns the choice of the FIRST condition that matches. sub1 and
# sub2 are subsets of sub0, so they must be listed before it; with sub0 first,
# 'G1' and 'G2' could never be assigned. A row that belongs to both sub1 and
# sub2 is labelled 'G1' here — use the per-group columns below when the
# subgroups overlap.
df['sub_group'] = np.select(
    [sub1, sub2, sub0],
    ['G1', 'G2', 'G0'],
    default='Empty',
)

# One column per subgroup, so that overlapping subgroups do not hide each other.
df['G0'] = np.select([sub0], ['G0'], default='Empty')
df['G1'] = np.select([sub1], ['G1'], default='Empty')
df['G2'] = np.select([sub2], ['G2'], default='Empty')

## Classify Vars

In [None]:
# Variable classification.
# The three bare references below only evaluate the names: each raises a
# NameError right here if the corresponding list has not been defined.
KEY_list

# Validation Y labels other than the model's own Y.
Y_list

del_list

# Modelling data: remove the KEY_list and del_list columns.
_cols_to_remove = KEY_list + del_list
df = df.drop(columns=_cols_to_remove)

## Choose Subgroup to continue

In [None]:
# Optional column clean-up steps, kept for reference:
# df.rename(columns={'y':'flag','event_mth':'yearmonth'},inplace=True)
# df.columns = df.columns.str.replace(r'[()\-\+\,\s]','_',regex=True)

# Row filters: label equals 0, label equals 1, and month strictly before
# '202402' (compared against the string literal).
senario1 = df[y].eq(0)
senario2 = df[y].eq(1)
senario3 = df[yearmonth].lt('202402')

# Keep rows whose label is 0 or 1 and whose month passes the cut-off.
_keep_rows = (senario1 | senario2) & senario3
df_sample = df.loc[_keep_rows]

In [None]:
# The unfiltered frame is not used below this point; drop the reference to it.
del df

# Share of rows in the sample whose label equals 1.
_n_label_1 = int((df_sample[y] == 1).sum())
_n_rows = len(df_sample[y])
print('客群的负样本比例:{}'.format(_n_label_1 / _n_rows))

# Vars overview & split dataframe

## (Optional) Statistical overview (TODO: optimize and simplify)

In [None]:
# df_part_describe, part_tot_cols = val_describe_tot(df_sample, ext_list,
#                                                    output_file, y)

## Filter the vars that can be used

In [None]:
# Columns passed on to the train/test/validation split. Currently this is
# every column of df_sample; the commented-out alternative below would exclude
# the columns listed in Y_list instead.
# part_tot_cols = list(set(df_sample.columns) - set(Y_list))
part_tot_cols = df_sample.columns

## Discrete vars conversion

In [None]:
# Cast every column named in list_dis to the pandas 'category' dtype.
for discrete_col in list_dis:
    as_category = df_sample[discrete_col].astype('category')
    df_sample[discrete_col] = as_category

## Train, Test, OOT split

In [None]:
# Split the sample with the project helper sample_select, restricted to the
# columns in part_tot_cols. It returns ten objects: a full frame plus x / y
# parts for each of the validation, train and test sets, and ds_all.
# NOTE(review): sample_select is defined outside this notebook. Presumably
# `vldt_ym` marks the month(s) held out as the validation (out-of-time) set
# and `ym` names the month column — confirm against the helper.
vldt, vldt_x, vldt_y, train, train_x, train_y, test, test_x, test_y, ds_all = sample_select(
    df=df_sample[part_tot_cols],
    y=y,
    vldt_ym=vldt_ym,
    ym=yearmonth)

In [None]:
# Save a copy of all data for later use.
# DataFrame.to_pickle does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs('data', exist_ok=True)
ds_all.to_pickle(r'data/ds_all.pkl')

# Share of label-1 rows in the train, test and validation targets, printed on
# one line in that order.
print(*(
    int((labels == 1).sum()) / len(labels)
    for labels in (train_y, test_y, vldt_y)
))

# Fit model

## Drop tags that can't be included in the model

In [None]:
# Remove the ext_list columns from the feature frames of all three splits.
train_x_dist, test_x_dist, vldt_x_dist = (
    features.drop(columns=ext_list)
    for features in (train_x, test_x, vldt_x)
)

## Bayesian Optimization

In [None]:
# Import the project's Bayesian optimization module; it is reloaded first so
# that edits to BayesianOptimal.py are picked up without restarting the kernel.
import BayesianOptimal
importlib.reload(BayesianOptimal)
from BayesianOptimal import run_bayesian_optimization

# Run the Bayesian optimization on the train / test / validation splits.
# The call returns three values: the model, a result frame, and a third value
# that is discarded here.
# NOTE(review): presumably `model` is an estimator configured with the best
# parameters found and `result_df` holds the search history — confirm against
# BayesianOptimal.
model, result_df, _ = run_bayesian_optimization(train_x_dist, train_y, test_x_dist, test_y, vldt_x_dist, vldt_y)

### Load existing config from a previous Bayesian optimization run

In [None]:
# Load the hyper-parameters stored in json/best_params.json.
# The file handle is deliberately NOT called `f`: `f` is the name passed to
# os.chdir(f) near the top of the notebook, and rebinding it to a file object
# here would break that cell when it is re-run.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
# NOTE(review): best_params is not used anywhere else in the visible notebook —
# the model fitted below is the one returned by run_bayesian_optimization. If
# this cell is meant as an alternative to running the optimization, the model
# presumably has to be built from these parameters (e.g. with the imported
# LGBMClassifier) — confirm the intended workflow.
with open('json/best_params.json', 'r', encoding='utf-8') as params_file:
    best_params = json.load(params_file)

In [None]:
# Train the model on the training split.
model.fit(train_x_dist, train_y)

# Print the verification results and keep the returned figures: KS and AUC for
# train / test / validation, followed by three PSI values.
(
    train_ks, train_auc,
    test_ks, test_auc,
    vldt_ks, vldt_auc,
    train_test_psi, train_vldt_psi, traintest_vldt_psi,
) = rst_print(
    model,
    train_x_dist, train_y,
    test_x_dist, test_y,
    vldt_x_dist, vldt_y,
)

## select top vars to simplify the model

In [None]:
# Choose the top variables with which to re-fit the model.
TOP_N_VARS = 100  # number of highest-importance variables to keep

# Pair every training column with its importance in a single step. Building
# the frame from two equal-length sequences raises a ValueError if the model
# was fitted on a different set of columns (for example when this cell is
# re-run after the final re-fit) instead of silently mispairing names and
# values.
df_imp = pd.DataFrame({
    'col_name': list(train_x_dist.columns),
    'imp_val': model.feature_importances_,
})

# Show each variable with its importance, in column order.
for col_name, imp_val in zip(df_imp['col_name'], df_imp['imp_val']):
    print(col_name, imp_val)

# Highest importance first. A stable sort keeps tied variables in their
# original column order, so the cut at TOP_N_VARS is reproducible.
df_imp = df_imp.sort_values(by='imp_val', ascending=False, kind='stable')
df_imp_val = df_imp['col_name'].head(TOP_N_VARS).tolist()
# list_dis_new= list(set(list_dis) & set(df_imp_val))
# print(list_dis_new)

## manually drop some vars

In [None]:
# Variables excluded by hand from the selected list.
_manual_drop = {'UPPB027'}

# Filter with a comprehension rather than a set difference: a set discards the
# existing order of df_imp_val, and the iteration order of a set of strings
# can differ between interpreter sessions, which would feed the model a
# different column order from run to run.
df_imp_val = [var for var in df_imp_val if var not in _manual_drop]

## Fit the final model

In [None]:
# Re-fit the model using only the selected variables.
model.fit(train_x_dist[df_imp_val], train_y)

# Print the verification results for the reduced model and keep the returned
# figures: KS and AUC for train / test / validation, followed by three PSI
# values.
(
    train_ks, train_auc,
    test_ks, test_auc,
    vldt_ks, vldt_auc,
    train_test_psi, train_vldt_psi, traintest_vldt_psi,
) = rst_print(
    model,
    train_x_dist[df_imp_val], train_y,
    test_x_dist[df_imp_val], test_y,
    vldt_x_dist[df_imp_val], vldt_y,
)

# Save the model files

In [None]:
# Save the model files via the project helper model_auto_file, passing the
# full dataset, the fitted model, and the output locations.
# NOTE(review): model_folder, pkl_file, model_var_file and score_file are not
# defined in this notebook — presumably they are set in `config`. What exactly
# is written to each location depends on the helper; confirm there.
model_auto_file(ds_all, model, model_folder, pkl_file, model_var_file, score_file)