# 输出特征重要性

目前基本10个特征，使用lightgbm在训练数据上训练，并输出特征重要性

In [1]:
import sklearn
import pandas as pd
import numpy as np
import time
import random
import pickle

In [2]:
import lightgbm
from lightgbm import LGBMClassifier

In [3]:
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score

In [5]:
import json

# 0. 读取数据

读取数据并转换为机器学习可用的矩阵（numpy格式）

**240403更新，现在机器学习部分会读取`data/settings`文件夹中的`feature.json`和`label.json`文件作为特征以及标签的字段名**

二者与读取进来的文件中相关字段应该是匹配的。

In [6]:
feature_settings_fpath = '../data/settings/feature.json'
label_settings_fpath = '../data/settings/label.json'

In [7]:
with open(feature_settings_fpath, 'r',encoding='utf-8') as file:
    feature_dict = json.load(file)

In [8]:
with open(label_settings_fpath, 'r',encoding='utf-8') as file:
    label_dict = json.load(file)

In [9]:
label_dict

{'index': [0, 1]}

In [10]:
feature_dict

{'q/qm': [0, 200],
 'ploss': [-10000, 10000],
 'fre': [110000, 150000],
 'vpa': [0, 400000],
 'papower': [0, 65000],
 'ch': [0, 8],
 'ce': [-127, 128],
 'rppower': [0, 65000],
 'ss': [0, 255],
 'eff': [0, 1.3]}

In [11]:
feature_columns = list(feature_dict.keys())

In [12]:
# 训练集数据路径
train_fpath = '../data/test_data/train_240409.xlsx'
# 测试集数据路径
test_fpath = '../data/test_data/test_240409.xlsx'

In [13]:
load_train_df = pd.read_excel(train_fpath)
load_test_df = pd.read_excel(test_fpath)

#### 将所有columns名转换为小写

In [14]:
load_train_df.columns = [i.lower() for i in load_train_df.columns]
load_test_df.columns = [i.lower() for i in load_test_df.columns]

#### 兼容之前的代码写法

In [15]:
feature_key_lst = feature_columns

feature_range_dict = feature_dict

label_key = list(label_dict.keys())[0]

### 保留原始特征副本

In [16]:
cp_train_df = load_train_df.copy()
cp_test_df = load_test_df.copy()

cp_train_df['slice'] = ['train' for i in range(cp_train_df.shape[0])]
cp_test_df['slice'] = ['test' for i in range(cp_test_df.shape[0])]

cp_merge_df = pd.concat([cp_train_df,cp_test_df])

In [17]:
merge_df = cp_merge_df.copy()

# 1. 缺失数据补全（todo）

如果存在部分列数据缺失，在这一步骤进行数据的补全以便于减少这部分缺失数据的影响。

### 1.1 补全为0

缺失数据全部用`0`补全

In [18]:
for col in merge_df.columns:
    merge_df[col] = merge_df[col].fillna(0)

# 2. 特征工程

这一步骤针对每列特征进行处理，处理手段有很多种，例如数据分桶，数据扩展，数据统计等等，此处只对数据进行归一化放缩，使得所有特征值范围均在`0,1`之间，以便于机器学习模型学习的稳定性。

后续根据问题的深入程度不同，可以将更多的特征处理手段应用在这一步骤，来增强整体机器学习系统的表现。

## 2.0 互信息分析

参考kaggle：https://www.kaggle.com/code/ryanholbrook/mutual-information

互信息衡量特征和目标值之间的关系

In [19]:
# from sklearn.feature_selection import mutual_info_regression

# def make_mi_scores(X, y, discrete_features):
#     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
#     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
#     mi_scores = mi_scores.sort_values(ascending=False)
#     return mi_scores

In [20]:
# X = train_df.copy()
# y = X.pop("index")

# # Label encoding for categoricals
# for colname in X.select_dtypes("object"):
#     X[colname], _ = X[colname].factorize()

# # All discrete features should now have integer dtypes (double-check this before using MI!)
# discrete_features = X.dtypes == np.int64

In [21]:
# discrete_features

In [22]:
# mi_scores = make_mi_scores(X, y, discrete_features)

### 特征互信息相关性

In [23]:
# mi_scores

## 2.1 数据归一化

根据特征值范围，将现有特征放缩到`(0,1)`之间

In [24]:
for k in feature_key_lst:
    # 数据范围上界
    ub = feature_range_dict[k][0]
    # 数据范围下界
    lb = feature_range_dict[k][1]
    
    merge_df[k] = merge_df[k].apply(lambda x:(x-lb)/(ub-lb))

## 2.2 数据取对数

以自然对数`e`为底

In [25]:
# merge_df = cp_merge_df.copy()

In [26]:
# for k in feature_key_lst:
#     merge_df[k] = merge_df[k].apply(lambda x:np.log(abs(x) if not x==0 else 1e-6))

## 2.3 特征交互除法

1. 在归一化数据基础上做
2. 两两交互
3. a除以b

In [27]:
for k1 in feature_columns:
    for k2 in feature_columns:
        if k1!=k2:
            merge_df[f'{k1}_div_{k2}'] = [a/(b if b!=0 else 1) for a,b in zip(merge_df[k1],merge_df[k2])]

In [28]:
merge_df

Unnamed: 0,index,q/qm,ploss,fre,vpa,papower,ch,ce,rppower,ss,...,ss_div_eff,eff_div_q/qm,eff_div_ploss,eff_div_fre,eff_div_vpa,eff_div_papower,eff_div_ch,eff_div_ce,eff_div_rppower,eff_div_ss
0,0,0.44,0.55855,0.1750,0.970137,0.958108,-0.000,0.533333,0.961538,0.431373,...,1.468426,0.667649,0.525943,1.678659,0.302808,0.306610,0.293765,0.550810,0.305516,0.681002
1,0,0.44,0.56235,0.1250,0.953228,0.888615,0.125,0.501961,0.902308,0.431373,...,1.325956,0.739385,0.578518,2.602635,0.341292,0.366108,2.602635,0.648117,0.360553,0.754173
2,0,0.44,0.55915,0.1250,0.940720,0.550938,0.125,0.498039,0.604969,0.431373,...,1.334186,0.734824,0.578239,2.586580,0.343697,0.586858,2.586580,0.649191,0.534445,0.749520
3,0,0.44,0.58175,0.1250,0.938763,0.449062,-0.000,0.501961,0.506231,0.431373,...,1.388881,0.705886,0.533889,2.484720,0.330850,0.691642,0.310590,0.618754,0.613534,0.720004
4,0,0.44,0.57265,0.1250,0.936492,0.320385,-0.000,0.494118,0.379554,0.431373,...,1.448820,0.676683,0.519935,2.381925,0.317932,0.929322,0.297741,0.602570,0.784449,0.690217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,1,0.56,0.51850,0.1250,0.922785,0.732969,0.250,0.494118,0.887123,0.403922,...,0.598546,1.205067,1.301519,5.398700,0.731305,0.920690,2.699350,1.365742,0.760703,1.670714
79,1,0.56,0.43890,0.1250,0.920338,0.418600,0.375,0.494118,0.597585,0.403922,...,0.863860,0.834960,1.065340,3.740622,0.508050,1.117004,1.246874,0.946288,0.782446,1.157595
80,1,0.56,0.44080,0.1250,0.919403,0.359692,0.250,0.494118,0.545877,0.403922,...,0.888831,0.811502,1.030947,3.635530,0.494279,1.263417,1.817765,0.919703,0.832498,1.125073
81,1,0.56,0.53630,0.6475,0.970137,0.953877,0.250,0.505882,0.963354,0.403922,...,1.038831,0.694327,0.725011,0.600499,0.400792,0.407624,1.555293,0.768604,0.403614,0.962621


# 3. 模型学习

使用`sklearn`工具库自带的机器学习模型对训练数据进行拟合，并在测试数据上测试，评价指标为`precision`,`recall`,`f1_score`,`accuracy`。

不同模型的调参方法不同，可以根据需要进行参数调整。

## 3.0 输出特征矩阵

输出训练特征矩阵分别为：`train_X`和`test_X`，测试标签为`train_y`和`test_y`。

In [29]:
real_feature_lst = [k for k in merge_df.columns if not k in ('index','slice')]

train_df = merge_df[merge_df.slice == 'train'].copy()
test_df = merge_df[merge_df.slice == 'test'].copy()

train_X = train_df[real_feature_lst].to_numpy()

test_X = test_df[real_feature_lst].to_numpy()

train_y = train_df[label_key].to_numpy()
test_y = test_df[label_key].to_numpy()

## 3.1 lightgbm模型


In [30]:
def lgbr_model(train_X,train_y):
    # 设置分类阈值 超过该值的分类为1
    threshold = 0.5
    
    model = LGBMClassifier(num_leaves=500,n_estimators=5000,
                       verbosity=-1)
    
    model.fit(train_X,train_y)

    pred_test_y = model.predict(test_X)
    
    pred_train_y = model.predict(train_X)
    
    ### 训练集分数
    p_score = precision_score(train_y, pred_train_y)
    r_score = recall_score(train_y, pred_train_y)
    f_score = f1_score(train_y, pred_train_y)

    ### 训练集结果
    
    print(f'Train Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')

    ### 测试集分数

    p_score = precision_score(test_y, pred_test_y)
    r_score = recall_score(test_y, pred_test_y)
    f_score = f1_score(test_y, pred_test_y)

    ### 测试集结果

    print(f'Test Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')
    
    return model

In [31]:
model = lgbr_model(train_X,train_y)

Train Model:LGBMClassifier(n_estimators=5000, num_leaves=500, verbosity=-1), Precision:1.0, Recall:1.0, F1:1.0
Test Model:LGBMClassifier(n_estimators=5000, num_leaves=500, verbosity=-1), Precision:0.9535, Recall:0.9535, F1:0.9535


In [33]:
model.booster_.save_model('../data/model_lgbc.txt')

<lightgbm.basic.Booster at 0x2997c5a8550>

## 3.2 输出特征重要性

In [32]:
model.feature_importances_

sorted(zip([i for i in train_df.columns[1:]], model.feature_importances_), key=lambda x:x[1] ,reverse=True)

feature_names = [i[0] for i in sorted(zip([i for i in real_feature_lst], model.feature_importances_), key=lambda x:x[1] ,reverse=False)]

# len(model.feature_importances_)

# len(feature_names)

In [33]:
# ax = lightgbm.plot_importance(booster=model,
#                     )
# ax.set_yticklabels(feature_names)

## 3.3 线性回归模型

In [34]:
def liner_model(train_X,train_y,test_X,test_y):
    # 设置分类阈值 超过该值的分类为1
    threshold = 0.5
    
    model = LinearRegression()
    
    model.fit(train_X,train_y)

    pred_test_y = model.predict(test_X)
    pred_test_y = np.array([1 if i>threshold else 0 for i in pred_test_y])
    
    pred_train_y = model.predict(train_X)
    pred_train_y = np.array([1 if i>threshold else 0 for i in pred_train_y])
    
    ### 训练集分数
    p_score = precision_score(train_y, pred_train_y)
    r_score = recall_score(train_y, pred_train_y)
    f_score = f1_score(train_y, pred_train_y)

    ### 训练集结果
    
    print(f'Train Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')

    ### 测试集分数

    p_score = precision_score(test_y, pred_test_y)
    r_score = recall_score(test_y, pred_test_y)
    f_score = f1_score(test_y, pred_test_y)

    ### 测试集结果

    print(f'Test Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')
    
    return model

In [35]:
liner_model(train_X,train_y,test_X,test_y)

Train Model:LinearRegression(), Precision:0.9835, Recall:0.9958, F1:0.9896
Test Model:LinearRegression(), Precision:0.8511, Recall:0.9302, F1:0.8889


## 3.3 逻辑回归模型

In [36]:
def logistc_model(train_X,train_y,test_X,test_y):
    # 设置分类阈值 超过该值的分类为1
    threshold = 0.5
    
    model = LogisticRegression(max_iter=5000)
    
    model.fit(train_X,train_y)

    pred_test_y = model.predict(test_X)
    pred_test_y = np.array([1 if i>threshold else 0 for i in pred_test_y])
    
    pred_train_y = model.predict(train_X)
    pred_train_y = np.array([1 if i>threshold else 0 for i in pred_train_y])
    
    ### 训练集分数
    p_score = precision_score(train_y, pred_train_y)
    r_score = recall_score(train_y, pred_train_y)
    f_score = f1_score(train_y, pred_train_y)

    ### 训练集结果
    
    print(f'Train Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')

    ### 测试集分数

    p_score = precision_score(test_y, pred_test_y)
    r_score = recall_score(test_y, pred_test_y)
    f_score = f1_score(test_y, pred_test_y)

    ### 测试集结果

    print(f'Test Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')
    
    return model

In [37]:
logistc_model(train_X,train_y,test_X,test_y)

Train Model:LogisticRegression(max_iter=5000), Precision:0.9755, Recall:0.9958, F1:0.9856
Test Model:LogisticRegression(max_iter=5000), Precision:0.9762, Recall:0.9535, F1:0.9647


## 3.4 支持向量分类

In [38]:
def svm_model(train_X,train_y,test_X,test_y):
    # 设置分类阈值 超过该值的分类为1
    threshold = 0.5
    
    model = SVC(max_iter=1_000)
    
    model.fit(train_X,train_y)

    pred_test_y = model.predict(test_X)
    pred_test_y = np.array([1 if i>threshold else 0 for i in pred_test_y])
    
    pred_train_y = model.predict(train_X)
    pred_train_y = np.array([1 if i>threshold else 0 for i in pred_train_y])
    
    ### 训练集分数
    p_score = precision_score(train_y, pred_train_y)
    r_score = recall_score(train_y, pred_train_y)
    f_score = f1_score(train_y, pred_train_y)

    ### 训练集结果
    
    print(f'Train Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')

    ### 测试集分数

    p_score = precision_score(test_y, pred_test_y)
    r_score = recall_score(test_y, pred_test_y)
    f_score = f1_score(test_y, pred_test_y)

    ### 测试集结果

    print(f'Test Model:{model.__str__()}, Precision:{p_score:.4}, Recall:{r_score:.4}, F1:{f_score:.4}')
    
    return model

In [39]:
svm_model(train_X,train_y,test_X,test_y)

Train Model:SVC(max_iter=1000), Precision:0.9467, Recall:0.9625, F1:0.9545
Test Model:SVC(max_iter=1000), Precision:0.8936, Recall:0.9767, F1:0.9333
