In [None]:
# Feature engineering: variance threshold (VT)
import pandas as pd
import numpy as np
# Import VarianceThreshold from sklearn.feature_selection:
from sklearn.feature_selection import VarianceThreshold
# Variance cut-off shared by both selection passes in this cell.
VT_THRESHOLD = 0.16
vt = VarianceThreshold(threshold=VT_THRESHOLD)

df = pd.read_csv('last_data.csv')

# Remove the identifier columns, the target (ZT) and the stray CSV index column
# so that only candidate features remain.
Before_Features = df.drop(['formula','composition','ZT','Unnamed: 0'], axis =1)

# VarianceThreshold is initialised like any other scikit-learn estimator (its
# default threshold is 0). It only works on numeric data and raises on
# categorical features, so the numeric columns are moved into their own frame.
The_mat_num = Before_Features.select_dtypes(include='number')
print(The_mat_num.shape)

# Fit the estimator. fit_transform returns a bare NumPy array with the
# low-variance features removed, which means the column names are lost.
transformed = vt.fit_transform(The_mat_num)

# To keep the column names, take the boolean mask from get_support() instead:
# it is True for every column that was NOT removed. The estimator is already
# fitted by the call above, so no second fit is needed.
mask = vt.get_support()
The_mat_reduced = The_mat_num.loc[:, mask]
# Report the shape AFTER the reduction (the original printed the unreduced
# frame a second time).
print(The_mat_reduced.shape)

# The raw features have different medians, quartiles and ranges, so their
# variances are not comparable with each other. One remedy is to normalise
# every feature by dividing it by its mean before thresholding.
# NOTE(review): a column whose mean is 0 turns into inf/NaN here — confirm that
# cannot happen for this data set.
normalized_df = The_mat_num / The_mat_num.mean()

vt = VarianceThreshold(threshold=VT_THRESHOLD)

# Fit on the normalised frame ...
vt.fit(normalized_df)

# ... read the mask of retained columns ...
mask = vt.get_support()

# ... and subset the ORIGINAL (un-normalised) numeric frame with it.
te_final = The_mat_num.loc[:, mask]
te_final


In [None]:
# Feature engineering: Pearson correlation
import matplotlib.pyplot as plt
import seaborn as sns
def correlation(dataset, threshold):
    """Drop columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. A column is removed when the absolute value of its
    correlation with any column positioned before it is strictly greater than
    ``threshold``; the earlier column of such a pair is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged columns; ``dataset`` is not modified.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Only the strictly lower triangle is inspected, so each pair is tested once
    # and it is always the later column of the pair that gets flagged.
    flagged = {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }
    return dataset.drop(flagged, axis=1)


# Remove features whose absolute pairwise correlation with an earlier feature
# exceeds 0.64, then report the resulting shape.
af_both = correlation(te_final, 0.64)
print('Shape sould be:', af_both.shape)

# Heat map of the correlations among the features that survived.
A_cor = af_both.corr()
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(A_cor, cmap=plt.cm.CMRmap_r, annot=False, ax=ax)
ax.set_title("Correlation", size=20)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split,KFold
# 10-fold splitter; it is only constructed in this cell, not used for the split below.
KF = KFold(n_splits = 10)
# Regression target.
Y = df['ZT']
# Hold out 20% of the rows for testing. train_test_split is a module-level
# function of sklearn.model_selection; KFold has no such method, so calling it
# on KF (as the original did) raises AttributeError.
X_train, X_test, y_train, y_test = train_test_split(af_both, Y, test_size=0.20, random_state=415)


In [None]:
from tqdm import tqdm
# Train the auto-encoder for N_epoch epochs, checkpointing whenever the mean
# validation loss improves. Relies on model, optimizer, loss_f, device,
# train_loader, valid_loader, train_loss_list and valid_loss_list being
# defined by earlier cells.
N_epoch = 1000
# Start from +inf so the first epoch always writes a checkpoint, whatever the
# scale of the loss (a fixed 9999 would never save if the loss stayed above it).
best_loss = float('inf')
par = tqdm(range(N_epoch))
for i in par:
    epoch_loss = 0
    epoch_loss_v = 0

    # ---- training pass ----
    # Put layers such as dropout / batch-norm (if the model has any) in training mode.
    model.train()
    for data, label in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        pre = model(data)
        # Reconstruction objective: the target is the input itself, so the
        # label is not used in this loop.
        loss = loss_f(pre, data)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # ---- validation pass ----
    # Switch to inference behaviour and disable gradient tracking.
    model.eval()
    with torch.no_grad():
        for data, label in valid_loader:
            data = data.to(device)
            pre = model(data)
            loss = loss_f(pre, data)
            epoch_loss_v += loss.item()

    # Checkpoint on the best mean validation loss per batch.
    valid_loss_mean = epoch_loss_v / len(valid_loader)
    if valid_loss_mean < best_loss:
        best_loss = valid_loss_mean
        torch.save(model.encoder.state_dict(), './result/Encoder-72.pth')
        torch.save(model.state_dict(), './result/Auto-Encoder-72.pth')

    # The logged histories hold the per-epoch SUMS of the batch losses, whereas
    # the progress bar and the checkpoint criterion use per-batch means.
    train_loss_list.append(epoch_loss)
    valid_loss_list.append(epoch_loss_v)
    # Rewrite both history files every epoch so progress can be inspected mid-run.
    np.savetxt('./result/train_loss.txt', train_loss_list, fmt='%.5f')
    np.savetxt('./result/valid_loss.txt', valid_loss_list, fmt='%.5f')

    par.set_description_str(f'Epoch: {i + 1:02}')
    par.set_postfix_str(f'Train: {epoch_loss / len(train_loader):.5f} | Valid: {valid_loss_mean:.5f}')


In [None]:
from tqdm import tqdm
device='cpu'
output_dim=72
# map_location lets a checkpoint that was written from another device be
# loaded onto `device`.
model.load_state_dict(torch.load('./result/Auto-Encoder-72.pth', map_location=device))
model.to(device)
# Inference only from here on.
model.eval()


def _encode_loader(loader):
    """Encode every batch of ``loader`` with ``model.encode``.

    Returns a ``(features, labels)`` pair of NumPy arrays, each built by
    concatenating the per-batch results along axis 0 in iteration order.
    """
    feature_chunks = []
    label_chunks = []
    with torch.no_grad():
        for data, label in tqdm(loader):
            data = data.to(device)
            # Convert to NumPy for BOTH loaders (the original train loop left
            # the encoder output as a tensor, unlike the validation loop).
            feature_chunks.append(model.encode(data).detach().cpu().numpy())
            label_chunks.append(label.to(device).detach().cpu().numpy())
    # Concatenate once at the end instead of re-copying the growing array on
    # every batch.
    return (np.concatenate(feature_chunks, axis=0),
            np.concatenate(label_chunks, axis=0))


train_data, train_label = _encode_loader(train_loader)
valid_data, valid_label = _encode_loader(valid_loader)

# One column per encoded dimension: 'feature 1' ... 'feature <output_dim>'.
feature_columns = [f'feature {i}' for i in range(1,output_dim+1)]

df_train=pd.DataFrame(train_data,
                      columns=feature_columns)
df_train['ZT']=train_label

df_valid=pd.DataFrame(valid_data,
                      columns=feature_columns)
df_valid['ZT']=valid_label

print(df_train.shape,df_valid.shape)
# The validation split is written under the 'Test-...' file name. The frame
# index is written as well, so it shows up as an extra unnamed column when the
# CSV is read back.
df_train.to_csv(f'./result/Train-Reduced-Feature-{output_dim}.csv')
df_valid.to_csv(f'./result/Test-Reduced-Feature-{output_dim}.csv')
print('Save data..')

In [None]:
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor

# Gradient-boosted regressor for ZT.
# - num_leaves is passed as the integer 128 (the original passed the string '128').
# - The original also passed verbose=2000 next to verbosity=-1; `verbose` is an
#   alias of `verbosity`, so the two contradicted each other. Only the explicit
#   verbosity=-1 is kept.
model_lgb = LGBMRegressor(boosting_type = "gbdt",
                      num_leaves = 128,
                      objective = 'regression',
                      max_depth = 8,
                      learning_rate = 0.03,
                      subsample_freq = 1,
                      subsample = 0.9,
                      bagging_seed = 11,
                      metric = 'mae',
                      verbosity = -1,
                      reg_alpha = 0.1,
                      colsample_bytree = 1.0,
                      n_estimators = 1000)
# NOTE(review): the split cell visible in this notebook defines X_train / X_test
# (capital X); x_train / x_test are not defined in the visible cells — confirm
# which variables are intended here.
model_lgb.fit(x_train,y_train)
pred=model_lgb.predict(x_test)
# Coefficient of determination on the held-out data (displayed as the cell output).
r2_score(y_test,pred)

In [None]:
# Preparation: load the full data set and min-max scale its feature columns.
df1_0 = pd.read_csv('final_data.csv')
# NOTE(review): the original remark here read "'Unnamed: 0'  #last_data:7130" —
# presumably that column must also be dropped when last_data.csv is loaded
# instead; confirm.
df2_0 = df1_0.drop(columns=['formula', 'composition'])
# Every remaining column except the ZT target is treated as a feature.
x_col = df2_0.columns.drop('ZT')
X = df2_0.loc[:, x_col]
scaler = MinMaxScaler()
# NOTE(review): X_scaled is not read by any cell visible here, while a later
# cell reads `feature_scaled` — confirm whether these are meant to be the same.
X_scaled = scaler.fit(X).transform(X)

# Train the regression model on the encoded (reduced) features.
df1=pd.read_csv('./result/Train-Reduced-Feature-72.csv')
print(df1.shape,df1.columns)
# 'Unnamed: 0' is the saved frame index that comes back as a column on read.
train_data=df1.drop(['ZT','Unnamed: 0'],axis=1)
df2 = pd.read_csv('./result/Test-Reduced-Feature-72.csv')
test_data = df2.drop(['ZT','Unnamed: 0'], axis=1)


# - num_leaves is passed as the integer 128 (the original passed the string '128').
# - The original also passed verbose=2000 next to verbosity=-1; `verbose` is an
#   alias of `verbosity`, so the two contradicted each other. Only the explicit
#   verbosity=-1 is kept.
model_lgb = LGBMRegressor(boosting_type="gbdt",
                              num_leaves=128,
                              objective='regression',
                              max_depth=8,
                              learning_rate=0.03,
                              subsample_freq=1,
                              subsample=0.9,
                              bagging_seed=11,
                              metric='mae',
                              verbosity=-1,
                              reg_alpha=0.1,
                              colsample_bytree=1.0,
                              n_estimators=1000)

model_lgb.fit(train_data, df1['ZT'])

pred = model_lgb.predict(test_data)

# Alternative model kept for reference:
# model_rf = RandomForestRegressor(n_estimators=300)
# model_rf.fit(train_data, df1['ZT'])
# pred = model_rf.predict(test_data)

# R2 of the predictions against the ZT column of the saved test file.
print(f'D{72} R2 Score:{r2_score(df2.ZT, pred ):.4f}')


# Rebuild the 146 -> 72 auto-encoder and restore the trained weights for inference.
device = 'cpu'
model = AutoEncoder(input_dim=146,
                    output_dim=72)
# map_location lets a checkpoint that was written from another device be
# loaded onto `device`.
model.load_state_dict(torch.load('./result/Auto-Encoder-72.pth', map_location=device))
model.to(device)
# Inference mode: disables training-only behaviour (e.g. dropout), if the model has any.
model.eval()
# Mean-squared error, used afterwards as the reconstruction error.
loss_f = nn.MSELoss()

In [None]:
# Score every row of feature_scaled one at a time. For each row:
#   * the auto-encoder reconstruction loss is appended to loss_list,
#   * a flag is appended to label_list (0 when the loss exceeds 0.0005, else 1),
#   * the LightGBM prediction on the encoded row is appended to zt_pred_list.
# The three lists, feature_scaled, model, loss_f and model_lgb come from other cells.
with torch.no_grad():
    for i in tqdm(range(len(feature_scaled))):
        # Shape the row as a batch of 146-wide vectors and convert it to a tensor.
        input_feature = torch.Tensor(feature_scaled[i].reshape(-1, 146))

        # Reconstruction error of this sample.
        pred = model(input_feature)
        loss = loss_f(pred, input_feature).item()
        loss_list.append(loss)
        label_list.append(0 if loss > 0.0005 else 1)

        # Encode 146 -> 72 features and predict ZT from the encoding.
        encode_feature = model.encode(input_feature).cpu().detach().numpy()

        zt_pred = model_lgb.predict(encode_feature)[0]
        zt_pred_list.append(zt_pred)