# 读取数据，处理数据

In [2]:
import numpy as np
import pandas as pd


# 读取数据
train_datas = pd.read_csv('./work/train.csv')
test_datas = pd.read_csv('./work/test.csv')

In [3]:
train_datas = train_datas.loc[:, ~train_datas.columns.str.contains("^Unnamed")]

In [4]:

# 清理数据
c = train_datas.columns
print(c)
clean_c = ['category', 'description', 'ein', 'leader',
           'motto', 'name', 'state', 'subcategory', 'size']
keys = []
for i in c:
    if i not in clean_c:
        keys.append(i)
print(keys)

train_datas = train_datas[keys]
for key in keys:
        train_datas[key] = pd.to_numeric(train_datas[key], errors='coerce').fillna(0)
        train_datas[key] = train_datas[key].replace(0, train_datas[key].median())

train_datas.drop(labels=[1184, 4476], inplace=True)
count = 0
for score in train_datas['score'].values:
    if score > 100 or score <= 0:
        print(score, count)
    count += 1
keys.remove('score')
print(keys)
test_datas = test_datas[keys]
for key in keys:
    test_datas[key] = pd.to_numeric(test_datas[key], errors='coerce').fillna(0)
    test_datas[key] = test_datas[key].replace(0, test_datas[key].median())
print(train_datas.shape, test_datas.shape)
print(train_datas.corr())

Index(['ascore', 'category', 'description', 'ein', 'tot_exp', 'admin_exp_p',
       'fund_eff', 'fund_exp_p', 'program_exp_p', 'fscore', 'leader',
       'leader_comp', 'leader_comp_p', 'motto', 'name', 'tot_rev', 'score',
       'state', 'subcategory', 'size', 'program_exp', 'fund_exp', 'admin_exp'],
      dtype='object')
['ascore', 'tot_exp', 'admin_exp_p', 'fund_eff', 'fund_exp_p', 'program_exp_p', 'fscore', 'leader_comp', 'leader_comp_p', 'tot_rev', 'score', 'program_exp', 'fund_exp', 'admin_exp']
['ascore', 'tot_exp', 'admin_exp_p', 'fund_eff', 'fund_exp_p', 'program_exp_p', 'fscore', 'leader_comp', 'leader_comp_p', 'tot_rev', 'program_exp', 'fund_exp', 'admin_exp']
(7398, 14) (1000, 13)
                 ascore   tot_exp  admin_exp_p      fund_eff  fund_exp_p  \
ascore         1.000000  0.083334    -0.190062  6.322378e-03   -0.001976   
tot_exp        0.083334  1.000000    -0.087523 -1.891372e-03   -0.001891   
admin_exp_p   -0.190062 -0.087523     1.000000 -1.995143e-03   -0.0019

In [5]:
import numpy as np

# 标签特征分离
train_labels = train_datas[['score']].values.astype('float32')
train_dataset = train_datas[keys].values.astype('float32')
split = int(train_dataset.shape[0]*0.8)
seed = np.random.randint(10000)
np.random.seed(seed)
np.random.shuffle(train_labels)
np.random.seed(seed)
np.random.shuffle(train_dataset)
val_dataset = train_dataset[split:, :]
val_labels = train_labels[split:, :]
train_dataset = train_dataset[:split, :]
train_labels = train_labels[:split, :]
test_dataset = test_datas.values.astype('float32')
print(train_labels.shape, train_dataset.shape, test_dataset.shape, val_dataset.shape, val_labels.shape)

# 数据标准化
feature_mean = train_dataset.mean()
feature_std = train_dataset.std()
label_mean = train_labels.mean()
label_std = train_labels.std()

train_dataset = (train_dataset - feature_mean) / feature_std
test_dataset = (test_dataset - feature_mean) / feature_std
val_dataset = (val_dataset - feature_mean) / feature_std
train_labels = (train_labels - label_mean) / label_std
val_labels = (val_labels - label_mean) / label_std

(5918, 1) (5918, 13) (1000, 13) (1480, 13) (1480, 1)


# 网络构建

In [9]:
import paddle
import paddle.nn as nn
import numpy as np



model = nn.Sequential(
    nn.Linear(in_features=train_dataset.shape[-1], out_features=16),
    nn.BatchNorm1D(num_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=8),
    nn.BatchNorm1D(num_features=8),
    nn.ReLU(),    
    nn.Linear(in_features=8, out_features=1)
)

optimizer = paddle.optimizer.Adam(learning_rate=1e-2, parameters=model.parameters())
loss = nn.MSELoss()

# 开始训练

In [10]:
import numpy as np


epochs = 100
batch_size = 32

for epoch in range(epochs):
    # 随机打乱训练集
    seed = np.random.randint(10000)
    np.random.seed(seed)
    np.random.shuffle(train_dataset)
    np.random.seed(seed)
    np.random.shuffle(train_labels)
    # 训练
    model.train()
    train_loss = 0
    for i in range(0, train_dataset.shape[0], batch_size):
        batch_datas = paddle.to_tensor(train_dataset[i:batch_size+i].copy())
        batch_labels = paddle.to_tensor(train_labels[i:batch_size+i].copy())
        # print(batch_datas.shape, batch_labels.shape)
        # 前向传播
        preds = model(batch_datas)
        # 计算损失
        step_loss = loss(preds, batch_labels)
        train_loss += step_loss.numpy()[0]
        # 反向传播
        step_loss.backward()
        # 跟新参数
        optimizer.step()
        optimizer.clear_grad()
    print('Epochs/Epoch:{}/{} Train MSELoss:{}'.format(epochs, epoch+1, train_loss/(train_dataset.shape[0]//batch_size)))
    # 验证
    model.eval()
    val_loss = 0
    for i in range(val_dataset.shape[0]):
        batch_datas = paddle.to_tensor(val_dataset[i:1+i].copy())
        batch_labels = paddle.to_tensor(val_labels[i:1+i].copy())
        # print(batch_datas.shape, batch_labels.shape)
        # 前向传播
        preds = model(batch_datas)
        # 计算损失
        val_loss += paddle.nn.functional.mse_loss(preds * label_std + label_mean, batch_labels* label_std + label_mean, reduction='sum').numpy()[0]
    print('Epochs/Epoch:{}/{} Val MSELoss:{}'.format(epochs, epoch+1, val_loss/val_dataset.shape[0]))    

  and should_run_async(code)


Epochs/Epoch:100/1 Train MSELoss:0.8662928736404233
Epochs/Epoch:100/1 Val MSELoss:79.36422948530137
Epochs/Epoch:100/2 Train MSELoss:0.7967641909161339
Epochs/Epoch:100/2 Val MSELoss:76.23801385132148
Epochs/Epoch:100/3 Train MSELoss:0.7956313485198695
Epochs/Epoch:100/3 Val MSELoss:81.6086545732226
Epochs/Epoch:100/4 Train MSELoss:0.8068522785830757
Epochs/Epoch:100/4 Val MSELoss:89.96068360010199
Epochs/Epoch:100/5 Train MSELoss:0.7983816389156424
Epochs/Epoch:100/5 Val MSELoss:84.04374363167865
Epochs/Epoch:100/6 Train MSELoss:0.7625259291380644
Epochs/Epoch:100/6 Val MSELoss:82.00356047158401
Epochs/Epoch:100/7 Train MSELoss:0.7637538367315478
Epochs/Epoch:100/7 Val MSELoss:72.25186948505426
Epochs/Epoch:100/8 Train MSELoss:0.7346142600088016
Epochs/Epoch:100/8 Val MSELoss:81.02978925035461
Epochs/Epoch:100/9 Train MSELoss:0.7167169481677853
Epochs/Epoch:100/9 Val MSELoss:82.848848801964
Epochs/Epoch:100/10 Train MSELoss:0.747574650965955
Epochs/Epoch:100/10 Val MSELoss:79.3640950

Epochs/Epoch:100/81 Train MSELoss:0.6874668773101724
Epochs/Epoch:100/81 Val MSELoss:79.49376222807844
Epochs/Epoch:100/82 Train MSELoss:0.6869427356868982
Epochs/Epoch:100/82 Val MSELoss:78.72069955448065
Epochs/Epoch:100/83 Train MSELoss:0.6938327928276166
Epochs/Epoch:100/83 Val MSELoss:92.8211955845559
Epochs/Epoch:100/84 Train MSELoss:0.6947219133701014
Epochs/Epoch:100/84 Val MSELoss:90.90867554366815
Epochs/Epoch:100/85 Train MSELoss:0.690217441190844
Epochs/Epoch:100/85 Val MSELoss:78.67467534675347
Epochs/Epoch:100/86 Train MSELoss:0.6977795566553655
Epochs/Epoch:100/86 Val MSELoss:83.5162721991104
Epochs/Epoch:100/87 Train MSELoss:0.6903359904561354
Epochs/Epoch:100/87 Val MSELoss:83.27617285591724
Epochs/Epoch:100/88 Train MSELoss:0.6908829076944486
Epochs/Epoch:100/88 Val MSELoss:80.55882895569363
Epochs/Epoch:100/89 Train MSELoss:0.683044639132593
Epochs/Epoch:100/89 Val MSELoss:78.55442195742688
Epochs/Epoch:100/90 Train MSELoss:0.671990423746731
Epochs/Epoch:100/90 Val M

# 预测结果

In [12]:
import pandas as pd
from tqdm import tqdm

scores = []
batch_size = 1
model.eval()
for i in tqdm(range(test_dataset.shape[0])):
    batch_datas = paddle.to_tensor(test_dataset[i:batch_size+i].copy())
    # 前向传播
    preds = model(batch_datas)
    scores.append(preds.squeeze().numpy() * label_std + label_mean)

test_datas = pd.read_csv('./work/test.csv')
eins = test_datas['ein'].values.tolist()

results = pd.DataFrame({'ein':eins, 'score':scores})
results.to_csv('./work/results.csv', index=None)

100%|█████████████████████████████████████████████| 1000/1000 [00:01<00:00, 545.06it/s]
