In [108]:
import hashlib
import os
import tarfile
import zipfile
import requests

# Registry that maps a dataset name to its (url, sha1_hash) pair, followed by
# the base URL under which the hosted dataset files are published.
#@save
DATA_HUB = {}
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    Args:
        name: Key into DATA_HUB; its value is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory that holds downloaded files (created on demand).

    Returns:
        Path of the local file, named after the last URL segment.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    chunk_size = 1048576  # 1 MiB: bounds memory while hashing and downloading
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # Write to a sibling ".part" file first so that an interrupted transfer can
    # never leave a truncated file under the final name.
    part_name = fname + '.part'
    r = requests.get(url, stream=True, verify=True, timeout=60)
    try:
        # Without this check an HTTP error page would be stored as the dataset.
        r.raise_for_status()
        with open(part_name, 'wb') as f:
            # Actually stream the body; reading r.content would buffer it whole.
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
        os.replace(part_name, fname)
    except BaseException:
        if os.path.exists(part_name):
            os.remove(part_name)
        raise
    finally:
        r.close()
    return fname
def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and unpack it.

    The archive is extracted into the directory that holds the downloaded file.

    Args:
        name: Key into DATA_HUB, forwarded to ``download``.
        folder: Optional sub-directory name to return instead of the default.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given; otherwise the
        archive path with its last extension removed. For ``x.tar.gz`` only
        ``.gz`` is stripped (the result ends in ``.tar``), so pass ``folder``
        for such archives.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
    else:
        # Raised explicitly so the check survives `python -O`, which strips
        # assert statements.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall trusts member paths inside the archive; only use
    # this with archives from a trusted source.
    with opener(fname, 'r') as fp:  # close the archive handle deterministically
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset that is currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)
# If pandas is not installed, uncomment the next line.
# %pip install pandas

%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
# Register the Kaggle house-price training CSV as (download URL, SHA-1 digest);
# download() compares the digest against a cached copy before reusing it.
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

# Register the matching test CSV in the same way.
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
# Fetch both files (or reuse valid cached copies) and load them as DataFrames.
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))


In [109]:
# Show the test set dimensions as (rows, columns).
test_data.shape

(1459, 80)

In [110]:
# Feature columns of the training frame: everything except the first and the
# last column.
# NOTE(review): presumably the first column is the row Id and the last one is
# the SalePrice target — confirm against the CSV header.
train_featurs = train_data.iloc[:,1:-1]
# Slice (rather than index) the last column so the labels stay a one-column
# DataFrame.
train_labels = train_data.iloc[:,-1:]
# The test frame keeps every column after its first one.
test_features = test_data.iloc[:,1:]
# Stack train and test rows so both receive the same set of dummy columns.
all_featurs = pd.concat((train_featurs,test_features))
# One-hot encode the categorical columns; dummy_na=True adds an indicator
# column for missing values.
all_featurs = pd.get_dummies(all_featurs,dummy_na=True)
all_featurs.shape

(2919, 330)

In [111]:
# Undo the concatenation: the first len(train_data) rows are the training rows,
# every row after them belongs to the test set.
train_featurs = all_featurs.iloc[:len(train_data)]
test_features = all_featurs.iloc[len(train_data):]

In [112]:
def clean_data(x, reference=None):
    """Standardize each column to zero mean / unit variance and zero-fill gaps.

    Args:
        x: DataFrame of numeric columns to transform.
        reference: Optional DataFrame whose per-column mean and standard
            deviation are used for scaling. Defaults to ``x`` itself. It must
            contain every column of ``x``.

    Returns:
        A new DataFrame shaped like ``x``. Missing values, and columns whose
        reference standard deviation is zero or undefined, become 0.
    """
    if reference is None:
        reference = x

    def _standardize(col):
        ref_col = reference[col.name]
        return (col - ref_col.mean()) / ref_col.std()

    y = x.apply(_standardize)
    # A zero standard deviation produces NaN (0/0) or +/-inf (value/0). After
    # standardization the mean is 0, so 0 is the neutral replacement for both
    # those cells and the original missing values.
    y = y.replace([np.inf, -np.inf], np.nan).fillna(0)
    return y

# Scale the test split with the *training* statistics so both splits share one
# feature scale. This has to run before train_featurs is overwritten below.
test_features = clean_data(test_features, reference=train_featurs)
train_featurs = clean_data(train_featurs)
train_featurs.iloc[0]

MSSubClass               0.073350
LotFrontage             -0.207948
LotArea                 -0.207071
OverallQual              0.651256
OverallCond             -0.517023
                           ...   
SaleCondition_Alloca    -0.091003
SaleCondition_Family    -0.117811
SaleCondition_Normal     0.467491
SaleCondition_Partial   -0.305890
SaleCondition_nan        0.000000
Name: 0, Length: 330, dtype: float64

In [113]:
import torch
# Convert the cleaned feature frames into float32 tensors.
train_featurs = torch.tensor(train_featurs.to_numpy(), dtype=torch.float32)
test_features = torch.tensor(test_features.to_numpy(), dtype=torch.float32)
# Labels are divided by 10000 before conversion; predictions are multiplied by
# the same factor again at inference time.
train_labels = torch.tensor((train_labels / 10000).to_numpy(), dtype=torch.float32)

In [114]:
from torch.utils.data import(
    TensorDataset,
    random_split,
    DataLoader
) 

# Fixed seed so the train/validation partition (and therefore the checkpoint
# selected by early stopping) is the same on every Restart & Run All.
SPLIT_SEED = 42

train_dataset = TensorDataset(train_featurs,train_labels)
test_dataset = TensorDataset(test_features)
length_train_dataset = len(train_dataset)
# 90% of the labelled rows for training, the remainder for validation.
s1 = int(0.9*length_train_dataset)
s2 = length_train_dataset - s1

train_dataset , val_dataset = random_split(
    train_dataset,
    [s1,s2],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)
batch_size = 64
# Training batches are reshuffled every epoch; the incomplete final batch is
# dropped.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)
# Validation and test loaders keep the row order and every sample.
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

In [115]:
# Run configuration shared by the training and prediction cells.
config = dict(
    n_epoch=1000,                    # upper bound on training epochs
    optimer='Adam',                  # optimizer class name looked up on torch.optim
    val_limit_loss=20,               # a validation loss must drop below this to be checkpointed
    val_limit_epoch=300,             # epochs without improvement tolerated before stopping
    save_path="../model_house_prices/model.pth",   # checkpoint location
    save_predict_path="../submission.csv",         # prediction CSV location
)

In [116]:
from torch.nn import (
    Linear,
    ReLU,
    Module,
    Sequential
)
class MLP(Module):
    """Fully connected regression network with ReLU between Linear layers.

    Args:
        in_features: Width of the input feature vector.
        hidden_sizes: Output widths of the hidden Linear layers, in order.
        out_features: Width of the final output.

    The defaults build 330 -> 350 -> 256 -> 128 -> 64 -> 1. Layers live in
    ``self.mlp`` at the same Sequential indices for the default sizes, so
    existing ``state_dict`` checkpoints keep loading.
    """
    def __init__(self, in_features=330, hidden_sizes=(350, 256, 128, 64),
                 out_features=1):
        super().__init__()
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(Linear(width, hidden))
            layers.append(ReLU())
            width = hidden
        # No activation after the last layer: the network emits a raw value.
        layers.append(Linear(width, out_features))
        self.mlp = Sequential(*layers)

    def forward(self,x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.mlp(x)
    
def init_parter(x):
    """Initialize a Linear layer in place; any other module is left untouched.

    Weights receive Kaiming-normal initialization (fan_in, ReLU gain) and the
    bias is set to zero.
    """
    if not isinstance(x, Linear):
        return
    torch.nn.init.kaiming_normal_(x.weight, mode='fan_in', nonlinearity='relu')
    torch.nn.init.zeros_(x.bias)

# Build the network, then run init_parter on it via Module.apply, which visits
# every submodule. apply() returns the module, so the cell displays it.
mlp = MLP()
mlp.apply(init_parter)


MLP(
  (mlp): Sequential(
    (0): Linear(in_features=330, out_features=350, bias=True)
    (1): ReLU()
    (2): Linear(in_features=350, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [117]:
def get_device(chose_cuda_num = 0):
    """Return ``"cuda:<chose_cuda_num>"`` when CUDA is available, else ``"cpu"``."""
    if not torch.cuda.is_available():
        return "cpu"
    return f"cuda:{chose_cuda_num}"

device = get_device()
device

'cuda:0'

In [118]:
# Mean-squared-error objective.
lossfuncton = torch.nn.MSELoss()
# Look up the optimizer class named by config['optimer'] on torch.optim and
# instantiate it over the model parameters.
optimer = getattr(torch.optim,config['optimer'])(mlp.parameters(),lr=0.0001,weight_decay=0.001)

from torch.utils.tensorboard import SummaryWriter

# Separate TensorBoard writers for the validation and the training curves.
writer_val = SummaryWriter("../log/val")
writer_train = SummaryWriter("../log/train")

def val(model,lossfunction,val_dataloader,device):
    """Return the mean loss of ``model`` over every sample in ``val_dataloader``.

    Each batch loss is weighted by its batch size, so a shorter final batch
    does not count as much as a full one. This assumes ``lossfunction`` returns
    the mean over a batch (the default reduction of ``torch.nn.MSELoss``) and
    that the first label dimension is the batch dimension.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored before returning.

    Args:
        model: Module to evaluate.
        lossfunction: Callable ``(prediction, target) -> scalar tensor``.
        val_dataloader: Iterable of ``(features, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        A scalar tensor holding the sample-weighted mean loss.

    Raises:
        ValueError: If ``val_dataloader`` yields no samples.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    n_samples = 0
    try:
        with torch.no_grad():
            for val_features , val_labels in val_dataloader:
                val_features = val_features.to(device)
                val_labels = val_labels.to(device)
                y_hat = model(val_features)
                batch_len = val_labels.shape[0]
                total_loss = total_loss + lossfunction(y_hat,val_labels) * batch_len
                n_samples += batch_len
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    if n_samples == 0:
        raise ValueError("val_dataloader yielded no samples")
    return total_loss / n_samples
    

def train(config,model,lossfunction,optimer,
          writer_train,writer_val,train_dataloader,
          val_dataloader,device):
    """Train ``model`` with checkpointing and early stopping on validation loss.

    After every epoch the validation loss is computed with ``val``. When it
    drops below the best value so far (initially ``config["val_limit_loss"]``)
    the model ``state_dict`` is saved to ``config["save_path"]``. Training stops
    once more than ``config["val_limit_epoch"]`` consecutive epochs bring no
    improvement, or after ``config["n_epoch"]`` epochs. Both writers are closed
    on exit, including when an exception is raised.

    Args:
        config: Mapping with the keys ``n_epoch``, ``val_limit_loss``,
            ``val_limit_epoch`` and ``save_path``.
        model: Module to optimize; expected to already live on ``device``.
        lossfunction: Callable ``(prediction, target) -> scalar tensor``.
        optimer: Optimizer bound to the parameters of ``model``.
        writer_train: Writer that receives the per-batch training loss.
        writer_val: Writer that receives the per-epoch validation loss.
        train_dataloader: Iterable of ``(features, labels)`` training batches.
        val_dataloader: Iterable of validation batches passed on to ``val``.
        device: Device the batches are moved to.
    """
    n_epoch = config["n_epoch"]
    val_limit_loss = config["val_limit_loss"]
    val_limit_epoch = config["val_limit_epoch"]
    save_path = config['save_path']

    # torch.save does not create missing parent directories.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    train_epoch = 0   # global batch counter: x-axis of the training curve
    limit_epoch = 0   # consecutive epochs without a validation improvement
    try:
        for epoch in range(n_epoch):
            # Validation switches the model to eval mode, so training mode has
            # to be re-entered at the start of every epoch.
            model.train()
            for train_featur,train_label in train_dataloader:
                train_epoch += 1
                train_featur = train_featur.to(device)
                train_label = train_label.to(device)
                y_hat = model(train_featur)
                loss = lossfunction(y_hat,train_label)
                optimer.zero_grad()
                loss.backward()
                optimer.step()
                writer_train.add_scalar('train',loss.item(),train_epoch)
            val_loss = val(model,lossfunction,val_dataloader,device)
            if epoch % 10 == 0:
                print(val_loss)
            writer_val.add_scalar('val',val_loss,epoch)

            if val_loss < val_limit_loss:
                # New best: remember it and checkpoint the weights.
                val_limit_loss = val_loss
                torch.save(model.state_dict(),save_path)
                limit_epoch = 0
            else:
                limit_epoch += 1

            if limit_epoch > val_limit_epoch:
                break
        print(epoch)
    finally:
        writer_train.close()
        writer_val.close()
# Move the model to the selected device, then start training. Every 10th epoch
# train() prints the validation loss.
mlp = mlp.to(device)
train(config=config,
      model=mlp,
      lossfunction=lossfuncton,
      optimer=optimer,
      writer_train=writer_train,
      writer_val=writer_val,
      train_dataloader=train_dataloader,
      val_dataloader=val_dataloader,
      device=device
      )

        



tensor(268.9332, device='cuda:0')
tensor(19.1471, device='cuda:0')
tensor(14.7980, device='cuda:0')
tensor(14.6703, device='cuda:0')
tensor(14.7310, device='cuda:0')
tensor(15.5085, device='cuda:0')
tensor(15.7615, device='cuda:0')
tensor(15.7620, device='cuda:0')
tensor(16.1086, device='cuda:0')
tensor(16.4362, device='cuda:0')
tensor(16.5881, device='cuda:0')
tensor(16.6142, device='cuda:0')
tensor(16.5161, device='cuda:0')
tensor(16.5666, device='cuda:0')
tensor(16.5194, device='cuda:0')
tensor(16.6470, device='cuda:0')
tensor(16.6409, device='cuda:0')
tensor(16.6664, device='cuda:0')
tensor(16.5434, device='cuda:0')
tensor(16.6117, device='cuda:0')
tensor(16.4152, device='cuda:0')
tensor(16.3779, device='cuda:0')
tensor(16.7014, device='cuda:0')
tensor(16.3274, device='cuda:0')
tensor(16.5192, device='cuda:0')
tensor(15.9219, device='cuda:0')
tensor(16.0815, device='cuda:0')
tensor(16.1231, device='cuda:0')
tensor(16.1823, device='cuda:0')
tensor(16.2856, device='cuda:0')
tensor(16

In [119]:
# !tensorboard --logdir=../log   (the writers log to ../log/train and ../log/val)

In [120]:

# Rebuild the architecture and restore the best checkpoint written in training.
predict_model = MLP().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
param_dict = torch.load(config['save_path'], map_location=device)
predict_model.load_state_dict(param_dict)

def test(model,device,test_dataloader):
    model.eval()
    with torch.no_grad():
        epoch = 0
        predict = []
        for test_features in test_dataloader:
            epoch += 1
            test_features = test_features[0].to(device)
            y_hat = model(test_features)*10000
            predict.append(y_hat)
    return predict

# Run inference over the whole test loader (one prediction tensor per batch).
predict_result = test(predict_model, device, test_dataloader)

# Join the per-batch outputs along the batch dimension (dim 0 is the default).
result = torch.concat(predict_result)
result.shape

def save_reslut(config,result,ids=None):
    """Write predictions to ``config['save_predict_path']`` as a CSV file.

    The file has the columns ``Id`` and ``SalePrice`` and no index column.

    Args:
        config: Mapping that provides ``save_predict_path``.
        result: Tensor of predictions; it is flattened to one value per row.
        ids: Optional sequence of row ids, one per prediction. Defaults to
            consecutive integers starting at 1461.

    Raises:
        ValueError: If ``ids`` is given and its length differs from the number
            of predictions.
    """
    save_predict_path = config['save_predict_path']
    # detach() so a tensor that still tracks gradients can be converted too.
    preds = result.detach().to('cpu').numpy().flatten()
    if ids is None:
        # Same ids as before for the usual 1459 rows, but sized from the
        # predictions instead of being hard-coded.
        ids = range(1461, 1461 + len(preds))
    else:
        ids = list(ids)
        if len(ids) != len(preds):
            raise ValueError(
                f"got {len(ids)} ids for {len(preds)} predictions")

    df = pd.DataFrame({
        'Id': ids,
        'SalePrice': preds
    })
    df.to_csv(save_predict_path, index=False)
# Write the predictions to the CSV path stored in config['save_predict_path'].
save_reslut(config=config,
            result=result)