In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
import config

torch.manual_seed(config.CONFIG['seed'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
df = pd.read_csv(config.CONFIG['paths']['train_with_folds'])
TARGET_COL = config.CONFIG['validation']['target_column']
N_SPLITS = config.CONFIG['validation']['n_splits']

feature_cols = [c for c in df.columns if c not in [TARGET_COL, 'fold'] and pd.api.types.is_numeric_dtype(df[c])]
n_features = len(feature_cols)
cfg_dl = config.CONFIG['models']['dl']
print('Фич', n_features, feature_cols)

Фич 288 ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Gtl', 'LandSlope_Mod', 'LandSlope_Sev', 'Neighborhood_Blmngtn

# датасет и train с eval


In [3]:
class HouseDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def get_fold_data(df, fold, feature_cols, target_col):
    train_mask = df['fold'] != fold
    val_mask = df['fold'] == fold
    X_train = df.loc[train_mask, feature_cols].values.astype(np.float32)
    y_train = df.loc[train_mask, target_col].values.astype(np.float32)
    X_val = df.loc[val_mask, feature_cols].values.astype(np.float32)
    y_val = df.loc[val_mask, target_col].values.astype(np.float32)
    return X_train, y_train, X_val, y_val

In [4]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    for X_b, y_b in loader:
        X_b = X_b.to(device)
        y_b = y_b.to(device)
        optimizer.zero_grad()
        out = model(X_b)
        loss = criterion(out, y_b)
        loss.backward()
        optimizer.step()

def evaluate_rmse(model, loader, device):
    model.eval()
    preds = []
    targets = []
    with torch.no_grad():
        for X_b, y_b, in loader:
            out = model(X_b.to(device))
            preds.append(out.cpu().numpy())
            targets.append(y_b.numpy())

    y_true = np.concatenate(targets)
    y_pred = np.concatenate(preds)
    return np.sqrt(mean_squared_error(y_true, y_pred))

# MLP два слоя

In [5]:
class MLP2Layers(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 64)
        self.fc2 = nn.Linear(64, 1)
        self.act = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x


In [6]:
def run_cv_dl(model_class, model_kwargs, df, lr=None, batch_size=None, n_epochs=None, verbose=True):
    cfg = config.CONFIG['models']['dl']
    if lr is None: lr = cfg['lr']
    if batch_size is None: batch_size = cfg['batch_size']
    if n_epochs is None: n_epochs = cfg['n_epochs']
    scores = []
    for fold in range(N_SPLITS):
        X_tr, y_tr, X_val, y_val = get_fold_data(df, fold, feature_cols, TARGET_COL)
        train_ds = HouseDataset(X_tr, y_tr)
        val_ds = HouseDataset(X_val, y_val)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size)

        model = model_class(**model_kwargs).to(device)
        criterion = nn.MSELoss()
        opt_name = cfg.get('optimizer', 'adam')
        if opt_name == 'sgd':
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        elif opt_name == 'adamw':
            optimizer = optim.AdamW(model.parameters(), lr=lr)
        else:
            optimizer = optim.Adam(model.parameters(), lr=lr)

        use_scheduler = cfg.get('scheduler') == 'cosine'
        if use_scheduler:
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
        for epoch in range(n_epochs):
            train_epoch(model, train_loader, criterion, optimizer, device)
            if use_scheduler:
                scheduler.step()
        rmse_val = evaluate_rmse(model, val_loader, device)
        scores.append(rmse_val)
        if verbose:
            print('Fold', fold, ':', round(rmse_val, 4))
    print('Среднее RMSE:', round(np.mean(scores), 4), '+-', round(np.std(scores), 4))
    return scores 

In [7]:
scores_mlp2 = run_cv_dl(MLP2Layers, {'n_features': n_features}, df)

Fold 0 : 0.148
Fold 1 : 0.1941
Fold 2 : 0.2663
Fold 3 : 0.177
Fold 4 : 0.1893
Среднее RMSE: 0.1949 +- 0.0391


# MLP с большим числом слоёв

In [10]:
class MLPDeep(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc_out = nn.Linear(16, 1)
        self.act = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.act(x)
        x = self.fc3(x)
        x = self.act(x)
        x = self.fc_out(x)
        return x

In [11]:
scores_deep = run_cv_dl(MLPDeep, {'n_features': n_features}, df)

Fold 0 : 0.1318
Fold 1 : 0.1576
Fold 2 : 0.2408
Fold 3 : 0.1563
Fold 4 : 0.1288
Среднее RMSE: 0.1631 +- 0.0407


# batch norm

In [12]:
class MLPWithBN(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc_out = nn.Linear(32, 1)
        self.act = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.act(x)
        x = self.fc_out(x)
        return x


In [13]:
scores_bn = run_cv_dl(MLPWithBN, {'n_features': n_features}, df)

Fold 0 : 0.3363
Fold 1 : 0.2929
Fold 2 : 0.3256
Fold 3 : 0.3054
Fold 4 : 0.3595
Среднее RMSE: 0.3239 +- 0.0234


# Dropout

In [15]:
class MLPWithDropout(nn.Module):
    def __init__(self, n_features, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc_out = nn.Linear(32, 1)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc_out(x)
        return x


In [16]:
scores_drop02 = run_cv_dl(MLPWithDropout, {'n_features': n_features, 'dropout': cfg_dl['dropout']}, df)
scores_drop05 = run_cv_dl(MLPWithDropout, {'n_features': n_features, 'dropout': 0.5}, df)

Fold 0 : 0.6342
Fold 1 : 0.6286
Fold 2 : 0.5599
Fold 3 : 0.627
Fold 4 : 0.59
Среднее RMSE: 0.608 +- 0.0287
Fold 0 : 1.6891
Fold 1 : 1.5716
Fold 2 : 1.5808
Fold 3 : 1.604
Fold 4 : 1.3741
Среднее RMSE: 1.5639 +- 0.1036


# Больше размерности

In [17]:
class MLPBig(nn.Module):
    def __init__(self, n_features, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc_out = nn.Linear(64, 1)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc_out(x)
        return x

In [18]:
scores_big = run_cv_dl(MLPBig, {'n_features': n_features, 'dropout': cfg_dl['dropout']}, df)

Fold 0 : 0.4612
Fold 1 : 0.4383
Fold 2 : 0.4149
Fold 3 : 0.3722
Fold 4 : 0.3696
Среднее RMSE: 0.4112 +- 0.0361


# оптимизаторы

In [19]:
def run_cv_dl_optimizer(optim_name, df):
    cfg = config.CONFIG['models']['dl']
    lr = cfg['lr'] if optim_name != 'sgd' else 0.01
    scores = []
    for fold in range(N_SPLITS):
        X_tr, y_tr, X_val, y_val = get_fold_data(df, fold, feature_cols, TARGET_COL)
        train_loader = DataLoader(HouseDataset(X_tr, y_tr), batch_size=cfg['batch_size'], shuffle=True)
        val_loader = DataLoader(HouseDataset(X_val, y_val), batch_size=cfg['batch_size'])
        model = MLPWithDropout(n_features=n_features, dropout=cfg['dropout']).to(device)
        criterion = nn.MSELoss()
        if optim_name == 'adam': opt = optim.Adam(model.parameters(), lr=lr)
        elif optim_name == 'sgd': opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        else: opt = optim.AdamW(model.parameters(), lr=lr)
        for epoch in range(cfg['n_epochs']):
            train_epoch(model, train_loader, criterion, opt, device)
        rmse_val = evaluate_rmse(model, val_loader, device)
        scores.append(rmse_val)
        print('Fold', fold, ':', round(rmse_val, 4))
    print(optim_name, 'среднее RMSE:', round(np.mean(scores), 4))
    return scores

In [20]:
scores_adam = run_cv_dl_optimizer('adam', df)
scores_sgd = run_cv_dl_optimizer('sgd', df)
scores_adamw = run_cv_dl_optimizer('adamw', df)

Fold 0 : 0.5847
Fold 1 : 0.8765
Fold 2 : 0.6256
Fold 3 : 0.5236
Fold 4 : 0.5439
adam среднее RMSE: 0.6309
Fold 0 : 0.4349
Fold 1 : 0.401
Fold 2 : 0.3752
Fold 3 : 0.4198
Fold 4 : 0.3677
sgd среднее RMSE: 0.3997
Fold 0 : 0.4985
Fold 1 : 0.7324
Fold 2 : 0.513
Fold 3 : 0.6504
Fold 4 : 0.5915
adamw среднее RMSE: 0.5972


# Scheduler

In [22]:
def run_cv_dl_scheduler(df):
    cfg = config.CONFIG['models']['dl']
    scores = []
    for fold in range(N_SPLITS):
        X_tr, y_tr, X_val, y_val = get_fold_data(df, fold, feature_cols, TARGET_COL)
        train_loader = DataLoader(HouseDataset(X_tr, y_tr), batch_size=cfg['batch_size'], shuffle=True)
        val_loader = DataLoader(HouseDataset(X_val, y_val), batch_size=cfg['batch_size'])
        model = MLPWithDropout(n_features=n_features, dropout=cfg['dropout']).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=cfg['lr'])
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg['n_epochs'])
        for epoch in range(cfg['n_epochs']):
            train_epoch(model, train_loader, criterion, optimizer, device)
            scheduler.step()
        rmse_val = evaluate_rmse(model, val_loader, device)
        scores.append(rmse_val)
        print('Fold', fold, ':', round(rmse_val, 4))
    print('Cosine среднее RMSE:', round(np.mean(scores), 4))
    return scores

In [23]:
scores_cosine = run_cv_dl_scheduler(df)

Fold 0 : 0.6069
Fold 1 : 0.5864
Fold 2 : 0.6894
Fold 3 : 0.5155
Fold 4 : 0.5269
Cosine среднее RMSE: 0.585


In [25]:
dl_results = pd.DataFrame({
    'model': ['MLP2', 'MLPDeep', 'MLP+BN', 'Dropout0.2', 'Dropout0.5', 'Big', 'Cosine'],
    'mean_rmse': [
        np.mean(scores_mlp2), np.mean(scores_deep), np.mean(scores_bn),
        np.mean(scores_drop02), np.mean(scores_drop05), np.mean(scores_big), np.mean(scores_cosine)
    ],
    'std_rmse': [
        np.std(scores_mlp2), np.std(scores_deep), np.std(scores_bn),
        np.std(scores_drop02), np.std(scores_drop05), np.std(scores_big), np.std(scores_cosine)
    ],
})
dl_results = dl_results.sort_values('mean_rmse', ascending=False)
print(dl_results)

path_dl = config.CONFIG['paths'].get('dl_results', config.CONFIG['paths']['checkpoint_dir'] / 'dl_results.csv')
dl_results[['model', 'mean_rmse', 'std_rmse']].to_csv(path_dl, index=False)
print('Сохранено:', path_dl)

        model  mean_rmse  std_rmse
4  Dropout0.5   1.563933  0.103606
3  Dropout0.2   0.607952  0.028678
6      Cosine   0.585025  0.062592
5         Big   0.411222  0.036060
2      MLP+BN   0.323941  0.023353
0        MLP2   0.194932  0.039118
1     MLPDeep   0.163067  0.040670
Сохранено: C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints\dl_results.csv
