# Feature Extraction Notebook

This notebook trains feature extractors and wraps them as ready-to-use scikit-learn transformers.

## Table of Contents:
1. [Imports](#imports)
1. [Constants](#constants)
1. [Geo Level Feature Extraction](#geo-level-feature-extraction)  
    1. [Geo Level ID: Dimensionality Reduction](#geo-dim-reduction)  
        1. [Model and Dataset Definition](#geo-dim-reduction-model-def)
        1. [Training](#geo-dim-reduction-train)
    1. [Geo Level ID: Guess Geo 3 Roll up to Geo 1 and 2](#geo3-rollup)
        1. [Model and Dataset Definition](#geo-rollup-model-def)
        1. [Training](#geo-rollup-train)
1. [All Features](#all-feature-extraction)
    1. [Simple Autoencoder](#all-feature-ae)

## Imports <a class="anchor" id="imports"></a>

In [5]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
from typing import Tuple, Any
from tqdm import tqdm
from pathlib import Path
from os import PathLike

## Constants <a id="constants"></a>

In [2]:
# Machine-specific absolute locations; every path below is derived from the
# same project root so it only has to be changed in one place.
_PROJECT_ROOT = "D:/ml_competitions/gorkha_earthquake"
_RAW_DATA_DIR = _PROJECT_ROOT + "/data/raw"

# Raw CSV inputs.
TRAINING_FEATURES_PATH = _RAW_DATA_DIR + "/train_values.csv"
TRAINING_LABELS_PATH = _RAW_DATA_DIR + "/train_labels.csv"
TEST_FEATURES_PATH = _RAW_DATA_DIR + "/test_values.csv"
SUBMISSION_FORMAT_PATH = _RAW_DATA_DIR + "/submission_format.csv"

# Output directory for submissions.
SUBMISSION_DIR = _PROJECT_ROOT + "/submissions"

## Load Data

In [3]:
# Every CSV is indexed by its first column.
features_df = pd.read_csv(TRAINING_FEATURES_PATH, index_col=0)
test_features_df = pd.read_csv(TEST_FEATURES_PATH, index_col=0)

# Every label value is shifted down by one.
labels_df = pd.read_csv(TRAINING_LABELS_PATH, index_col=0).sub(1)

## Geo Level Feature Extraction <a class="anchor" id="geo-level-feature-extraction"></a>

#### Label Encoding <a id="geo-dim-reduction-le"></a>

In [8]:
import pickle
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Fit the encoders on train + test together so that every geo ID that can
# appear at inference time already has an integer label.
df = pd.concat([features_df, test_features_df])

le1 = LabelEncoder().fit(df['geo_level_1_id'])
le2 = LabelEncoder().fit(df['geo_level_2_id'])
le3 = LabelEncoder().fit(df['geo_level_3_id'])

# Smoke test: the level-3 encoder must be able to encode every test ID.
# The result used to be bound to `eval`, which shadowed the Python builtin.
test_geo_lv3_encoded = le3.transform(test_features_df['geo_level_3_id'])

# Save All Label Encoders
models_dir = Path.cwd().parent / 'models'
# Create the target directory on a fresh checkout instead of failing in open().
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / 'geo-lv-1-label-encoder.pickle', 'wb') as f:
    pickle.dump(le1, f)
with open(models_dir / 'geo-lv-2-label-encoder.pickle', 'wb') as f:
    pickle.dump(le2, f)
with open(models_dir / 'geo-lv-3-label-encoder.pickle', 'wb') as f:
    pickle.dump(le3, f)

del le1, le2, le3

# Load All Label Encoders
# NOTE: pickle.load can execute arbitrary code. It is acceptable here only
# because the files were written by this very cell a few lines above.
with open(models_dir / 'geo-lv-1-label-encoder.pickle', 'rb') as f:
    le1 = pickle.load(f)
with open(models_dir / 'geo-lv-2-label-encoder.pickle', 'rb') as f:
    le2 = pickle.load(f)
with open(models_dir / 'geo-lv-3-label-encoder.pickle', 'rb') as f:
    le3 = pickle.load(f)


def _geo_label_transformer(encoder):
    """Wrap a fitted LabelEncoder as a single-column FunctionTransformer.

    The returned transformer takes a one-column DataFrame, encodes its values
    with `encoder`, and returns them as an (n_rows, 1) array. The encoder is
    captured by the closure, so rebinding `le1`/`le2`/`le3` later does not
    silently change an already-built transformer.
    """
    def _encode(x):
        return np.asarray(encoder.transform(x.values.ravel())).reshape(-1, 1)

    return FunctionTransformer(func=_encode, feature_names_out='one-to-one')


# Prepare Transformers
geo_lv1_le = _geo_label_transformer(le1)
geo_lv2_le = _geo_label_transformer(le2)
geo_lv3_le = _geo_label_transformer(le3)

In [9]:
# Example wiring: each geo-level column goes through its own label-encoding
# transformer; every other column is dropped and the output is a DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ('geo1_le', geo_lv1_le, ['geo_level_1_id']),
        ('geo2_le', geo_lv2_le, ['geo_level_2_id']),
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

pipeline = Pipeline(steps=[('prep', preprocessor)])

In [10]:
# Label-encode the geo columns of train + test in one pass.
df: pd.DataFrame = preprocessor.fit_transform(  # type: ignore
    pd.concat([features_df, test_features_df])
)

# Despite the "max ... id" names these hold (largest encoded ID + 1), i.e. the
# size an embedding / output layer needs to cover every encoded ID.
max_geo_lv1_id, max_geo_lv2_id, max_geo_lv3_id = (
    df[column].max() + 1
    for column in ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id')
)

print(max_geo_lv1_id, max_geo_lv2_id, max_geo_lv3_id)

31 1418 11861


### Geo Level ID: Dimensionality Reduction <a class="anchor" id="geo-dim-reduction"></a>

In [7]:
def _inverse_frequency_weights(column, n_classes):
    """Return (relative frequencies, inverse-frequency weights) for `df[column]`.

    The weight vector has `n_classes` entries; IDs that never occur in `df`
    keep a weight of 0.
    """
    frequencies = df[column].value_counts(normalize=True)
    weights = np.zeros(n_classes)
    weights[frequencies.index] = 1.0 / frequencies.to_numpy()
    return frequencies, weights


geo_lv1_counts, geo_lv1_weights = _inverse_frequency_weights('geo_level_1_id', max_geo_lv1_id)
geo_lv2_counts, geo_lv2_weights = _inverse_frequency_weights('geo_level_2_id', max_geo_lv2_id)
geo_lv3_counts, geo_lv3_weights = _inverse_frequency_weights('geo_level_3_id', max_geo_lv3_id)

Define Dataset and Model <a id="geo-dim-reduction-model-def"></a>

In [8]:
# Create dataset for Autoencoder training
dataset = TensorDataset(
    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long),

    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long)
)


class DREncoder(torch.nn.Module):
    """Embed the three geo-level IDs and compress them into one latent vector.

    Args:
        latent_dim: Width of the returned latent vector.
        geo_lv1_size: Number of rows in the level-1 embedding table.
        geo_lv2_size: Number of rows in the level-2 embedding table.
        geo_lv3_size: Number of rows in the level-3 embedding table.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        # Embedding width per geo level.
        lv1_width, lv2_width, lv3_width = 16, 128, 128
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, lv1_width)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, lv2_width)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, lv3_width)
        self.compressor = torch.nn.Linear(lv1_width + lv2_width + lv3_width, latent_dim)

    def forward(self, x):
        """Encode `x`, whose columns 0, 1 and 2 hold the level-1/2/3 IDs."""
        embedders = (self.geo_lv1_embedder, self.geo_lv2_embedder, self.geo_lv3_embedder)
        embedded = [embed(x[:, column]) for column, embed in enumerate(embedders)]
        # ReLU is applied to the concatenated embeddings, not to the latent output.
        return self.compressor(torch.relu(torch.cat(embedded, dim=1)))


class DRDecoder(torch.nn.Module):
    """Predict logits for each geo level from a latent vector.

    Args:
        latent_dim: Width of the incoming latent vector.
        geo_lv1_size: Number of level-1 logits.
        geo_lv2_size: Number of level-2 logits.
        geo_lv3_size: Number of level-3 logits.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        # One independent linear head per geo level.
        self.geo_lv1_predictor = torch.nn.Linear(latent_dim, geo_lv1_size)
        self.geo_lv2_predictor = torch.nn.Linear(latent_dim, geo_lv2_size)
        self.geo_lv3_predictor = torch.nn.Linear(latent_dim, geo_lv3_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return the (level-1, level-2, level-3) logits for latent `x`."""
        return (
            self.geo_lv1_predictor(x),
            self.geo_lv2_predictor(x),
            self.geo_lv3_predictor(x),
        )


class DRAutoEncoder(torch.nn.Module):
    """Autoencoder over the three geo-level IDs (DREncoder followed by DRDecoder).

    All constructor arguments are forwarded unchanged to both halves.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        shared_args = (latent_dim, geo_lv1_size, geo_lv2_size, geo_lv3_size)
        self.encoder = DREncoder(*shared_args)
        self.decoder = DRDecoder(*shared_args)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return the decoder's per-level logits for the geo IDs in `x`."""
        # The latent code passes through a ReLU before it reaches the decoder.
        latent = torch.relu(self.encoder(x))
        lv1_logits, lv2_logits, lv3_logits = self.decoder(latent)
        return lv1_logits, lv2_logits, lv3_logits

Training <a id="geo-dim-reduction-train"></a>

In [14]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 100
# Mini-batches of 2048 rows; no shuffle argument is passed.
dataloader = DataLoader(dataset, batch_size=2*1024)
model = DRAutoEncoder(
    latent_dim=32,
    geo_lv1_size=max_geo_lv1_id,
    geo_lv2_size=max_geo_lv2_id,
    geo_lv3_size=max_geo_lv3_id,
).to(DEVICE)

has_weights = True


def _class_weighted_loss(class_weights):
    """Build a cross-entropy criterion, class-weighted when `has_weights` is set."""
    weight = (
        torch.from_numpy(class_weights).type(torch.float).to(DEVICE)
        if has_weights else None
    )
    return torch.nn.CrossEntropyLoss(weight)


criterion_geo_lv1 = _class_weighted_loss(geo_lv1_weights)
criterion_geo_lv2 = _class_weighted_loss(geo_lv2_weights)
criterion_geo_lv3 = _class_weighted_loss(geo_lv3_weights)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    print(f"EPOCH {epoch+1}")
    model.train()
    # Accumulates the per-batch losses; reported as a sum, not a mean.
    training_loss = 0.0
    for x, y in tqdm(dataloader, desc="training"):
        x, y = x.to(DEVICE), y.to(DEVICE)
        lv1_logits, lv2_logits, lv3_logits = model(x)

        # Average of the three per-level reconstruction losses.
        loss: torch.Tensor = (
            criterion_geo_lv1(lv1_logits, y[:, 0])
            + criterion_geo_lv2(lv2_logits, y[:, 1])
            + criterion_geo_lv3(lv3_logits, y[:, 2])
        ) / 3

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        training_loss += loss.item()
    print(f"{training_loss = }")

    # Disabled validation pass (note: it iterates the same `dataloader`).
    # model.eval()
    # validation_loss = 0.0
    # for x, y in tqdm(dataloader, desc="validation"):
    #     with torch.no_grad():
    #         x = x.to(DEVICE)
    #         y = y.to(DEVICE)
    #         a = model(x)

    #         loss: torch.Tensor = (
    #             criterion_geo_lv1(a[0], y[:, 0])
    #             + criterion_geo_lv2(a[1], y[:, 1])
    #             + criterion_geo_lv3(a[2], y[:, 2])
    #         ) / 3

    #         validation_loss += loss.item()

    # print(f"{validation_loss = }")


EPOCH 1


training: 100%|██████████| 170/170 [00:05<00:00, 29.73it/s]


training_loss = 1009.220043182373
EPOCH 2


training: 100%|██████████| 170/170 [00:04<00:00, 36.45it/s]


training_loss = 471.63681995868683
EPOCH 3


training: 100%|██████████| 170/170 [00:04<00:00, 35.42it/s]


training_loss = 175.67798310518265
EPOCH 4


training: 100%|██████████| 170/170 [00:04<00:00, 35.66it/s]


training_loss = 87.56008893251419
EPOCH 5


training: 100%|██████████| 170/170 [00:04<00:00, 35.45it/s]


training_loss = 52.522418946027756
EPOCH 6


training: 100%|██████████| 170/170 [00:04<00:00, 34.95it/s]


training_loss = 34.71474324166775
EPOCH 7


training: 100%|██████████| 170/170 [00:04<00:00, 34.56it/s]


training_loss = 24.0804251357913
EPOCH 8


training: 100%|██████████| 170/170 [00:04<00:00, 34.31it/s]


training_loss = 17.109615106135607
EPOCH 9


training: 100%|██████████| 170/170 [00:05<00:00, 33.46it/s]


training_loss = 12.293310903012753
EPOCH 10


training: 100%|██████████| 170/170 [00:05<00:00, 32.63it/s]


training_loss = 8.903135424479842
EPOCH 11


training: 100%|██████████| 170/170 [00:05<00:00, 33.57it/s]


training_loss = 6.4613316506147385
EPOCH 12


training: 100%|██████████| 170/170 [00:05<00:00, 33.21it/s]


training_loss = 4.715239710174501
EPOCH 13


training: 100%|██████████| 170/170 [00:05<00:00, 33.15it/s]


training_loss = 3.4646494518965483
EPOCH 14


training: 100%|██████████| 170/170 [00:05<00:00, 33.46it/s]


training_loss = 2.575441953726113
EPOCH 15


training: 100%|██████████| 170/170 [00:05<00:00, 32.99it/s]


training_loss = 1.9502993351779878
EPOCH 16


training: 100%|██████████| 170/170 [00:05<00:00, 32.92it/s]


training_loss = 1.5091202021576464
EPOCH 17


training: 100%|██████████| 170/170 [00:05<00:00, 32.96it/s]


training_loss = 1.195266437716782
EPOCH 18


training: 100%|██████████| 170/170 [00:05<00:00, 33.38it/s]


training_loss = 0.9682631134055555
EPOCH 19


training: 100%|██████████| 170/170 [00:05<00:00, 32.99it/s]


training_loss = 0.8012501434423029
EPOCH 20


training: 100%|██████████| 170/170 [00:05<00:00, 32.99it/s]


training_loss = 0.6746820460539311
EPOCH 21


training: 100%|██████████| 170/170 [00:05<00:00, 32.95it/s]


training_loss = 0.5752870920114219
EPOCH 22


training: 100%|██████████| 170/170 [00:05<00:00, 33.13it/s]


training_loss = 0.4946646239841357
EPOCH 23


training: 100%|██████████| 170/170 [00:05<00:00, 33.27it/s]


training_loss = 0.42865235393401235
EPOCH 24


training: 100%|██████████| 170/170 [00:05<00:00, 33.01it/s]


training_loss = 0.3738059946335852
EPOCH 25


training: 100%|██████████| 170/170 [00:05<00:00, 32.66it/s]


training_loss = 0.3278069325024262
EPOCH 26


training: 100%|██████████| 170/170 [00:05<00:00, 32.73it/s]


training_loss = 0.28853333718143404
EPOCH 27


training: 100%|██████████| 170/170 [00:05<00:00, 33.08it/s]


training_loss = 0.2548138990532607
EPOCH 28


training: 100%|██████████| 170/170 [00:05<00:00, 32.56it/s]


training_loss = 0.22588518384145573
EPOCH 29


training: 100%|██████████| 170/170 [00:05<00:00, 32.64it/s]


training_loss = 0.2006844711722806
EPOCH 30


training: 100%|██████████| 170/170 [00:05<00:00, 32.76it/s]


training_loss = 0.17886583151994273
EPOCH 31


training: 100%|██████████| 170/170 [00:05<00:00, 33.11it/s]


training_loss = 0.1596461384324357
EPOCH 32


training: 100%|██████████| 170/170 [00:05<00:00, 32.65it/s]


training_loss = 0.14275315369013697
EPOCH 33


training: 100%|██████████| 170/170 [00:05<00:00, 33.01it/s]


training_loss = 0.12780896908952855
EPOCH 34


training: 100%|██████████| 170/170 [00:05<00:00, 32.51it/s]


training_loss = 0.11470818798989058
EPOCH 35


training: 100%|██████████| 170/170 [00:05<00:00, 32.44it/s]


training_loss = 0.10301163233816624
EPOCH 36


training: 100%|██████████| 170/170 [00:05<00:00, 32.96it/s]


training_loss = 0.09261371890897863
EPOCH 37


training: 100%|██████████| 170/170 [00:05<00:00, 32.44it/s]


training_loss = 0.08335442843963392
EPOCH 38


training: 100%|██████████| 170/170 [00:05<00:00, 32.41it/s]


training_loss = 0.07511483211419545
EPOCH 39


training: 100%|██████████| 170/170 [00:05<00:00, 32.45it/s]


training_loss = 0.06772981706308201
EPOCH 40


training: 100%|██████████| 170/170 [00:05<00:00, 32.52it/s]


training_loss = 0.061096553952666
EPOCH 41


training: 100%|██████████| 170/170 [00:05<00:00, 32.65it/s]


training_loss = 0.055187417747220024
EPOCH 42


training: 100%|██████████| 170/170 [00:05<00:00, 32.22it/s]


training_loss = 0.04987351676390972
EPOCH 43


training: 100%|██████████| 170/170 [00:05<00:00, 32.32it/s]


training_loss = 0.04509877959208097
EPOCH 44


training: 100%|██████████| 170/170 [00:05<00:00, 31.41it/s]


training_loss = 0.04077521890576463
EPOCH 45


training: 100%|██████████| 170/170 [00:05<00:00, 31.35it/s]


training_loss = 0.03691377614450175
EPOCH 46


training: 100%|██████████| 170/170 [00:05<00:00, 31.90it/s]


training_loss = 0.03342779832019005
EPOCH 47


training: 100%|██████████| 170/170 [00:05<00:00, 32.33it/s]


training_loss = 0.030276326026069
EPOCH 48


training: 100%|██████████| 170/170 [00:05<00:00, 31.99it/s]


training_loss = 0.02743553993786918
EPOCH 49


training: 100%|██████████| 170/170 [00:05<00:00, 32.56it/s]


training_loss = 0.02487599888991099
EPOCH 50


training: 100%|██████████| 170/170 [00:05<00:00, 31.89it/s]


training_loss = 0.02255609638086753
EPOCH 51


training: 100%|██████████| 170/170 [00:05<00:00, 31.76it/s]


training_loss = 0.020459497020056006
EPOCH 52


training: 100%|██████████| 170/170 [00:05<00:00, 31.88it/s]


training_loss = 0.018564941114163958
EPOCH 53


training: 100%|██████████| 170/170 [00:05<00:00, 32.05it/s]


training_loss = 0.016852508262672927
EPOCH 54


training: 100%|██████████| 170/170 [00:05<00:00, 31.57it/s]


training_loss = 0.01529739101533778
EPOCH 55


training: 100%|██████████| 170/170 [00:05<00:00, 31.76it/s]


training_loss = 0.01389221244608052
EPOCH 56


training: 100%|██████████| 170/170 [00:05<00:00, 31.95it/s]


training_loss = 0.012619474444363732
EPOCH 57


training: 100%|██████████| 170/170 [00:05<00:00, 30.61it/s]


training_loss = 0.011465052877611015
EPOCH 58


training: 100%|██████████| 170/170 [00:05<00:00, 31.16it/s]


training_loss = 0.010417593577585649
EPOCH 59


training: 100%|██████████| 170/170 [00:05<00:00, 31.91it/s]


training_loss = 0.009467580919590546
EPOCH 60


training: 100%|██████████| 170/170 [00:05<00:00, 30.82it/s]


training_loss = 0.008610237317043357
EPOCH 61


training: 100%|██████████| 170/170 [00:05<00:00, 31.81it/s]


training_loss = 0.007826751065294957
EPOCH 62


training: 100%|██████████| 170/170 [00:05<00:00, 31.72it/s]


training_loss = 0.0071173266260302626
EPOCH 63


training: 100%|██████████| 170/170 [00:05<00:00, 32.12it/s]


training_loss = 0.006474214109402965
EPOCH 64


training: 100%|██████████| 170/170 [00:05<00:00, 31.31it/s]


training_loss = 0.005890588645343087
EPOCH 65


training: 100%|██████████| 170/170 [00:05<00:00, 31.55it/s]


training_loss = 0.0053607200425176416
EPOCH 66


training: 100%|██████████| 170/170 [00:05<00:00, 31.85it/s]


training_loss = 0.004876502625847934
EPOCH 67


training: 100%|██████████| 170/170 [00:05<00:00, 31.72it/s]


training_loss = 0.0044400234564818675
EPOCH 68


training: 100%|██████████| 170/170 [00:05<00:00, 30.73it/s]


training_loss = 0.0040418150965706445
EPOCH 69


training: 100%|██████████| 170/170 [00:05<00:00, 31.55it/s]


training_loss = 0.0036806815332965925
EPOCH 70


training: 100%|██████████| 170/170 [00:05<00:00, 31.44it/s]


training_loss = 0.0033520004608362797
EPOCH 71


training: 100%|██████████| 170/170 [00:05<00:00, 31.77it/s]


training_loss = 0.003052021240364411
EPOCH 72


training: 100%|██████████| 170/170 [00:05<00:00, 31.94it/s]


training_loss = 0.002780936505587306
EPOCH 73


training: 100%|██████████| 170/170 [00:05<00:00, 31.36it/s]


training_loss = 0.0025337560427942662
EPOCH 74


training: 100%|██████████| 170/170 [00:05<00:00, 31.16it/s]


training_loss = 0.002309361717379943
EPOCH 75


training: 100%|██████████| 170/170 [00:05<00:00, 31.02it/s]


training_loss = 0.0021039712792116916
EPOCH 76


training: 100%|██████████| 170/170 [00:05<00:00, 31.56it/s]


training_loss = 0.0019178088905391633
EPOCH 77


training: 100%|██████████| 170/170 [00:05<00:00, 31.44it/s]


training_loss = 0.00174877595190992
EPOCH 78


training: 100%|██████████| 170/170 [00:05<00:00, 30.84it/s]


training_loss = 0.0015945194454616285
EPOCH 79


training: 100%|██████████| 170/170 [00:05<00:00, 31.10it/s]


training_loss = 0.0014542747076120577
EPOCH 80


training: 100%|██████████| 170/170 [00:05<00:00, 31.39it/s]


training_loss = 0.001326191899352125
EPOCH 81


training: 100%|██████████| 170/170 [00:05<00:00, 30.71it/s]


training_loss = 0.0012097935318706732
EPOCH 82


training: 100%|██████████| 170/170 [00:05<00:00, 29.56it/s]


training_loss = 0.0011038468555852887
EPOCH 83


training: 100%|██████████| 170/170 [00:05<00:00, 30.40it/s]


training_loss = 0.001007379939665043
EPOCH 84


training: 100%|██████████| 170/170 [00:05<00:00, 31.35it/s]


training_loss = 0.0009192931256620795
EPOCH 85


training: 100%|██████████| 170/170 [00:05<00:00, 31.30it/s]


training_loss = 0.0008392514450861199
EPOCH 86


training: 100%|██████████| 170/170 [00:05<00:00, 30.58it/s]


training_loss = 0.0007662023820103059
EPOCH 87


training: 100%|██████████| 170/170 [00:05<00:00, 30.98it/s]


training_loss = 0.0006995101769007306
EPOCH 88


training: 100%|██████████| 170/170 [00:05<00:00, 31.18it/s]


training_loss = 0.0006388336678355699
EPOCH 89


training: 100%|██████████| 170/170 [00:05<00:00, 31.86it/s]


training_loss = 0.0005836736979745183
EPOCH 90


training: 100%|██████████| 170/170 [00:05<00:00, 31.00it/s]


training_loss = 0.000533111090589955
EPOCH 91


training: 100%|██████████| 170/170 [00:05<00:00, 31.12it/s]


training_loss = 0.00048710413909702766
EPOCH 92


training: 100%|██████████| 170/170 [00:05<00:00, 31.09it/s]


training_loss = 0.0004452380155726132
EPOCH 93


training: 100%|██████████| 170/170 [00:05<00:00, 31.23it/s]


training_loss = 0.0004067463228238921
EPOCH 94


training: 100%|██████████| 170/170 [00:05<00:00, 30.94it/s]


training_loss = 0.00037185159180808114
EPOCH 95


training: 100%|██████████| 170/170 [00:05<00:00, 30.82it/s]


training_loss = 0.0003399154820726835
EPOCH 96


training: 100%|██████████| 170/170 [00:05<00:00, 30.59it/s]


training_loss = 0.0003107412236431628
EPOCH 97


training: 100%|██████████| 170/170 [00:05<00:00, 31.24it/s]


training_loss = 0.00028424035292573535
EPOCH 98


training: 100%|██████████| 170/170 [00:05<00:00, 30.71it/s]


training_loss = 0.0002598527535155881
EPOCH 99


training: 100%|██████████| 170/170 [00:05<00:00, 31.31it/s]


training_loss = 0.00023767101538396673
EPOCH 100


training: 100%|██████████| 170/170 [00:05<00:00, 30.79it/s]

training_loss = 0.0002173561360336862





In [15]:
# Only the encoder half of the autoencoder is persisted.
encoder_weights_path = Path.cwd().parent / 'models' / 'dim-reduction-32'
torch.save(model.encoder.state_dict(), encoder_weights_path)

#### Sklearn Transformer

Wrap the trained encoder so it can be used as a step in a scikit-learn pipeline.

In [None]:
class DREncoder(torch.nn.Module):
    """Inference-side copy of the geo-ID encoder.

    Layer names and shapes mirror the `DREncoder` defined in the training
    section, so a state dict saved from that encoder loads here. Only the
    default table sizes differ.

    Args:
        latent_dim: Width of the returned latent vector.
        geo_lv1_size: Number of rows in the level-1 embedding table.
        geo_lv2_size: Number of rows in the level-2 embedding table.
        geo_lv3_size: Number of rows in the level-3 embedding table.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1418,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        # Embedding width per geo level.
        lv1_width, lv2_width, lv3_width = 16, 128, 128
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, lv1_width)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, lv2_width)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, lv3_width)
        self.compressor = torch.nn.Linear(lv1_width + lv2_width + lv3_width, latent_dim)

    def forward(self, x):
        """Encode `x`, whose columns 0, 1 and 2 hold the level-1/2/3 IDs."""
        embedders = (self.geo_lv1_embedder, self.geo_lv2_embedder, self.geo_lv3_embedder)
        embedded = [embed(x[:, column]) for column, embed in enumerate(embedders)]
        # ReLU is applied to the concatenated embeddings, not to the latent output.
        return self.compressor(torch.relu(torch.cat(embedded, dim=1)))


class GeoDimensionReduction(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):
    """Scikit-learn transformer mapping label-encoded geo IDs to a latent vector.

    A pre-trained `DREncoder` is rebuilt from the given sizes and its weights
    are loaded from `path`; `fit` learns nothing.

    Args:
        path: Location of the encoder state dict (written with `torch.save`).
        latent_dim: Width of the latent vector; must match the checkpoint.
        geo_lv1_size: Level-1 embedding table size; must match the checkpoint.
        geo_lv2_size: Level-2 embedding table size; must match the checkpoint.
        geo_lv3_size: Level-3 embedding table size; must match the checkpoint.
    """

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv1_size: int=31,
            geo_lv2_size: int=1418,
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.latent_dim = latent_dim
        self.geo_lv1_size = geo_lv1_size
        self.geo_lv2_size = geo_lv2_size
        self.geo_lv3_size = geo_lv3_size
        self.model = DREncoder(
            latent_dim, 
            geo_lv1_size,
            geo_lv2_size,
            geo_lv3_size
        )
        # map_location='cpu' lets a checkpoint saved from a CUDA model load on
        # a CPU-only machine; transform() runs on the CPU anyway.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        self.model.load_state_dict(torch.load(path, map_location='cpu'))
        self.model.eval()
        # ClassNamePrefixFeaturesOutMixin reads this attribute to generate the
        # output feature names (required for set_output(transform='pandas')).
        self._n_features_out = latent_dim

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """No-op fit: the encoder is already trained. Returns `self`."""
        self._n_features_out = self.latent_dim
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Encode rows of label-encoded (level-1, level-2, level-3) IDs.

        Args:
            X: DataFrame or array-like whose first three columns are the
                label-encoded geo level 1/2/3 IDs.

        Returns:
            A NumPy array with one `latent_dim`-wide row per input row.
        """
        # np.asarray accepts both DataFrames and plain arrays.
        geo_ids = torch.from_numpy(np.asarray(X)).type(torch.long) # type: ignore
        self.model.eval()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            return self.model(geo_ids).numpy()

In [None]:
x = features_df

# Use the checkpoint this notebook actually produces: the training cell builds
# the autoencoder with latent_dim=32 and saves its encoder as
# 'dim-reduction-32'. The previous 'dim-reduction-16' file is not written
# anywhere in this notebook, so loading it depended on a leftover artifact.
geo_latent_dim = 32

geo_dim_reduction_pipe = Pipeline([
    ('label_encoder', preprocessor),
    ('embedder', GeoDimensionReduction(
        path=Path.cwd().parent / 'models' / f'dim-reduction-{geo_latent_dim}',
        latent_dim=geo_latent_dim,
        # Table sizes come from the data rather than hard-coded defaults, so
        # they always match the sizes the encoder was trained with.
        geo_lv1_size=int(max_geo_lv1_id),
        geo_lv2_size=int(max_geo_lv2_id),
        geo_lv3_size=int(max_geo_lv3_id),
    )),
])

# Replace the three geo ID columns with the learned latent features and pass
# every other column through untouched.
c = ColumnTransformer([
    ('geo_dim_reduction', geo_dim_reduction_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'])
], remainder='passthrough')
c.set_output(transform='pandas')
x_ = c.fit_transform(x)

### Geo Level ID: Guess Geo 3 Roll up to Geo 1 and 2 <a class="anchor" id="geo3-rollup"></a>

In [7]:
# Inverse-frequency class weights for the two roll-up targets (same recipe as
# in the dimensionality-reduction section). IDs absent from `df` keep weight 0.
geo_lv1_counts = df['geo_level_1_id'].value_counts(normalize=True)
geo_lv2_counts = df['geo_level_2_id'].value_counts(normalize=True)

geo_lv1_weights = np.zeros(max_geo_lv1_id)
geo_lv2_weights = np.zeros(max_geo_lv2_id)

geo_lv1_weights[geo_lv1_counts.index] = np.reciprocal(geo_lv1_counts.to_numpy())
geo_lv2_weights[geo_lv2_counts.index] = np.reciprocal(geo_lv2_counts.to_numpy())

Dataset and Model Definition <a id="geo-rollup-model-def"></a>

In [8]:
# Create dataset for Autoencoder training
dataset = TensorDataset(
    torch.from_numpy(
        (
            df[['geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long),

    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id']]
                .to_numpy()
        )
    ).type(torch.long)
)


class RollUpGeoLv3Encoder(torch.nn.Module):
    """Embed a geo level 3 ID and compress it into a latent vector.

    Args:
        latent_dim: Width of the returned latent vector.
        geo_lv3_size: Number of rows in the level-3 embedding table.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        embedding_width = 128
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, embedding_width)
        self.compressor = torch.nn.Linear(embedding_width, latent_dim)

    def forward(self, x):
        """Return the latent vector for the level-3 IDs in `x`."""
        # Dimension 1 is dropped when it has size 1 (e.g. IDs given as one column).
        embedded = torch.squeeze(self.geo_lv3_embedder(x), 1)
        return self.compressor(torch.relu(embedded))


class RollUpGeoLv3Decoder(torch.nn.Module):
    """Predict geo level 1 and level 2 logits from a latent vector.

    Args:
        latent_dim: Width of the incoming latent vector.
        geo_lv1_size: Number of level-1 logits.
        geo_lv2_size: Number of level-2 logits.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1418) -> None:
        super().__init__()
        # One independent linear head per target level.
        self.geo_lv1_predictor = torch.nn.Linear(latent_dim, geo_lv1_size)
        self.geo_lv2_predictor = torch.nn.Linear(latent_dim, geo_lv2_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (level-1, level-2) logits for latent `x`."""
        return self.geo_lv1_predictor(x), self.geo_lv2_predictor(x)


class RollUpGeoLv3AutoEncoder(torch.nn.Module):
    """Predict geo level 1 and 2 logits from a geo level 3 ID.

    Chains `RollUpGeoLv3Encoder` and `RollUpGeoLv3Decoder` with a ReLU on the
    latent vector in between.
    """

    def __init__(self,
                 latent_dim: int=16,
                 geo_lv1_size: int=31,
                 geo_lv2_size: int=1418,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.encoder = RollUpGeoLv3Encoder(latent_dim, geo_lv3_size)
        self.decoder = RollUpGeoLv3Decoder(latent_dim, geo_lv1_size, geo_lv2_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (level-1, level-2) logits for the level-3 IDs in `x`."""
        latent = torch.relu(self.encoder(x))
        lv1_logits, lv2_logits = self.decoder(latent)
        return lv1_logits, lv2_logits

Training <a id="geo-rollup-train"></a>

In [9]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 100
# Mini-batches of 2048 rows; no shuffle argument is passed.
dataloader = DataLoader(dataset, 2*1024)
# Size the model from the data instead of relying on the constructor defaults:
# the class-weight vectors below are sized with max_geo_lv*_id, so the logits
# must have exactly that many classes (and the embedding must cover every ID).
model = RollUpGeoLv3AutoEncoder(
    geo_lv1_size=int(max_geo_lv1_id),
    geo_lv2_size=int(max_geo_lv2_id),
    geo_lv3_size=int(max_geo_lv3_id),
).to(DEVICE)

has_weights = True


def _rollup_weighted_loss(class_weights):
    """Build a cross-entropy criterion, class-weighted when `has_weights` is set."""
    weight = (
        torch.from_numpy(class_weights).type(torch.float).to(DEVICE)
        if has_weights else None
    )
    return torch.nn.CrossEntropyLoss(weight)


criterion_geo_lv1 = _rollup_weighted_loss(geo_lv1_weights)
criterion_geo_lv2 = _rollup_weighted_loss(geo_lv2_weights)

optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    print(f"EPOCH {epoch+1}")
    model.train()
    training_loss = 0.0
    for x, y in tqdm(dataloader, desc="training"):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        lv1_logits, lv2_logits = model(x)

        # Average of the level-1 and level-2 classification losses.
        loss: torch.Tensor = (
            criterion_geo_lv1(lv1_logits, y[:, 0])
            + criterion_geo_lv2(lv2_logits, y[:, 1])
        ) / 2

        training_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Report the mean loss per batch.
    training_loss /= len(dataloader)
    print(f"{training_loss = }")

    # Disabled validation pass (note: it iterates the same `dataloader`).
    # model.eval()
    # validation_loss = 0.0
    # for x, y in tqdm(dataloader, desc="validation"):
    #     with torch.no_grad():
    #         x = x.to(DEVICE)
    #         y = y.to(DEVICE)
    #         a = model(x)

    #         loss: torch.Tensor = (
    #             criterion_geo_lv1(a[0], y[:, 0])
    #             + criterion_geo_lv2(a[1], y[:, 1])
    #         ) / 2

    #         validation_loss += loss.item()
    
    # validation_loss /= len(dataloader)
    # print(f"{validation_loss = }")


EPOCH 1


training: 100%|██████████| 170/170 [00:31<00:00,  5.44it/s]


training_loss = 901.0604166984558


validation: 100%|██████████| 170/170 [00:04<00:00, 34.13it/s]


validation_loss = 880.2212710380554
EPOCH 2


training: 100%|██████████| 170/170 [00:03<00:00, 45.26it/s]


training_loss = 834.2412838935852


validation: 100%|██████████| 170/170 [00:03<00:00, 50.96it/s]


validation_loss = 766.4045743942261
EPOCH 3


training: 100%|██████████| 170/170 [00:03<00:00, 48.67it/s]


training_loss = 688.718001127243


validation: 100%|██████████| 170/170 [00:03<00:00, 51.36it/s]


validation_loss = 601.8751399517059
EPOCH 4


training: 100%|██████████| 170/170 [00:03<00:00, 49.37it/s]


training_loss = 532.3944444656372


validation: 100%|██████████| 170/170 [00:03<00:00, 50.16it/s]


validation_loss = 457.96091175079346
EPOCH 5


training: 100%|██████████| 170/170 [00:03<00:00, 49.65it/s]


training_loss = 406.54374754428864


validation: 100%|██████████| 170/170 [00:03<00:00, 51.85it/s]


validation_loss = 349.4602826833725
EPOCH 6


training: 100%|██████████| 170/170 [00:03<00:00, 50.40it/s]


training_loss = 313.4714391231537


validation: 100%|██████████| 170/170 [00:03<00:00, 51.92it/s]


validation_loss = 270.68596851825714
EPOCH 7


training: 100%|██████████| 170/170 [00:03<00:00, 50.03it/s]


training_loss = 245.76537489891052


validation: 100%|██████████| 170/170 [00:03<00:00, 50.35it/s]


validation_loss = 213.28409618139267
EPOCH 8


training: 100%|██████████| 170/170 [00:03<00:00, 49.97it/s]


training_loss = 195.83314031362534


validation: 100%|██████████| 170/170 [00:03<00:00, 51.00it/s]


validation_loss = 170.71739971637726
EPOCH 9


training: 100%|██████████| 170/170 [00:03<00:00, 49.17it/s]


training_loss = 158.35300278663635


validation: 100%|██████████| 170/170 [00:03<00:00, 50.18it/s]


validation_loss = 138.52340054512024
EPOCH 10


training: 100%|██████████| 170/170 [00:03<00:00, 49.64it/s]


training_loss = 129.64220786094666


validation: 100%|██████████| 170/170 [00:03<00:00, 52.43it/s]


validation_loss = 113.74702164530754
EPOCH 11


training: 100%|██████████| 170/170 [00:03<00:00, 49.73it/s]


training_loss = 107.2998962700367


validation: 100%|██████████| 170/170 [00:03<00:00, 52.78it/s]


validation_loss = 94.41946092247963
EPOCH 12


training: 100%|██████████| 170/170 [00:03<00:00, 49.76it/s]


training_loss = 89.72978231310844


validation: 100%|██████████| 170/170 [00:03<00:00, 49.23it/s]


validation_loss = 79.15499088168144
EPOCH 13


training: 100%|██████████| 170/170 [00:03<00:00, 49.79it/s]


training_loss = 75.70215117931366


validation: 100%|██████████| 170/170 [00:03<00:00, 51.40it/s]


validation_loss = 66.90895891189575
EPOCH 14


training: 100%|██████████| 170/170 [00:03<00:00, 49.42it/s]


training_loss = 64.3319688886404


validation: 100%|██████████| 170/170 [00:03<00:00, 53.46it/s]


validation_loss = 56.887847140431404
EPOCH 15


training: 100%|██████████| 170/170 [00:03<00:00, 48.91it/s]


training_loss = 54.97446629405022


validation: 100%|██████████| 170/170 [00:03<00:00, 50.50it/s]


validation_loss = 48.65729293227196
EPOCH 16


training: 100%|██████████| 170/170 [00:03<00:00, 45.23it/s]


training_loss = 47.22879399359226


validation: 100%|██████████| 170/170 [00:03<00:00, 46.59it/s]


validation_loss = 41.788397163152695
EPOCH 17


training: 100%|██████████| 170/170 [00:03<00:00, 49.79it/s]


training_loss = 40.730060651898384


validation: 100%|██████████| 170/170 [00:03<00:00, 51.84it/s]


validation_loss = 36.02321596443653
EPOCH 18


training: 100%|██████████| 170/170 [00:03<00:00, 45.69it/s]


training_loss = 35.23706255853176


validation: 100%|██████████| 170/170 [00:03<00:00, 45.20it/s]


validation_loss = 31.155356653034687
EPOCH 19


training: 100%|██████████| 170/170 [00:03<00:00, 46.80it/s]


training_loss = 30.575345501303673


validation: 100%|██████████| 170/170 [00:03<00:00, 43.28it/s]


validation_loss = 26.98802062869072
EPOCH 20


training: 100%|██████████| 170/170 [00:03<00:00, 46.14it/s]


training_loss = 26.56069605052471


validation: 100%|██████████| 170/170 [00:03<00:00, 44.84it/s]


validation_loss = 23.40957672148943
EPOCH 21


training: 100%|██████████| 170/170 [00:03<00:00, 48.06it/s]


training_loss = 23.108037531375885


validation: 100%|██████████| 170/170 [00:03<00:00, 48.93it/s]


validation_loss = 20.329495057463646
EPOCH 22


training: 100%|██████████| 170/170 [00:03<00:00, 42.85it/s]


training_loss = 20.11953914538026


validation: 100%|██████████| 170/170 [00:03<00:00, 49.00it/s]


validation_loss = 17.678625229746103
EPOCH 23


training: 100%|██████████| 170/170 [00:03<00:00, 50.76it/s]


training_loss = 17.546048671007156


validation: 100%|██████████| 170/170 [00:03<00:00, 54.94it/s]


validation_loss = 15.404066894203424
EPOCH 24


training: 100%|██████████| 170/170 [00:03<00:00, 54.48it/s]


training_loss = 15.327416885644197


validation: 100%|██████████| 170/170 [00:02<00:00, 58.63it/s]


validation_loss = 13.42577126994729
EPOCH 25


training: 100%|██████████| 170/170 [00:03<00:00, 56.62it/s]


training_loss = 13.39046211913228


validation: 100%|██████████| 170/170 [00:02<00:00, 59.23it/s]


validation_loss = 11.71083996258676
EPOCH 26


training: 100%|██████████| 170/170 [00:02<00:00, 56.86it/s]


training_loss = 11.70066680200398


validation: 100%|██████████| 170/170 [00:02<00:00, 61.98it/s]


validation_loss = 10.213608399033546
EPOCH 27


training: 100%|██████████| 170/170 [00:03<00:00, 56.29it/s]


training_loss = 10.228176362812519


validation: 100%|██████████| 170/170 [00:02<00:00, 60.76it/s]


validation_loss = 8.909002661705017
EPOCH 28


training: 100%|██████████| 170/170 [00:03<00:00, 45.17it/s]


training_loss = 8.944894127547741


validation: 100%|██████████| 170/170 [00:03<00:00, 45.83it/s]


validation_loss = 7.780600355938077
EPOCH 29


training: 100%|██████████| 170/170 [00:03<00:00, 49.49it/s]


training_loss = 7.822151763364673


validation: 100%|██████████| 170/170 [00:03<00:00, 52.32it/s]


validation_loss = 6.797044040635228
EPOCH 30


training: 100%|██████████| 170/170 [00:03<00:00, 51.16it/s]


training_loss = 6.840491754934192


validation: 100%|██████████| 170/170 [00:03<00:00, 51.59it/s]


validation_loss = 5.927244280464947
EPOCH 31


training: 100%|██████████| 170/170 [00:03<00:00, 50.73it/s]


training_loss = 5.973993446677923


validation: 100%|██████████| 170/170 [00:03<00:00, 52.58it/s]


validation_loss = 5.174426348879933
EPOCH 32


training: 100%|██████████| 170/170 [00:03<00:00, 51.42it/s]


training_loss = 5.224088388495147


validation: 100%|██████████| 170/170 [00:03<00:00, 55.45it/s]


validation_loss = 4.51878500264138
EPOCH 33


training: 100%|██████████| 170/170 [00:03<00:00, 45.02it/s]


training_loss = 4.56581366667524


validation: 100%|██████████| 170/170 [00:03<00:00, 48.77it/s]


validation_loss = 3.9435011972673237
EPOCH 34


training: 100%|██████████| 170/170 [00:03<00:00, 49.69it/s]


training_loss = 3.987999332603067


validation: 100%|██████████| 170/170 [00:03<00:00, 51.02it/s]


validation_loss = 3.441075060516596
EPOCH 35


training: 100%|██████████| 170/170 [00:03<00:00, 49.88it/s]


training_loss = 3.4848495293408632


validation: 100%|██████████| 170/170 [00:03<00:00, 52.52it/s]


validation_loss = 3.006877742242068
EPOCH 36


training: 100%|██████████| 170/170 [00:03<00:00, 51.94it/s]


training_loss = 3.0470647690817714


validation: 100%|██████████| 170/170 [00:03<00:00, 54.49it/s]


validation_loss = 2.6284191589802504
EPOCH 37


training: 100%|██████████| 170/170 [00:03<00:00, 52.58it/s]


training_loss = 2.6646146380808204


validation: 100%|██████████| 170/170 [00:03<00:00, 53.55it/s]


validation_loss = 2.296797487186268
EPOCH 38


training: 100%|██████████| 170/170 [00:03<00:00, 49.33it/s]


training_loss = 2.330375895369798


validation: 100%|██████████| 170/170 [00:03<00:00, 51.70it/s]


validation_loss = 2.0095073180273175
EPOCH 39


training: 100%|██████████| 170/170 [00:03<00:00, 51.88it/s]


training_loss = 2.039041481213644


validation: 100%|██████████| 170/170 [00:03<00:00, 53.91it/s]


validation_loss = 1.756776801077649
EPOCH 40


training: 100%|██████████| 170/170 [00:03<00:00, 51.74it/s]


training_loss = 1.783374639460817


validation: 100%|██████████| 170/170 [00:03<00:00, 54.07it/s]


validation_loss = 1.5358335527125746
EPOCH 41


training: 100%|██████████| 170/170 [00:03<00:00, 51.44it/s]


training_loss = 1.5592337048146874


validation: 100%|██████████| 170/170 [00:03<00:00, 49.25it/s]


validation_loss = 1.3448828547261655
EPOCH 42


training: 100%|██████████| 170/170 [00:03<00:00, 46.87it/s]


training_loss = 1.3651829834561795


validation: 100%|██████████| 170/170 [00:03<00:00, 51.89it/s]


validation_loss = 1.1767758803907782
EPOCH 43


training: 100%|██████████| 170/170 [00:03<00:00, 50.19it/s]


training_loss = 1.194135080324486


validation: 100%|██████████| 170/170 [00:03<00:00, 53.19it/s]


validation_loss = 1.0302109241019934
EPOCH 44


training: 100%|██████████| 170/170 [00:03<00:00, 50.23it/s]


training_loss = 1.0454044881043956


validation: 100%|██████████| 170/170 [00:03<00:00, 52.00it/s]


validation_loss = 0.9028663381468505
EPOCH 45


training: 100%|██████████| 170/170 [00:03<00:00, 50.03it/s]


training_loss = 0.9157462641596794


validation: 100%|██████████| 170/170 [00:03<00:00, 53.31it/s]


validation_loss = 0.7924940454540774
EPOCH 46


training: 100%|██████████| 170/170 [00:03<00:00, 51.29it/s]


training_loss = 0.8033012810628861


validation: 100%|██████████| 170/170 [00:03<00:00, 52.76it/s]


validation_loss = 0.6957547381753102
EPOCH 47


training: 100%|██████████| 170/170 [00:03<00:00, 51.33it/s]


training_loss = 0.7049055070383474


validation: 100%|██████████| 170/170 [00:03<00:00, 54.55it/s]


validation_loss = 0.6102232733974233
EPOCH 48


training: 100%|██████████| 170/170 [00:03<00:00, 51.55it/s]


training_loss = 0.6179365934222005


validation: 100%|██████████| 170/170 [00:03<00:00, 52.46it/s]


validation_loss = 0.5345990961650386
EPOCH 49


training: 100%|██████████| 170/170 [00:03<00:00, 51.18it/s]


training_loss = 0.5415133415954188


validation: 100%|██████████| 170/170 [00:03<00:00, 53.77it/s]


validation_loss = 0.470081654144451
EPOCH 50


training: 100%|██████████| 170/170 [00:03<00:00, 51.03it/s]


training_loss = 0.4763955061789602


validation: 100%|██████████| 170/170 [00:03<00:00, 54.59it/s]


validation_loss = 0.41423044790280983
EPOCH 51


training: 100%|██████████| 170/170 [00:03<00:00, 51.37it/s]


training_loss = 0.4196574840461835


validation: 100%|██████████| 170/170 [00:03<00:00, 53.57it/s]


validation_loss = 0.3651147150667384
EPOCH 52


training: 100%|██████████| 170/170 [00:03<00:00, 51.99it/s]


training_loss = 0.37021496664965525


validation: 100%|██████████| 170/170 [00:03<00:00, 53.81it/s]


validation_loss = 0.3226127832895145
EPOCH 53


training: 100%|██████████| 170/170 [00:03<00:00, 51.31it/s]


training_loss = 0.32737230422208086


validation: 100%|██████████| 170/170 [00:03<00:00, 54.21it/s]


validation_loss = 0.28560320957330987
EPOCH 54


training: 100%|██████████| 170/170 [00:03<00:00, 50.95it/s]


training_loss = 0.2900321569177322


validation: 100%|██████████| 170/170 [00:03<00:00, 52.23it/s]


validation_loss = 0.2531321089772973
EPOCH 55


training: 100%|██████████| 170/170 [00:03<00:00, 48.03it/s]


training_loss = 0.257187800219981


validation: 100%|██████████| 170/170 [00:03<00:00, 51.53it/s]


validation_loss = 0.22401991867809556
EPOCH 56


training: 100%|██████████| 170/170 [00:03<00:00, 50.37it/s]


training_loss = 0.22763480132562108


validation: 100%|██████████| 170/170 [00:03<00:00, 52.08it/s]


validation_loss = 0.19855205321800895
EPOCH 57


training: 100%|██████████| 170/170 [00:03<00:00, 50.25it/s]


training_loss = 0.20176406559767202


validation: 100%|██████████| 170/170 [00:03<00:00, 51.30it/s]


validation_loss = 0.17623270480544306
EPOCH 58


training: 100%|██████████| 170/170 [00:03<00:00, 47.70it/s]


training_loss = 0.17892716714413837


validation: 100%|██████████| 170/170 [00:03<00:00, 52.35it/s]


validation_loss = 0.15635101930820383
EPOCH 59


training: 100%|██████████| 170/170 [00:03<00:00, 49.72it/s]


training_loss = 0.15868839371250942


validation: 100%|██████████| 170/170 [00:03<00:00, 52.00it/s]


validation_loss = 0.138785500486847
EPOCH 60


training: 100%|██████████| 170/170 [00:03<00:00, 49.06it/s]


training_loss = 0.14097600034438074


validation: 100%|██████████| 170/170 [00:03<00:00, 50.95it/s]


validation_loss = 0.12332721552229486
EPOCH 61


training: 100%|██████████| 170/170 [00:03<00:00, 43.91it/s]


training_loss = 0.12528516085876618


validation: 100%|██████████| 170/170 [00:04<00:00, 40.67it/s]


validation_loss = 0.10947857720020693
EPOCH 62


training: 100%|██████████| 170/170 [00:03<00:00, 48.34it/s]


training_loss = 0.11125185976561625


validation: 100%|██████████| 170/170 [00:03<00:00, 51.64it/s]


validation_loss = 0.09716736670816317
EPOCH 63


training: 100%|██████████| 170/170 [00:03<00:00, 51.25it/s]


training_loss = 0.09875075137824751


validation: 100%|██████████| 170/170 [00:03<00:00, 52.74it/s]


validation_loss = 0.08624932456586976
EPOCH 64


training: 100%|██████████| 170/170 [00:03<00:00, 50.17it/s]


training_loss = 0.08766159937658813


validation: 100%|██████████| 170/170 [00:03<00:00, 46.58it/s]


validation_loss = 0.07642529591976199
EPOCH 65


training: 100%|██████████| 170/170 [00:03<00:00, 45.03it/s]


training_loss = 0.07759334520960692


validation: 100%|██████████| 170/170 [00:03<00:00, 46.87it/s]


validation_loss = 0.06766417073959019
EPOCH 66


training: 100%|██████████| 170/170 [00:03<00:00, 46.71it/s]


training_loss = 0.06869496052968316


validation: 100%|██████████| 170/170 [00:03<00:00, 49.15it/s]


validation_loss = 0.05993146833498031
EPOCH 67


training: 100%|██████████| 170/170 [00:03<00:00, 48.34it/s]


training_loss = 0.06072422025317792


validation: 100%|██████████| 170/170 [00:03<00:00, 48.73it/s]


validation_loss = 0.053177276779024396
EPOCH 68


training: 100%|██████████| 170/170 [00:04<00:00, 41.08it/s]


training_loss = 0.053782195442181546


validation: 100%|██████████| 170/170 [00:03<00:00, 44.08it/s]


validation_loss = 0.047206903487676755
EPOCH 69


training: 100%|██████████| 170/170 [00:06<00:00, 27.29it/s]


training_loss = 0.04766375857434468


validation: 100%|██████████| 170/170 [00:04<00:00, 36.41it/s]


validation_loss = 0.04202284124767175
EPOCH 70


training: 100%|██████████| 170/170 [00:05<00:00, 32.01it/s]


training_loss = 0.04241716972319409


validation: 100%|██████████| 170/170 [00:04<00:00, 37.60it/s]


validation_loss = 0.037473597963980865
EPOCH 71


training: 100%|██████████| 170/170 [00:05<00:00, 33.26it/s]


training_loss = 0.03779600022244267


validation: 100%|██████████| 170/170 [00:04<00:00, 40.78it/s]


validation_loss = 0.03346579057688359
EPOCH 72


training: 100%|██████████| 170/170 [00:03<00:00, 45.19it/s]


training_loss = 0.03372912782651838


validation: 100%|██████████| 170/170 [00:03<00:00, 47.94it/s]


validation_loss = 0.029973491655255202
EPOCH 73


training: 100%|██████████| 170/170 [00:03<00:00, 45.61it/s]


training_loss = 0.030217645908123814


validation: 100%|██████████| 170/170 [00:03<00:00, 43.27it/s]


validation_loss = 0.026850102683965815
EPOCH 74


training: 100%|██████████| 170/170 [00:03<00:00, 43.98it/s]


training_loss = 0.027024526141758543


validation: 100%|██████████| 170/170 [00:03<00:00, 48.27it/s]


validation_loss = 0.024082501397060696
EPOCH 75


training: 100%|██████████| 170/170 [00:03<00:00, 49.36it/s]


training_loss = 0.024233724892837927


validation: 100%|██████████| 170/170 [00:03<00:00, 52.41it/s]


validation_loss = 0.021643076121108606
EPOCH 76


training: 100%|██████████| 170/170 [00:03<00:00, 47.65it/s]


training_loss = 0.021776856327051064


validation: 100%|██████████| 170/170 [00:03<00:00, 51.42it/s]


validation_loss = 0.019481877832731698
EPOCH 77


training: 100%|██████████| 170/170 [00:03<00:00, 49.93it/s]


training_loss = 0.019591691012465162


validation: 100%|██████████| 170/170 [00:03<00:00, 51.64it/s]


validation_loss = 0.017556187507580034
EPOCH 78


training: 100%|██████████| 170/170 [00:03<00:00, 50.55it/s]


training_loss = 0.017648447392275557


validation: 100%|██████████| 170/170 [00:03<00:00, 52.54it/s]


validation_loss = 0.015833619261684362
EPOCH 79


training: 100%|██████████| 170/170 [00:03<00:00, 50.96it/s]


training_loss = 0.015916620424832217


validation: 100%|██████████| 170/170 [00:03<00:00, 52.98it/s]


validation_loss = 0.01429078086221125
EPOCH 80


training: 100%|██████████| 170/170 [00:03<00:00, 51.26it/s]


training_loss = 0.014359000506374286


validation: 100%|██████████| 170/170 [00:03<00:00, 53.49it/s]


validation_loss = 0.012899928806291427
EPOCH 81


training: 100%|██████████| 170/170 [00:03<00:00, 49.92it/s]


training_loss = 0.012962652228452498


validation: 100%|██████████| 170/170 [00:03<00:00, 53.01it/s]


validation_loss = 0.01165673992727534
EPOCH 82


training: 100%|██████████| 170/170 [00:03<00:00, 46.15it/s]


training_loss = 0.011717292585672112


validation: 100%|██████████| 170/170 [00:03<00:00, 46.58it/s]


validation_loss = 0.010543174510530662
EPOCH 83


training: 100%|██████████| 170/170 [00:03<00:00, 46.54it/s]


training_loss = 0.010590975003651693


validation: 100%|██████████| 170/170 [00:03<00:00, 46.85it/s]


validation_loss = 0.009523174800051493
EPOCH 84


training: 100%|██████████| 170/170 [00:03<00:00, 42.76it/s]


training_loss = 0.00955601889108948


validation: 100%|██████████| 170/170 [00:03<00:00, 45.27it/s]


validation_loss = 0.00861092583363643
EPOCH 85


training: 100%|██████████| 170/170 [00:05<00:00, 31.32it/s]


training_loss = 0.008643249680972076


validation: 100%|██████████| 170/170 [00:04<00:00, 41.95it/s]


validation_loss = 0.007796171144946129
EPOCH 86


training: 100%|██████████| 170/170 [00:03<00:00, 49.73it/s]


training_loss = 0.007821960927685723


validation: 100%|██████████| 170/170 [00:03<00:00, 51.52it/s]


validation_loss = 0.007054891324514756
EPOCH 87


training: 100%|██████████| 170/170 [00:03<00:00, 50.50it/s]


training_loss = 0.0070781392641947605


validation: 100%|██████████| 170/170 [00:03<00:00, 50.49it/s]


validation_loss = 0.006392065369254851
EPOCH 88


training: 100%|██████████| 170/170 [00:03<00:00, 50.11it/s]


training_loss = 0.0064132307370528


validation: 100%|██████████| 170/170 [00:03<00:00, 49.58it/s]


validation_loss = 0.005788012404991605
EPOCH 89


training: 100%|██████████| 170/170 [00:03<00:00, 49.84it/s]


training_loss = 0.0058065719740625354


validation: 100%|██████████| 170/170 [00:03<00:00, 49.34it/s]


validation_loss = 0.005248505553936411
EPOCH 90


training: 100%|██████████| 170/170 [00:03<00:00, 49.61it/s]


training_loss = 0.005263743119030551


validation: 100%|██████████| 170/170 [00:03<00:00, 49.36it/s]


validation_loss = 0.004754207205223793
EPOCH 91


training: 100%|██████████| 170/170 [00:03<00:00, 49.85it/s]


training_loss = 0.0047656599617766915


validation: 100%|██████████| 170/170 [00:03<00:00, 49.13it/s]


validation_loss = 0.004307987331230834
EPOCH 92


training: 100%|██████████| 170/170 [00:03<00:00, 49.69it/s]


training_loss = 0.00431810677764588


validation: 100%|██████████| 170/170 [00:03<00:00, 51.74it/s]


validation_loss = 0.003907082950718177
EPOCH 93


training: 100%|██████████| 170/170 [00:03<00:00, 50.56it/s]


training_loss = 0.00391772586863226


validation: 100%|██████████| 170/170 [00:03<00:00, 51.07it/s]


validation_loss = 0.003548207956555416
EPOCH 94


training: 100%|██████████| 170/170 [00:03<00:00, 49.41it/s]


training_loss = 0.0035573803279476124


validation: 100%|██████████| 170/170 [00:03<00:00, 52.43it/s]


validation_loss = 0.0032211416801146697
EPOCH 95


training: 100%|██████████| 170/170 [00:03<00:00, 50.66it/s]


training_loss = 0.003229278645449085


validation: 100%|██████████| 170/170 [00:03<00:00, 51.16it/s]


validation_loss = 0.0029273980617290363
EPOCH 96


training: 100%|██████████| 170/170 [00:03<00:00, 48.66it/s]


training_loss = 0.0029343719133976265


validation: 100%|██████████| 170/170 [00:03<00:00, 49.32it/s]


validation_loss = 0.0026598027552608983
EPOCH 97


training: 100%|██████████| 170/170 [00:03<00:00, 49.04it/s]


training_loss = 0.002665606564733025


validation: 100%|██████████| 170/170 [00:03<00:00, 51.22it/s]


validation_loss = 0.002418155485429452
EPOCH 98


training: 100%|██████████| 170/170 [00:03<00:00, 50.40it/s]


training_loss = 0.002423655522761692


validation: 100%|██████████| 170/170 [00:03<00:00, 49.92it/s]


validation_loss = 0.0021981279824103694
EPOCH 99


training: 100%|██████████| 170/170 [00:03<00:00, 51.21it/s]


training_loss = 0.002203574601935543


validation: 100%|██████████| 170/170 [00:03<00:00, 50.73it/s]


validation_loss = 0.001999931476348138
EPOCH 100


training: 100%|██████████| 170/170 [00:03<00:00, 49.87it/s]


training_loss = 0.0020042338369421486


validation: 100%|██████████| 170/170 [00:03<00:00, 49.64it/s]

validation_loss = 0.0018194816748291487





In [10]:
# Persist only the weights of the trained model's `encoder` sub-module.
geo3_rollup_weights_path = Path.cwd().parent / 'models' / 'geo3-rollup-16'
# torch.save does not create missing directories, so make sure the target folder exists.
geo3_rollup_weights_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.encoder.state_dict(), geo3_rollup_weights_path)

Sklearn transformer: wrap the saved geo level 3 roll-up encoder so it can be used as a pipeline step

In [6]:
class RollUpGeoLv3Encoder(torch.nn.Module):
    """Embed integer geo level 3 ids and project them to a small latent vector.

    The module is an embedding table followed by a ReLU and a linear layer.
    The attribute names ``geo_lv3_embedder`` and ``compressor`` are the keys of
    the state dict, so they must not be renamed if saved weights are to load.

    Args:
        latent_dim: Size of the output vector produced for each id.
        geo_lv3_size: Number of rows in the embedding table, i.e. ids must lie
            in ``[0, geo_lv3_size)``.
        embedding_dim: Width of the intermediate embedding. Defaults to 128,
            the value that used to be hard-coded.
    """

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv3_size: int=11861,
                 embedding_dim: int=128) -> None:
        super().__init__()
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, embedding_dim)
        self.compressor = torch.nn.Linear(embedding_dim, latent_dim)

    def forward(self, x):
        """Return the latent vectors for a tensor of integer ids.

        Args:
            x: Integer tensor of ids, e.g. of shape ``(batch, 1)``.

        Returns:
            Float tensor whose last dimension is ``latent_dim``.
        """
        # squeeze(1) removes axis 1 only when it has size 1, so a (batch, 1)
        # id column becomes (batch, embedding_dim) after the lookup.
        embedded = self.geo_lv3_embedder(x).squeeze(1)
        activated = torch.nn.functional.relu(embedded)
        return self.compressor(activated)


class GeoLv3Rollup(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):
    """Scikit-learn transformer that maps geo level 3 ids to saved latent vectors.

    The encoder weights are loaded from ``path`` when the object is constructed,
    so ``fit`` learns nothing from the data and ``transform`` can be called
    straight away.

    Args:
        path: Location of a state dict saved from a ``RollUpGeoLv3Encoder``.
        latent_dim: Size of the vector produced for each id; must match the
            saved weights.
        geo_lv3_size: Number of rows in the embedding table; must match the
            saved weights.
    """

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.latent_dim = latent_dim
        self.geo_lv3_size = geo_lv3_size
        self.model = RollUpGeoLv3Encoder(
            latent_dim, 
            geo_lv3_size
        )
        # transform() converts the result with .numpy(), which needs CPU tensors,
        # so map the checkpoint to CPU even if it was saved from another device.
        self.model.load_state_dict(torch.load(path, map_location='cpu'))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """No-op fit; only records the output width and returns ``self``."""
        # ClassNamePrefixFeaturesOutMixin.get_feature_names_out() requires this
        # attribute; without it the call raises NotFittedError.
        self._n_features_out = self.latent_dim
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Encode ids with the loaded model and return a NumPy array.

        Args:
            X: DataFrame or NumPy array of ids; values are cast to ``torch.long``.
            y: Ignored.

        Returns:
            NumPy array holding the model output for ``X``.
        """
        # Convert pd to numpy
        if isinstance(X, pd.DataFrame):
            X = X.values # type: ignore
        # Apply encoder
        self.model.eval()
        ids = torch.from_numpy(X).type(torch.long) # type: ignore
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            return self.model(ids).numpy()

In [21]:
x = features_df

# Demonstration of how to plug GeoLv3Rollup into a column transformer.
# Step 1: run geo_lv3_le on geo_level_3_id only and drop every other incoming column.
geo3_rollup_preprocessor = ColumnTransformer(
    transformers=[('geo3_le', geo_lv3_le, ['geo_level_3_id'])],
    remainder='drop',
    verbose_feature_names_out=False,
)
geo3_rollup_preprocessor.set_output(transform='pandas')

# Step 2: pass the result through the encoder weights stored under ../models.
geo_rollup_pipe = Pipeline(steps=[
    ('label_encoder', geo3_rollup_preprocessor),
    ('embedder', GeoLv3Rollup(path=Path.cwd().parent / 'models' / 'geo3-rollup-16')),
])

# The outer transformer hands all three geo columns to the pipeline and drops the rest.
c = ColumnTransformer(
    transformers=[
        ('geo_rollup', geo_rollup_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']),
    ],
    remainder='drop',
).set_output(transform='pandas')

x_ = c.fit_transform(features_df)