# Feature Extraction Notebook

Create Ready-to-use Sklearn Transformers

## Table of Contents:
1. [Imports](#imports)
1. [Constants](#constants)
1. [Geo Level Feature Extraction](#geo-level-feature-extraction)  
    1. [Geo Level ID: Dimensionality Reduction](#geo-dim-reduction)  
        1. [Model and Dataset Definition](#geo-dim-reduction-model-def)
        1. [Training](#geo-dim-reduction-train)
    1. [Geo Level ID: Guess Geo 3 Roll up to Geo 1 and 2](#geo3-rollup)
        1. [Model and Dataset Definition](#geo-rollup-model-def)
        1. [Training](#geo-rollup-train)
1. [All Features](#all-feature-extraction)
    1. [Simple Autoencoder](#all-feature-ae)

## Imports <a class="anchor" id="imports"></a>

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
from typing import Tuple, Any
from tqdm import tqdm
from pathlib import Path
from os import PathLike

## Constants <a id="constants"></a>

In [9]:
# Replace these with your file paths

BASE_DIR = Path('d:\\', 'ml_competitions', 'gorkha_earthquake')
DATA_DIR = BASE_DIR / 'data' / 'raw'
MODEL_DIR = BASE_DIR / 'models'
SUBMISSION_DIR = BASE_DIR / 'submissions'

TRAINING_FEATURES_PATH = DATA_DIR / "train_values.csv"
TRAINING_LABELS_PATH = DATA_DIR / "train_labels.csv"
TEST_FEATURES_PATH = DATA_DIR / "test_values.csv"
SUBMISSION_FORMAT_PATH = DATA_DIR / "submission_format.csv"

## Load Data

In [3]:
features_df         = pd.read_csv(TRAINING_FEATURES_PATH,   index_col=0)
labels_df           = pd.read_csv(TRAINING_LABELS_PATH,     index_col=0) - 1
test_features_df    = pd.read_csv(TEST_FEATURES_PATH,       index_col=0)

## Geo Level Feature Extraction <a class="anchor" id="geo-level-feature-extraction"></a>

Label Encode <a id="geo-dim-reduction-le"></a>

In [4]:
import pickle
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

df = pd.concat([features_df, test_features_df])

le1 = LabelEncoder().fit(df['geo_level_1_id'])
le2 = LabelEncoder().fit(df['geo_level_2_id'])
le3 = LabelEncoder().fit(df['geo_level_3_id'])

eval = le3.transform(test_features_df['geo_level_3_id'])

# Save All Label Encoders
with open(Path.cwd().parent / 'models' / 'geo-lv-1-label-encoder.pickle', 'wb') as f:
    pickle.dump(le1, f)
with open(Path.cwd().parent / 'models' / 'geo-lv-2-label-encoder.pickle', 'wb') as f:
    pickle.dump(le2, f)
with open(Path.cwd().parent / 'models' / 'geo-lv-3-label-encoder.pickle', 'wb') as f:
    pickle.dump(le3, f)

del le1, le2, le3

# Load All Label Encoders
with open(Path.cwd().parent / 'models' / 'geo-lv-1-label-encoder.pickle', 'rb') as f:
    le1 = pickle.load(f)
with open(Path.cwd().parent / 'models' / 'geo-lv-2-label-encoder.pickle', 'rb') as f:
    le2 = pickle.load(f)
with open(Path.cwd().parent / 'models' / 'geo-lv-3-label-encoder.pickle', 'rb') as f:
    le3 = pickle.load(f)

# Prepare Transformers
geo_lv1_le = FunctionTransformer(
    func=lambda x: np.array(le1.transform(x.values.ravel())).reshape(-1, 1),
    feature_names_out='one-to-one'
)

geo_lv2_le = FunctionTransformer(
    func=lambda x: np.array(le2.transform(x.values.ravel())).reshape(-1, 1), 
    feature_names_out='one-to-one'
)

geo_lv3_le = FunctionTransformer(
    func=lambda x: np.array(le3.transform(x.values.ravel())).reshape(-1, 1), 
    feature_names_out='one-to-one'
)

In [5]:
# Demonstration on how to use the above in a column transformer
preprocessor = ColumnTransformer([
    ('geo1_le', geo_lv1_le, ['geo_level_1_id']),
    ('geo2_le', geo_lv2_le, ['geo_level_2_id']),
    ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
], remainder='drop', verbose_feature_names_out=False)

preprocessor.set_output(transform='pandas')

pipeline = Pipeline([
    ('prep', preprocessor),
])

In [6]:
df = pd.concat([features_df, test_features_df])
df: pd.DataFrame = preprocessor.fit_transform(df) # type: ignore

# Get max ID in Geo Level 1
max_geo_lv1_id = df['geo_level_1_id'].max() + 1
# Get max ID in Geo Level 2
max_geo_lv2_id = df['geo_level_2_id'].max() + 1
# Get max ID in Geo Level 3
max_geo_lv3_id = df['geo_level_3_id'].max() + 1

print(
    max_geo_lv1_id,
    max_geo_lv2_id,
    max_geo_lv3_id
)

31 1418 11861


### Geo Level ID: Dimensionality Reduction <a class="anchor" id="geo-dim-reduction"></a>

In [16]:
geo_lv1_counts = df['geo_level_1_id'].value_counts(normalize=True)
geo_lv1_weights = np.zeros(max_geo_lv1_id)
geo_lv1_weights[geo_lv1_counts.index] = 1.0 / geo_lv1_counts.to_numpy()

geo_lv2_counts = df['geo_level_2_id'].value_counts(normalize=True)
geo_lv2_weights = np.zeros(max_geo_lv2_id)
geo_lv2_weights[geo_lv2_counts.index] = 1.0 / geo_lv2_counts.to_numpy()

geo_lv3_counts = df['geo_level_3_id'].value_counts(normalize=True)
geo_lv3_weights = np.zeros(max_geo_lv3_id)
geo_lv3_weights[geo_lv3_counts.index] = 1.0 / geo_lv3_counts.to_numpy()

Define Dataset and Model <a id="geo-dim-reduction-model-def"></a>

In [17]:
# Create dataset for Autoencoder training
dataset = TensorDataset(
    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long),

    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long)
)


class DREncoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, 16)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, 512)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 1024) 
        self.compressor = torch.nn.Linear(16+512+1024, latent_dim)

    def forward(self, x):
        x_1 = self.geo_lv1_embedder(x[:, 0])
        x_2 = self.geo_lv2_embedder(x[:, 1])
        x_3 = self.geo_lv3_embedder(x[:, 2])
        x = torch.concat((x_1, x_2, x_3), dim=1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class DRDecoder(torch.nn.Module):
    
    def __init__(self,                  
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        self.geo_lv1_predictor = torch.nn.Linear(latent_dim, geo_lv1_size)
        self.geo_lv2_predictor = torch.nn.Linear(latent_dim, geo_lv2_size)
        self.geo_lv3_predictor = torch.nn.Linear(latent_dim, geo_lv3_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        x1 = self.geo_lv1_predictor(x)
        x2 = self.geo_lv2_predictor(x)
        x3 = self.geo_lv3_predictor(x)
        return x1, x2, x3


class DRAutoEncoder(torch.nn.Module):
    
    def __init__(self,
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1428,
                 geo_lv3_size: int=12568) -> None:
        super().__init__()
        self.encoder = DREncoder(latent_dim, geo_lv1_size, geo_lv2_size, geo_lv3_size)
        self.decoder = DRDecoder(latent_dim, geo_lv1_size, geo_lv2_size, geo_lv3_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        x = self.encoder(x)
        x = torch.nn.functional.relu(x)
        x1, x2, x3 = self.decoder(x)
        return x1, x2, x3

Training <a id="geo-dim-reduction-train"></a>

In [35]:
from sklearn.utils.class_weight import compute_class_weight

In [26]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 10
dataloader = DataLoader(dataset, 128)
model = DRAutoEncoder(
    latent_dim=16, 
    geo_lv1_size=max_geo_lv1_id, 
    geo_lv2_size=max_geo_lv2_id, 
    geo_lv3_size=max_geo_lv3_id
).to(DEVICE)

has_weights = True

geo_lv1_weights = compute_class_weight('balanced', classes=df['geo_level_1_id'].unique(), y=df['geo_level_1_id'].values)
geo_lv2_weights = compute_class_weight('balanced', classes=df['geo_level_2_id'].unique(), y=df['geo_level_2_id'].values)
geo_lv3_weights = compute_class_weight('balanced', classes=df['geo_level_3_id'].unique(), y=df['geo_level_3_id'].values)

criterion_geo_lv1 = torch.nn.CrossEntropyLoss(
    (
        torch
            .from_numpy(geo_lv1_weights)
            .type(torch.float)
            .to(DEVICE)
    ) if has_weights else None
)
criterion_geo_lv2 = torch.nn.CrossEntropyLoss(
    (
        torch
            .from_numpy(geo_lv2_weights)
            .type(torch.float)
            .to(DEVICE)
    ) if has_weights else None
)
criterion_geo_lv3 = torch.nn.CrossEntropyLoss(
    (
        torch
            .from_numpy(geo_lv3_weights)
            .type(torch.float)
            .to(DEVICE)
    ) if has_weights else None
)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    print(f"EPOCH {epoch+1}")
    model.train()
    training_loss = 0.0
    for x, y in tqdm(dataloader, desc="training"):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        a = model(x)

        loss: torch.Tensor = (
            criterion_geo_lv1(a[0], y[:, 0])
            + criterion_geo_lv2(a[1], y[:, 1])
            + criterion_geo_lv3(a[2], y[:, 2])
        ) / 3

        training_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    training_loss /= len(dataloader)
    print(f"{training_loss = }")


EPOCH 1


training: 100%|██████████| 2715/2715 [00:40<00:00, 66.80it/s]


training_loss = 1.6755006012721176
EPOCH 2


training: 100%|██████████| 2715/2715 [00:40<00:00, 66.93it/s]


training_loss = 0.18183904730030515
EPOCH 3


training: 100%|██████████| 2715/2715 [00:40<00:00, 66.85it/s]


training_loss = 0.06992902428110714
EPOCH 4


training: 100%|██████████| 2715/2715 [00:41<00:00, 64.92it/s]


training_loss = 0.038181437891659294
EPOCH 5


training: 100%|██████████| 2715/2715 [00:43<00:00, 62.11it/s]


training_loss = 0.023581333787345136
EPOCH 6


training: 100%|██████████| 2715/2715 [00:41<00:00, 65.70it/s]


training_loss = 0.01687401076191333
EPOCH 7


training: 100%|██████████| 2715/2715 [00:41<00:00, 65.46it/s]


training_loss = 0.012068605263272393
EPOCH 8


training: 100%|██████████| 2715/2715 [00:41<00:00, 64.68it/s]


training_loss = 0.009244455388306793
EPOCH 9


training: 100%|██████████| 2715/2715 [00:41<00:00, 65.01it/s]


training_loss = 0.006961691445714115
EPOCH 10


training: 100%|██████████| 2715/2715 [00:41<00:00, 64.99it/s]

training_loss = 0.0062905947200633016





In [27]:
torch.save(model.encoder.state_dict(), MODEL_DIR / 'dim-reduction-16-large-w.pt')

Sklearn Transformer

In [None]:
class DREncoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1418,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.geo_lv1_embedder = torch.nn.Embedding(geo_lv1_size, 16)
        self.geo_lv2_embedder = torch.nn.Embedding(geo_lv2_size, 128)
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 128) 
        self.compressor = torch.nn.Linear(16+128+128, latent_dim)

    def forward(self, x):
        x_1 = self.geo_lv1_embedder(x[:, 0])
        x_2 = self.geo_lv2_embedder(x[:, 1])
        x_3 = self.geo_lv3_embedder(x[:, 2])
        x = torch.concat((x_1, x_2, x_3), dim=1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class GeoDimensionReduction(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv1_size: int=31,
            geo_lv2_size: int=1418,
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.model = DREncoder(
            latent_dim, 
            geo_lv1_size,
            geo_lv2_size,
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv1_size = geo_lv1_size
        self.geo_lv2_size = geo_lv2_size
        self.geo_lv3_size = geo_lv3_size
        self.model.load_state_dict(torch.load(path))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        # Convert pd to numpy
        X = X.values
        # Apply encoder
        self.model.eval()
        X = torch.from_numpy(X).type(torch.long) # type: ignore
        return self.model(X).detach().numpy()

In [None]:
x = features_df

geo_dim_reduction_pipe = Pipeline([
    ('label_encoder', preprocessor),
    ('embedder', GeoDimensionReduction(path=Path.cwd().parent / 'models' / 'dim-reduction-16')),
])

c = ColumnTransformer([
    ('geo_dim_reduction', geo_dim_reduction_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'])
], remainder='passthrough')
c.set_output(transform='pandas')
x_ = c.fit_transform(x)

### Geo Level ID: Guess Geo 3 Roll up to Geo 1 and 2 <a class="anchor" id="geo3-rollup"></a>

In [7]:
geo_lv1_counts = df['geo_level_1_id'].value_counts(normalize=True)
geo_lv1_weights = np.zeros(max_geo_lv1_id)
geo_lv1_weights[geo_lv1_counts.index] = 1.0 / geo_lv1_counts.to_numpy()

geo_lv2_counts = df['geo_level_2_id'].value_counts(normalize=True)
geo_lv2_weights = np.zeros(max_geo_lv2_id)
geo_lv2_weights[geo_lv2_counts.index] = 1.0 / geo_lv2_counts.to_numpy()

Dataset and Model Definition <a id="geo-rollup-model-def"></a>

In [13]:
# Create dataset for Autoencoder training
dataset = TensorDataset(
    torch.from_numpy(
        (
            df[['geo_level_3_id']]
                .to_numpy()
        )
    ).type(torch.long),

    torch.from_numpy(
        (
            df[['geo_level_1_id', 'geo_level_2_id']]
                .to_numpy()
        )
    ).type(torch.long)
)


class RollUpGeoLv3Encoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 1024)
        self.compressor = torch.nn.Linear(1024, latent_dim)

    def forward(self, x):
        x = self.geo_lv3_embedder(x).squeeze(1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class RollUpGeoLv3Decoder(torch.nn.Module):
    
    def __init__(self,                  
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1418) -> None:
        super().__init__()
        self.geo_lv1_predictor = torch.nn.Linear(latent_dim, geo_lv1_size)
        self.geo_lv2_predictor = torch.nn.Linear(latent_dim, geo_lv2_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x1 = self.geo_lv1_predictor(x)
        x2 = self.geo_lv2_predictor(x)
        return x1, x2


class RollUpGeoLv3AutoEncoder(torch.nn.Module):
    
    def __init__(self,
                 latent_dim: int=16, 
                 geo_lv1_size: int=31, 
                 geo_lv2_size: int=1418,
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.encoder = RollUpGeoLv3Encoder(latent_dim, geo_lv3_size)
        self.decoder = RollUpGeoLv3Decoder(latent_dim, geo_lv1_size, geo_lv2_size)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.encoder(x)
        x = torch.nn.functional.relu(x)
        x1, x2 = self.decoder(x)
        return x1, x2

Training <a id="geo-rollup-train"></a>

In [14]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 10
dataloader = DataLoader(dataset, 128)
model = RollUpGeoLv3AutoEncoder().to(DEVICE)

has_weights = True
criterion_geo_lv1 = torch.nn.CrossEntropyLoss(
    (
        torch
            .from_numpy(geo_lv1_weights)
            .type(torch.float)
            .to(DEVICE)
    ) if has_weights else None
)
criterion_geo_lv2 = torch.nn.CrossEntropyLoss(
    (
        torch
            .from_numpy(geo_lv2_weights)
            .type(torch.float)
            .to(DEVICE)
    ) if has_weights else None
)

optimizer = torch.optim.Adam(model.parameters())

for epoch in range(num_epochs):
    print(f"EPOCH {epoch+1}")
    model.train()
    training_loss = 0.0
    for x, y in tqdm(dataloader, desc="training"):
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        a = model(x)

        loss: torch.Tensor = (
            criterion_geo_lv1(a[0], y[:, 0])
            + criterion_geo_lv2(a[1], y[:, 1])
        ) / 2

        training_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    training_loss /= len(dataloader)
    print(f"{training_loss = }")

EPOCH 1


training: 100%|██████████| 2715/2715 [00:37<00:00, 71.49it/s]


training_loss = 3.782427275905293
EPOCH 2


training: 100%|██████████| 2715/2715 [00:38<00:00, 71.13it/s]


training_loss = 1.414766142370512
EPOCH 3


training: 100%|██████████| 2715/2715 [00:37<00:00, 71.54it/s]


training_loss = 0.6944428952304478
EPOCH 4


training: 100%|██████████| 2715/2715 [00:36<00:00, 74.16it/s]


training_loss = 0.40136595757825655
EPOCH 5


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.02it/s]


training_loss = 0.25141762202111645
EPOCH 6


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.61it/s]


training_loss = 0.16840937440485126
EPOCH 7


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.84it/s]


training_loss = 0.11941648974440543
EPOCH 8


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.68it/s]


training_loss = 0.09011306657158284
EPOCH 9


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.45it/s]


training_loss = 0.06959339500226198
EPOCH 10


training: 100%|██████████| 2715/2715 [00:35<00:00, 76.09it/s]

training_loss = 0.0541837265434336





In [15]:
torch.save(model.encoder.state_dict(), MODEL_DIR / 'geo3-rollup-16-large-1024')

sklearn Transformer

In [6]:
class RollUpGeoLv3Encoder(torch.nn.Module):

    def __init__(self, 
                 latent_dim: int=16, 
                 geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.geo_lv3_embedder = torch.nn.Embedding(geo_lv3_size, 128)
        self.compressor = torch.nn.Linear(128, latent_dim)

    def forward(self, x):
        x = self.geo_lv3_embedder(x).squeeze(1)
        x = torch.nn.functional.relu(x)
        return self.compressor(x)


class GeoLv3Rollup(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):

    def __init__(
            self, 
            path: PathLike,
            latent_dim: int=16, 
            geo_lv3_size: int=11861) -> None:
        super().__init__()
        self.path = path
        self.model = RollUpGeoLv3Encoder(
            latent_dim, 
            geo_lv3_size
        )
        self.latent_dim = latent_dim
        self.geo_lv3_size = geo_lv3_size
        self.model.load_state_dict(torch.load(path))

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        # Convert pd to numpy
        if isinstance(X, pd.DataFrame):
            X = X.values # type: ignore
        # Apply encoder
        self.model.eval()
        X = torch.from_numpy(X).type(torch.long) # type: ignore
        return self.model(X).detach().numpy()

In [21]:
x = features_df

# Demonstration on how to use the above in a column transformer
geo3_rollup_preprocessor = ColumnTransformer([
        ('geo3_le', geo_lv3_le, ['geo_level_3_id']),
    ], 
    remainder='drop', 
    verbose_feature_names_out=False,
).set_output(transform='pandas')

geo_rollup_pipe = Pipeline([
    ('label_encoder', geo3_rollup_preprocessor),
    ('embedder', GeoLv3Rollup(path=MODEL_DIR / 'geo3-rollup-16')),
])

c = ColumnTransformer([
    ('geo_rollup', geo_rollup_pipe, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'])
], remainder='drop')
c.set_output(transform='pandas')

x_ = c.fit_transform(features_df)