In [1]:
import pandas as pd
import os
from dataset.datasets import CustomDataset
from torch.utils.data import DataLoader
from model.layers import DenseFeatureLayer, TransformerBlock

In [2]:
# Absolute path of the encoded CSV, resolved against the current working directory.
DATA_PATH = os.path.join(os.path.abspath(os.curdir), "data/encoded_dataset.csv")

In [3]:
# Read the encoded dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [4]:
# Preview the first rows of the loaded frame.
# NOTE(review): the recorded output shows "Unnamed: 0.1" and "Unnamed: 0" columns —
# presumably index columns written out by earlier to_csv calls; confirm and drop them
# upstream if they are not needed.
df.head()

Unnamed: 0.1,Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,salary,age_log,fnlwgt_log,capital_loss_log,hours_per_week_bins,native_country
0,0,3,0,12.0,1,0,5,1,0,0,1,3.89182,11.526039,7.551187,2,1
1,1,3,4,14.0,0,3,1,1,1,10520,1,3.78419,12.374743,0.0,3,1
2,2,3,3,10.0,0,0,4,0,0,0,0,3.637586,11.474029,0.0,2,1
3,3,4,6,15.0,1,8,0,2,1,0,1,3.637586,11.633788,0.0,2,1
4,4,5,7,10.0,1,7,5,0,0,0,0,3.73767,11.31809,0.0,3,1


In [5]:
# Column groups handed to CustomDataset, plus the name of the label column.
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week_bins",
    "native_country",
]
numerical_columns = [
    "age_log",
    "fnlwgt_log",
    "education-num",
    "capital-gain",
    "capital_loss_log",
]
target = "salary"

In [6]:
# Wrap the DataFrame in the project's dataset class, telling it which columns are
# categorical, which are numerical, and which one is the target.
dataset = CustomDataset(
    df=df,
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
    target=target,
)

In [7]:
# Inspect a single sample. Per the recorded output it is a 3-tuple: a dict with one tensor
# per categorical column, a NumPy array holding the numerical values, and the target value.
dataset[0]

({'workclass': tensor(3),
  'education': tensor(0),
  'marital-status': tensor(1),
  'occupation': tensor(0),
  'relationship': tensor(5),
  'race': tensor(1),
  'sex': tensor(0),
  'hours_per_week_bins': tensor(2),
  'native_country': tensor(1)},
 array([ 3.8918203 , 11.5260391 , 12.        ,  0.        ,  7.55118687]),
 1)

In [8]:
# Batch the dataset two samples at a time (a small batch for inspecting shapes by hand).
dataloader = DataLoader(dataset=dataset, batch_size=2)

In [9]:
# Pull only the first batch out of the loader.
batch_iter = iter(dataloader)
batch = next(batch_iter)

In [10]:
# First element of the batch: per the recorded output, a dict mapping each categorical
# column name to a tensor with one entry per sample in the batch.
batch[0]

{'workclass': tensor([3, 3]),
 'education': tensor([0, 4]),
 'marital-status': tensor([1, 0]),
 'occupation': tensor([0, 3]),
 'relationship': tensor([5, 1]),
 'race': tensor([1, 1]),
 'sex': tensor([0, 1]),
 'hours_per_week_bins': tensor([2, 3]),
 'native_country': tensor([1, 1])}

In [15]:
# Feature layer configured from the dataset's `num_unique_values_dict`.
# NOTE(review): the `numerical_columns` argument receives a count (length of the list),
# not the list itself — confirm that is what DenseFeatureLayer expects.
n_numerical = len(numerical_columns)
features = DenseFeatureLayer(
    num_unique_values_dict=dataset.num_unique_values_dict,
    embedding_size=5,
    numerical_columns=n_numerical,
)

# Transformer block sized from the feature layer's reported output shape, with skip=True.
trans_block = TransformerBlock(features.output_shape, skip=True)

In [16]:
# batch[0] is the dict of categorical tensors displayed earlier in this notebook;
# batch[1] is presumably the collated numerical values — confirm against CustomDataset.
categorical_inputs = batch[0]
numerical_inputs = batch[1]
res = features(categorical_inputs, numerical_inputs)

In [17]:
# NOTE(review): as recorded in this cell's output, the call fails with
# "TypeError: sqrt(): argument 'input' (position 1) must be Tensor, not float".
# The message indicates a tensor-only sqrt is being handed a Python float. The call site
# is not visible in this notebook — presumably inside TransformerBlock (model.layers);
# fix it there (e.g. math.sqrt for scalar values) — TODO confirm.
# NOTE(review): `res` is rebound to a function of itself, so once the call works,
# re-running this cell will feed the block's own output back in. Consider a separate
# name for the result.
res = trans_block(res)

TypeError: sqrt(): argument 'input' (position 1) must be Tensor, not float

In [14]:
# NOTE(review): this cell's execution count (14) is lower than the counts of the cells
# above it (15-17), so the output recorded here was produced before those cells last ran.
# Re-run with Restart Kernel -> Run All before relying on it.
res

tensor([[-0.2690, -0.7310, -0.7300, -0.7310, -0.7256,  0.2689, -0.7310, -0.2689,
         -0.7310, -0.7310, -0.2690, -0.2690,  0.2692, -0.2689, -0.2689, -0.2689,
         -0.7310,  0.7310, -0.7310,  0.2692, -0.7310,  0.7310,  0.2671, -0.2689,
          0.2690,  0.7309,  0.7310, -0.2689,  0.7309, -0.7310,  0.7311,  0.7302,
         -0.7309,  0.2692,  0.2689,  0.7310, -0.2689, -0.2690,  0.7309,  0.7308,
          0.2689,  0.2690, -0.7309, -0.7310, -0.2685,  0.2688,  0.2676,  0.2688,
          0.7310,  0.2689],
        [ 0.7310,  0.2690,  0.2688,  0.2689,  0.2669, -0.7310,  0.2690,  0.7310,
          0.2689,  0.2690,  0.7310,  0.7310, -0.7307,  0.7310,  0.7310,  0.7309,
          0.2690, -0.2690,  0.2689, -0.7308,  0.2690, -0.2689, -0.7259,  0.7310,
         -0.7310, -0.2691, -0.2690,  0.7310, -0.2689,  0.2689, -0.2689, -0.2686,
          0.2690, -0.7308, -0.7310, -0.2689,  0.7310,  0.7310, -0.2689, -0.2689,
         -0.7310, -0.7309,  0.2689,  0.2689,  0.7298, -0.7307, -0.7272, -0.7306,
