In [1]:
! git clone https://github.com/Firyuza/SiriusDL.git

Cloning into 'SiriusDL'...
remote: Enumerating objects: 195, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (157/157), done.[K
remote: Total 195 (delta 50), reused 168 (delta 27), pack-reused 0[K
Receiving objects: 100% (195/195), 39.63 MiB | 27.33 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [2]:
from IPython.display import clear_output
! pip install pytorch_lightning
clear_output()

In [3]:
import torch
print(torch.__version__)

import torch.optim as optim
import torch.utils.data as data_utils

from torch.utils.tensorboard import SummaryWriter
from pytorch_lightning.metrics import Accuracy

from SiriusDL.week08.deepFM.network import DeepFMNet
from SiriusDL.week08.deepFM.data_loader import CustomDataset

1.8.1+cu101


In [4]:
%load_ext tensorboard

In [5]:
EPOCHS = 20
EMBEDDING_SIZE = 5
BATCH_SIZE = 512
NROF_LAYERS = 3
NROF_NEURONS = 50
DEEP_OUTPUT_SIZE = 50
NROF_OUT_CLASSES = 1
LEARNING_RATE = 3e-4
TRAIN_PATH = '/content/SiriusDL/week08/data/train_adult.pickle'
VALID_PATH = '/content/SiriusDL/week08/data/valid_adult.pickle'

In [6]:
import numpy as np
import pandas as pd
import pickle

In [7]:
embedding_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
                                 'sex', 'native-country']
nrof_emb_categories = {}
unique_categories = {}

with open('/content/SiriusDL/week08/data/train_adult.pickle', 'rb') as f:
    data, _, _ = pickle.load(f)

for cat in embedding_columns:
    nrof_unique = np.unique(data[cat].values.astype(np.str))
    # data.groupby(cat).agg({cat: 'count'})
    unique_categories[cat] = nrof_unique
    nrof_emb_categories[cat] = len(nrof_unique)
    data[cat + '_cat'] = [np.where(nrof_unique == val)[0][0] for i, val in enumerate(data[cat].values.astype(np.str))]

data.dropna(axis=0,inplace=True)
min_age = data.age.min()  
max_age = data.age.max()
step = (max_age - min_age)/10

feature_list = data['age'].unique()

for i in feature_list:
    mask = (data['age'] == i)
    g = np.floor((i - min_age)/ step)
    data.loc[mask, 'age' + '_bin'] = g

data = data.drop(columns=["age"])  
data.age_bin = data.age_bin.astype(int)
with open('/content/SiriusDL/week08/data/train_adult.pickle', 'wb') as f:
    pickle.dump([data, nrof_emb_categories, unique_categories], f)


with open('/content/SiriusDL/week08/data/valid_adult.pickle', 'rb') as f:
    data, _, _ = pickle.load(f)

for cat in embedding_columns:
    data[cat + '_cat'] = [np.where(unique_categories[cat] == val)[0][0] for i, val in enumerate(data[cat].values.astype(np.str))]
data.dropna(axis=0,inplace=True)
feature_list = data['age'].unique()

for i in feature_list:
    mask = (data['age'] == i)
    g = np.floor((i - min_age)/ step)
    data.loc[mask, 'age' + '_bin'] = g

data = data.drop(columns=["age"])  
data.age_bin = data.age_bin.astype(int)
with open('/content/SiriusDL/week08/data/valid_adult.pickle', 'wb') as f:
    pickle.dump([data, nrof_emb_categories, unique_categories], f)

In [8]:
import torch
from torch.utils.data import Dataset

In [9]:
class CustomDataset(Dataset):
    def __init__(self, dataset_path):
        with open(dataset_path, 'rb') as f:
            data, self.nrof_emb_categories, self.unique_categories = pickle.load(f)

        self.embedding_columns = ['workclass_cat', 'education_cat', 'marital-status_cat', 'occupation_cat',
                                  'relationship_cat', 'race_cat',
                                  'sex_cat', 'native-country_cat']
        self.nrof_emb_categories = {key + '_cat': val for key, val in self.nrof_emb_categories.items()}
        self.numeric_columns = ['age_bin', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
                                'hours-per-week']

        self.columns = self.embedding_columns + self.numeric_columns

        self.X = data[self.columns].reset_index(drop=True)
        self.y = np.asarray([0 if el == '<50k' else 1 for el in data['salary'].values], dtype=np.int32)

        return

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        row = self.X.take([idx], axis=0)

        row = {col: torch.tensor(row[col].values, dtype=torch.float32) for i, col in enumerate(self.columns)}

        return row, self.y[idx]

In [10]:
class DeepFM:
    def __init__(self):
        self.train_dataset = CustomDataset(TRAIN_PATH)
        self.val_dataset = CustomDataset(VALID_PATH)
        self.train_loader = data_utils.DataLoader(dataset=self.train_dataset,
                                                  batch_size=BATCH_SIZE, shuffle=True)
        self.val_loader = data_utils.DataLoader(dataset=self.val_dataset,
                                                  batch_size=BATCH_SIZE, shuffle=False)

        self.build_model()

        self.log_params()

        self.train_writer = SummaryWriter('./logs/train')
        self.valid_writer = SummaryWriter('./logs/valid')
        self.train_writer.add_text('LEARNING_RATE', str(LEARNING_RATE))
        self.train_writer.add_text('BATCH_SIZE', str(BATCH_SIZE))

        return

    def build_model(self):
        self.network = DeepFMNet(nrof_cat=self.train_dataset.nrof_emb_categories, emb_dim=EMBEDDING_SIZE,
                                 emb_columns=self.train_dataset.embedding_columns,
                                 numeric_columns=self.train_dataset.numeric_columns,
                                 nrof_layers=NROF_LAYERS, nrof_neurons=NROF_NEURONS,
                                 output_size=DEEP_OUTPUT_SIZE,
                                 nrof_out_classes=NROF_OUT_CLASSES)

        self.loss = torch.nn.BCEWithLogitsLoss()
        self.accuracy = Accuracy()
        self.optimizer = optim.Adam(self.network.parameters(), lr=LEARNING_RATE)

        return

    def log_params(self):
        return

    def load_model(self, restore_path=''):
        if restore_path == '':
            self.step = 0
        else:
            pass

        return

    def run_train(self):
        print('Run train ...')

        self.load_model()

        for epoch in range(EPOCHS):
            self.network.train()

            for features, label in self.train_loader:
                # Reset gradients
                self.optimizer.zero_grad()

                output = self.network(features)
                
                # Calculate error and backpropagate
                loss = self.loss(output, torch.tensor(label, dtype=torch.float32))

                output = torch.sigmoid(output)

                loss.backward()
                acc = self.accuracy(output, label).item()

                # Update weights with gradients
                self.optimizer.step()

                self.train_writer.add_scalar('CrossEntropyLoss', loss, self.step)
                self.train_writer.add_scalar('Accuracy', acc, self.step)

                self.step += 1

                if self.step % 50 == 0:
                    print('EPOCH %d STEP %d : train_loss: %f train_acc: %f' %
                          (epoch, self.step, loss.item(), acc))

            #self.train_writer.add_histogram('hidden_layer', self.network.linear1.weight.data, self.step)
            #self.train_writer.add_histogram('hidden_layer', model.linear1.weight.data, self.step)

            # Run validation
            running_loss = []
            valid_scores = []
            valid_labels = []
            self.network.eval()
            with torch.no_grad():
                for features, label in self.val_loader:
                    output = self.network(features)
                    # Calculate error and backpropagate
                    loss = self.loss(output, torch.tensor(label, dtype=torch.float32))

                    running_loss.append(loss.item())
                    output = torch.sigmoid(output)
                    valid_scores.extend(output)
                    
                    valid_labels.extend(label)

            valid_accuracy = self.accuracy(torch.tensor(valid_scores), torch.tensor(valid_labels)).item()
            self.valid_writer.add_scalar('CrossEntropyLoss', np.mean(running_loss), self.step)
            self.valid_writer.add_scalar('Accuracy', valid_accuracy, self.step)
            
            print('EPOCH %d : valid_loss: %f valid_acc: %f' % (epoch, np.mean(running_loss), valid_accuracy))
        return

In [11]:
rm -rf /content/tboard_logs

In [12]:
deep_fm = DeepFM()
deep_fm.run_train()

Run train ...


  first_order_embd_output = self.first_order_embd[col](torch.tensor(input_data[self.emb_columns[i]], dtype=torch.int64))
  [first_order_embd_output, self.first_order_embd[col](torch.tensor(input_data[self.emb_columns[i]], dtype=torch.int64))],
  torch.tensor(input_data[self.emb_columns[i]], dtype=torch.int64))
  self.second_order_embd[col](torch.tensor(input_data[self.emb_columns[i]], dtype=torch.int64))],


EPOCH 0 STEP 50 : train_loss: 0.486927 train_acc: 0.783784




EPOCH 0 : valid_loss: 0.461073 valid_acc: 0.800913
EPOCH 1 STEP 100 : train_loss: 0.335166 train_acc: 0.845946
EPOCH 1 : valid_loss: 0.352392 valid_acc: 0.842934
EPOCH 2 STEP 150 : train_loss: 0.320412 train_acc: 0.851351
EPOCH 2 : valid_loss: 0.325733 valid_acc: 0.855052
EPOCH 3 STEP 200 : train_loss: 0.332980 train_acc: 0.845946
EPOCH 3 : valid_loss: 0.318405 valid_acc: 0.858829
EPOCH 4 STEP 250 : train_loss: 0.299547 train_acc: 0.862162
EPOCH 4 : valid_loss: 0.314936 valid_acc: 0.859144
EPOCH 5 STEP 300 : train_loss: 0.351934 train_acc: 0.829730
EPOCH 5 : valid_loss: 0.312697 valid_acc: 0.860403
EPOCH 6 STEP 350 : train_loss: 0.311208 train_acc: 0.851351
EPOCH 6 : valid_loss: 0.311219 valid_acc: 0.860403
EPOCH 7 STEP 400 : train_loss: 0.307016 train_acc: 0.862162
EPOCH 7 : valid_loss: 0.310168 valid_acc: 0.864023
EPOCH 8 STEP 450 : train_loss: 0.312038 train_acc: 0.867568
EPOCH 8 : valid_loss: 0.310095 valid_acc: 0.862921
EPOCH 9 STEP 500 : train_loss: 0.279862 train_acc: 0.867568
E

In [14]:
#%tensorboard --logdir /content/logs