In [None]:
%%capture
!pip install wandb
!pip install transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [119]:
%%capture
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch

cat_features_num = 13 # num of categorical encoded features
input_dim = 312*2 + cat_features_num
output_dim = 1       
learningRate = 7e-4
epochs = 1000
batch_size = 1024


tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
model.cuda()  # uncomment it if you have a GPU

### Bert Embeder

В качестве ембедера текстовых фич мы используем ruBert-Tiny.

In [104]:
def embed_bert_cls(text):
    """
        In:
            text: Text data for embedding
        Out:
            312-dim vector of floats
        """
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()


### Head of Embeeder - fully connected layers

In [105]:
class FullyNet(torch.nn.Module):
    def __init__(self, input_dim, out_dim=1):
        super(FullyNet, self).__init__()
        self.fc_1 = torch.nn.Linear(input_dim, 384)
        self.fc_2 = torch.nn.Linear(384, 256)
        self.fc_3 = torch.nn.Linear(256, 128)
        self.fc_4 = torch.nn.Linear(128, 64)
        self.fc_5 = torch.nn.Linear(64, 8)
        self.fc_6 = torch.nn.Linear(8, out_dim)


    def forward(self, x):
        x = self.fc_1(F.normalize(x))
        x = self.fc_2(F.normalize(x))
        x = self.fc_3(F.normalize(x))
        x = self.fc_4(F.normalize(x))
        x = self.fc_5(F.normalize(x))
        out = self.fc_6(F.normalize(x))
        return out

### Load Data

In [None]:
data = pd.read_csv('/content/drive/MyDrive/skolkovo-hack2022/preprocess_data-5.csv')

In [None]:
x_candidates = data['Candidate_descr'].apply(embed_bert_cls)
x_job = data['Jobs_descr'].apply(embed_bert_cls)

In [None]:
embedded_cat_cols = pd.concat([data['Region_Candidate_listed'],
           data['Region_job_listed']], axis=1).apply(lambda x: list(map(int, x[0][1:-1].split(', '))) + list(map(int, x[1][1:-1].split(', '))), axis = 1)

In [None]:
embedded_text_cols = pd.DataFrame({'Candidate_embed':x_candidates.to_numpy(),
                                  'Jobs_embed':x_job.to_numpy(),
                                  }).apply(lambda x: list(np.concatenate((x[0], x[1]))), axis=1)

In [None]:
embedded_all_cols = pd.DataFrame({'text':embedded_text_cols.to_numpy(),
                                  'cat':embedded_cat_cols.to_numpy()
                                  }).apply(lambda x: list(np.concatenate((x[0], x[1]))), axis=1)

In [106]:
x_data = list(embedded_all_cols.values)
y_data = list(data['target'].values)

x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

In [107]:
def get_data_loader(x, y, batch_size=64):
    tensor_x = torch.Tensor(x)
    tensor_y = torch.Tensor(y)

    dataset = TensorDataset(tensor_x,tensor_y) 
    return DataLoader(dataset, batch_size=batch_size) 


train_dataloader = get_data_loader(x_train, y_train, batch_size=batch_size)
val_dataloader = get_data_loader(x_val, y_val, batch_size=batch_size)

### Build model

In [120]:
fully_model = FullyNet(input_dim, output_dim)

if torch.cuda.is_available():
    fully_model.cuda()


# criterion = torch.nn.L1Loss()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(fully_model.parameters(), lr=learningRate, momentum=0.9)
# optimizer = torch.optim.Adam(fully_model.parameters(), lr=learningRate)

In [121]:
import wandb

wandb_config = {"epochs": epochs, 
                "batch_size": batch_size, 
                "learning_rate": learningRate}

wandb.init(project="Skolkovo-hack",config = wandb_config)
wandb.watch(fully_model)

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▆▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
mae,█▆▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▃▂
mse,█▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂▂▂▂▂▂▂▂▂▃▂▂▃▃▃▃▄▃
r2,▁▅▆▇████████████████▇█▇▇▇▇▇▇▇▇▇▆▇▇▆▆▆▆▅▆

0,1
loss,42.05586
mae,0.2323
mse,0.09012
r2,-0.01622


[]

### Metrics for validation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
metric_funcs = [("mse", mean_squared_error),
                ("mae", mean_absolute_error),
                ("r2", r2_score)]


def validate(model, val_dataloader):
    y_pred = []
    y_true = []
    metrics = {}
    for x_val, y_val in val_dataloader:
        out = model(x_val.cuda())
        y_pred += [i for i in out.tolist()]
        y_true += y_val.tolist()
    for (name, mf) in metric_funcs:
        metrics[name] = mf(y_true, y_pred)
    return metrics

In [None]:
validate(fully_model, val_dataloader)

{'mse': 0.10702899989085203,
 'mae': 0.28087368112537964,
 'r2': -0.20694388141736075}

### Train Loop

In [122]:
from tqdm.notebook import tqdm


pbar = tqdm(total=epochs, position=0, leave=True)

for epoch in range(epochs):
    fully_model.train()
    loss = 0
    for inputs, lables in train_dataloader:
        optimizer.zero_grad()

        outputs = fully_model(inputs.cuda())

        batch_loss = criterion(outputs.cuda(), lables.view(-1,1).cuda())
        batch_loss.backward()

        optimizer.step()

        loss += inputs.size(0)*batch_loss.item()

    fully_model.eval()
    metrics = validate(fully_model, val_dataloader)
    
    metrics['loss'] = loss/len(train_dataloader)
    wandb.log(metrics)
    pbar.update(1)
    pbar.set_description("Training {} epoch...Loss:{} MSE:{},  MAE:{}, R2:{}".format(epoch+1, metrics['loss'], metrics["mse"], metrics["mae"], metrics["r2"]))

  0%|          | 0/1000 [00:00<?, ?it/s]

### Save Model

In [None]:
def save_model(model, path):
    torch.save(model.state_dict(), path)

In [None]:
save_model(fully_model, '/content/fully_best_model_FINALY.pt')

### Trash

In [None]:
# def embed_bert_cls(text, bert_tokenizer, bert_model):
#     t = bert_tokenizer(text, padding=True, truncation=True, return_tensors='pt')
#     with torch.no_grad():
#         model_output = bert_model(**{k: v.to(bert_model.device) for k, v in t.items()})
#     embeddings = model_output.last_hidden_state[:, 0, :]
#     embeddings = torch.nn.functional.normalize(embeddings)
#     return embeddings[0].cpu().numpy()


# class FullyNet(torch.nn.Module):
#     def __init__(self, input_dim, out_dim=1):
#         super(FullyNet, self).__init__()
#         self.fc_1 = torch.nn.Linear(input_dim, 384)
#         self.fc_2 = torch.nn.Linear(384, 256)
#         self.fc_3 = torch.nn.Linear(256, 128)
#         self.fc_4 = torch.nn.Linear(128, 64)
#         self.fc_5 = torch.nn.Linear(64, 8)
#         self.fc_6 = torch.nn.Linear(8, out_dim)


#     def forward(self, x):
#         x = self.fc_1(x)
#         x = self.fc_2(x)
#         x = self.fc_3(x)
#         x = self.fc_4(x)
#         x = self.fc_5(x)
#         out = self.fc_6(x)
#         return out


class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.input_size = inputSize
        self.output_size = outputSize
        self.linear = torch.nn.Linear(inputSize, outputSize)


    def forward(self, x):
        out = self.linear(x)
        return out


def make_predict(model, data):
    model.eval()
    return model(data).item()


def df_to_embed_tensor(ser):
    # x_candidates = embed_bert_cls(ser['Candidate_descr'], bert_tokenizer, bert_model)
    # x_job = embed_bert_cls(ser['Jobs_descr'], bert_tokenizer, bert_model)
    x_candidates = embed_bert_cls(ser['Candidate_descr'])
    x_job = embed_bert_cls(ser['Jobs_descr'])

    x = np.concatenate((x_candidates, x_job))
    return torch.Tensor(x).view(1,-1)


def inference(weights_path, data, ModelClass=FullyNet):
    """
        In:
            weights_path - path to model weights .pt
            data - as Series
            ModelClass - class of model (default FullyNet)
    """
    input_dim = 312 * 2
    out_dim = 1

    embed = df_to_embed_tensor(data)

    model = ModelClass(input_dim, out_dim)

    model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))

    return make_predict(model, embed)


path = '/content/fully_best_model_FINALY.pt'
df = data.loc[1,:]
inference(path, df, FullyNet)

0.22909753024578094