In [0]:
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision torchtext
import torch

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
!python -m spacy download en

#####################################################
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os

######################################################

!python -m spacy download en
!pip install msgpack==0.5.6
!pip install spacy==2.0.0

# WHAT YOU NEED TO DO
# This piece of code will install everything to colab. However, you need to upload the csv files to your drive
# to the My Drive/Colab Notebooks/data/' path. When you run this part, it will ask you to connect your own google drive
# so you will need to give access to it - it seems i can't share my drive through Colab unfortunately

# import torch.backends.cudnn as cudnn
# cudnn.enabled = False

Mounted at /content/gdrive
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 69.5MB/s 

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 58.5MB/s 

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6

In [0]:
"""
Data pre-processing file
"""

import pandas as pd
import numpy as np
import os
from torchtext import data, vocab
import torch
import spacy
from random import randint

spacy_en = spacy.load('en')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class TalentFox:
    """
    Class to handle the TalentFox data

    Predict:
    match_status

    Columns for candidate:
    candidate_city, candidate_state, candidate_country, candidate_title, candidate_birth_date,
    candidate_current_fixed_salary, candidate_current_bonus_salary, candidate_in_job_market_since,
    candidate_other_languages, candidate_is_looking_for_new_job, candidate_wish_2, candidate_wish_3, candidate_wish_1,
    candidate_education, candidate_language_negotiative, candidate_language_basic, candidate_language_fluent,
    candidate_highest_degree, candidate_career_type, candidate_industries, candidate_professions, candidate_resume,
    candidate_feedback, candidate_professions_global, candidate_industries_global, candidate_relocation_ready,

    Columns for job:
    job_fixed_salary, job_bonus_salary, job_title, job_vacation_days, job_needed_experience, job_language,
    job_description, job_daily_tasks_of_job, job_required_experience_of_candidate,
    job_preferred_experience_of_candidate, job_preferred_education_of_candidate, job_max_candidate_age,
    job_min_candidate_age, job_company_structure, job_language_skills_negotiative, job_language_skills_basic,
    job_candidate_radius, job_candidate_relocation, job_city, job_state, job_country, job_time_model, job_max_salary,
    job_questions_for_candidate, match_employer_feedback
    """

    def __init__(self, batch_size=100):
        print('Device: ' + str(device))

        self.candidate_title = data.Field(sequential=True, lower=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
        self.candidate_resume = data.Field(sequential=True, lower=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
        self.job_title = data.Field(sequential=True, lower=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
        self.job_description = data.Field(sequential=True, lower=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
        self.match_status = data.Field(sequential=False, use_vocab=False)

        self.train_set, self.validation_set = data.TabularDataset.splits(
            path='./gdrive/My Drive/Colab Notebooks/data/TalentFox/',
            train='train_data.csv',
            validation='val_data.csv',
            format='csv',
            fields=[
                ('index', None),
                ('job_title', self.job_title),
                ('job_description', self.job_description),
                ('candidate_title', self.candidate_title),
                ('candidate_resume', self.candidate_resume),
                ('match_status', self.match_status)
            ],
            skip_header=True,
        )

        self.train_iter, self.validation_iter = data.BucketIterator.splits(
            (self.train_set, self.validation_set),
            batch_size=batch_size,
            shuffle=True,
            device=device,
            sort_key=lambda x: len(x.job_title),
            sort_within_batch=True,
            repeat=True)

        self.match_status.build_vocab(self.train_set)
        url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.de.vec'
        self.job_title.build_vocab(self.train_set, vectors=vocab.Vectors('wiki.de.vec', url=url))
        self.job_description.build_vocab(self.train_set, vectors=vocab.Vectors('wiki.de.vec', url=url))
        self.candidate_title.build_vocab(self.train_set, vectors=vocab.Vectors('wiki.de.vec', url=url))
        self.candidate_resume.build_vocab(self.train_set, vectors=vocab.Vectors('wiki.de.vec', url=url))
        
STOP_WORDS = {'(', ')', '/', 'm', 'w', '-', ' ', '.', '\t'}

def tokenizer(text):  # create a tokenizer function
    tokens = [tok.text for tok in spacy_en.tokenizer(text)]
    tokens = list(filter(lambda token: token not in STOP_WORDS, tokens))
    return tokens

In [0]:
"""
Model file
"""
import torch
import torch.nn as nn
import numpy as np
import torch.nn.utils.rnn as rnn

max_rating = 5.0
min_rating = 0.5

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class TalentNetExperimental(nn.Module):
    def __init__(self, job_title, job_description, candidate_title, candidate_resume, p1=0.2, p2=0.2, p3=0.2):
        super(TalentNetExperimental, self).__init__()
        self.job_title_vectors = job_title.vocab.vectors
        self.job_title_num_embeddings = self.job_title_vectors.size()[0]
        self.job_title_embedding_dim = self.job_title_vectors.size()[1]
        
        self.job_description_vectors = job_description.vocab.vectors
        self.job_description_num_embeddings = self.job_description_vectors.size()[0]
        self.job_description_embedding_dim = self.job_description_vectors.size()[1]

        self.candidate_title_vectors = candidate_title.vocab.vectors
        self.candidate_title_num_embeddings = self.candidate_title_vectors.size()[0]
        self.candidate_title_embedding_dim = self.candidate_title_vectors.size()[1]

        self.candidate_resume_vectors = candidate_resume.vocab.vectors
        self.candidate_resume_num_embeddings = self.candidate_resume_vectors.size()[0]
        self.candidate_resume_embedding_dim = self.candidate_resume_vectors.size()[1]

        self.job_title_embeddings = nn.Embedding(self.job_title_num_embeddings, self.job_title_embedding_dim)
        self.job_title_embeddings.weight.data.copy_(self.job_title_vectors)

        self.job_description_embeddings = nn.Embedding(self.job_description_num_embeddings, self.job_description_embedding_dim)
        self.job_description_embeddings.weight.data.copy_(self.job_description_vectors)

        self.candidate_title_embeddings = nn.Embedding(self.candidate_title_num_embeddings, self.candidate_title_embedding_dim)
        self.candidate_title_embeddings.weight.data.copy_(self.candidate_title_vectors)

        self.candidate_resume_embeddings = nn.Embedding(self.candidate_resume_num_embeddings, self.candidate_resume_embedding_dim)
        self.candidate_resume_embeddings.weight.data.copy_(self.candidate_resume_vectors)

        self.lin1 = nn.Sequential(
            nn.Dropout(p1),
            nn.Linear(1200, 400),
            nn.ReLU(),
        )

        self.lin2 = nn.Sequential(
            nn.Dropout(p2),
            nn.Linear(400, 100),
            nn.ReLU(),
        )

        self.lin3 = nn.Sequential(
            nn.Dropout(p3),
            nn.Linear(100, 1),
            nn.ReLU(),
        )

    def forward(self, data):
        job_title = data.job_title
        job_description = data.job_description
        candidate_title = data.candidate_title
        candidate_resume = data.candidate_resume

        numpy_job = job_title.cpu().data.numpy()
        num_non_ones = np.count_nonzero(np.subtract(numpy_job, np.ones(numpy_job.shape)), axis=0)
        num_non_ones = np.repeat(np.expand_dims(num_non_ones, 1), self.job_title_embedding_dim, 1)
        num_non_ones = torch.tensor(num_non_ones).to(device).float()

        job_title = self.job_title_embeddings(job_title)
        job_title = torch.sum(job_title, 0).to(device) / num_non_ones

        numpy_job = job_description.cpu().data.numpy()
        num_non_ones = np.count_nonzero(np.subtract(numpy_job, np.ones(numpy_job.shape)), axis=0)
        num_non_ones = np.repeat(np.expand_dims(num_non_ones, 1), self.job_description_embedding_dim, 1)
        num_non_ones = torch.tensor(num_non_ones).to(device).float()

        job_description = self.job_description_embeddings(job_description)
        job_description = torch.sum(job_description, 0).to(device) / num_non_ones

        numpy_candidate = candidate_title.cpu().data.numpy()
        num_non_ones = np.count_nonzero(np.subtract(numpy_candidate, np.ones(numpy_candidate.shape)), axis=0)
        num_non_ones = np.repeat(np.expand_dims(num_non_ones, 1), self.candidate_title_embedding_dim, 1)
        num_non_ones = torch.tensor(num_non_ones).to(device).float()

        candidate_title = self.candidate_title_embeddings(candidate_title)
        candidate_title = torch.sum(candidate_title, 0).to(device) / num_non_ones

        numpy_candidate = candidate_resume.cpu().data.numpy()
        num_non_ones = np.count_nonzero(np.subtract(numpy_candidate, np.ones(numpy_candidate.shape)), axis=0)
        num_non_ones = np.repeat(np.expand_dims(num_non_ones, 1), self.candidate_resume_embedding_dim, 1)
        num_non_ones = torch.tensor(num_non_ones).to(device).float()

        candidate_resume = self.candidate_resume_embeddings(candidate_resume)
        candidate_resume = torch.sum(candidate_resume, 0).to(device) / num_non_ones

        x = (job_title * job_description * candidate_title * candidate_resume).sum(1)

        """
        catted = torch.cat([job_title, job_description, candidate_title, candidate_resume], dim=1)

        x = self.lin1(catted)
        x = self.lin2(x)
        x = self.lin3(x)
        """
        out = torch.sigmoid(x)

        return out


In [0]:
from types import SimpleNamespace
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from random import randint

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def talent_fox_train(train_iter, val_iter, net, optimizer, criterion, ratio, num_epochs=5):
    net.train()
    prev_epoch = 0
    train_loss = []
    train_accs = []
    train_accs_pos = 0
    train_sum = 0
    val_res = []
    for batch in train_iter:
        (job_title, job_title_lengths) = batch.job_title
        (job_description, job_description_lengths) = batch.job_description
        (candidate_title, candidate_title_lengths) = batch.candidate_title
        (candidate_resume, candidate_resume_lengths) = batch.candidate_resume
        match_status = batch.match_status

        net.train()

        batch_sampling = {'job_title': job_title, 'job_description': job_description, 'candidate_title': candidate_title, 'candidate_resume': candidate_resume}
        output = net(SimpleNamespace(**batch_sampling)).reshape(-1)
        targets = match_status.float().to(device)
        criterion.weight = weights(targets, ratio)
        batch_loss = criterion(output, targets)

        train_loss.append(get_numpy(batch_loss))
        train_accs.append(accuracy_sigmoid(output, targets))
        train_accs_pos += accuracy_talent(output, targets)
        train_sum += sum_targets(targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()


        if train_iter.epoch != prev_epoch:
            net.eval()
            val_loss, val_accs, val_accs_pos, val_length, val_sum = 0, 0, 0, 0, 0

            for val_batch in val_iter:
                if val_iter.epoch != train_iter.epoch-1:
                    break
                (job_title, job_title_lengths) = val_batch.job_title
                (job_description, job_description_lengths) = val_batch.job_description
                (candidate_title, candidate_title_lengths) = val_batch.candidate_title
                (candidate_resume, candidate_resume_lengths) = val_batch.candidate_resume
                match_status = val_batch.match_status

                batch_sampling = {'job_title': job_title, 'job_description': job_description,
                                  'candidate_title': candidate_title, 'candidate_resume': candidate_resume}
                val_output = net(SimpleNamespace(**batch_sampling)).reshape(-1)
                val_target = match_status.float().to(device)
                val_loss += criterion(val_output, val_target) * val_batch.batch_size
                val_accs += accuracy_sigmoid(val_output, val_target) * val_batch.batch_size
                val_accs_pos += accuracy_talent(val_output, val_target)
                val_length += val_batch.batch_size
                val_sum += sum_targets(val_target)

            val_loss /= val_length
            val_accs /= val_length
            val_res.append(val_accs)

            print(
                "Epoch {}: Train loss: {:.3f},  Train accs total: {:.3f}, Train accs positive: {:.3f}"
                    .format(train_iter.epoch, np.mean(train_loss), 1.0 - np.mean(train_accs), train_accs_pos/train_sum))
            print(
                "          Validation loss: {:.3f}, Validation accs total: {:.3f}, Validation accs positive: {:.3f}"
                    .format(val_loss, 1.0 - val_accs, val_accs_pos/val_sum))
            print()
            train_loss = []
            train_accs = []
            train_accs_pos = 0
            train_sum = 0
            net.train()

        prev_epoch = train_iter.epoch
        if train_iter.epoch == num_epochs:
            break

def negative_sampling(users, docs, num_user):
    if torch.cuda.is_available():
        random_user = torch.tensor(
        [randint(0, num_user) for _ in range(len(users))]
        ).to(device)
    else:
        random_user = torch.tensor(
            [randint(0, num_user-1) for _ in range(len(users))]
        ).to(device)

    author = torch.cat((users, random_user), 0).to(device)
    doc_title = torch.cat((docs, docs), 1).to(device)

    batch_with_negative_sampling = {'user': author, 'doc_title': doc_title}
    return SimpleNamespace(**batch_with_negative_sampling)

def plot_res(train_res, val_res, num_res):
    x_vals = np.arange(num_res)
    plt.figure()
    plt.plot(x_vals, train_res, 'r', x_vals, val_res, 'b')
    plt.legend(['Train Accucary', 'Validation Accuracy'])
    plt.xlabel('Updates'), plt.ylabel('Acc')

def accuracy_one_hot(output, target):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(output, 1)[1], target)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

def sum_targets(targets):
    return torch.sum(targets)

def accuracy_talent(output, targets):
    correct_predictions = 0
    for idx, val in enumerate(output):
        if val > 0.5 and targets[idx] == torch.tensor(1.0):
            correct_predictions += 1
    return correct_predictions

def accuracy_sigmoid(output, target):
    return torch.mean(torch.abs(output - target).float()).to(device).data.numpy()


def accuracy(output, target):
    return torch.mean(torch.abs(torch.round(output) - target)).to(device).data.numpy()

def weights(target, ratio):
    weight = []
    for val in target:
        if val == torch.tensor(1.):
            weight.append(ratio)
        else:
            weight.append(1.)
    return torch.tensor(weight)

def print_params(net):
    for name, param in net.named_parameters():
        if param.requires_grad:
            print(name, param.data)


def get_numpy(loss):
    return loss.to(device).data.numpy()

In [0]:
from torch import optim, nn

tf = TalentFox(batch_size=100)

train_iter = tf.train_iter
val_iter = tf.validation_iter

job_title = tf.job_title
job_description = tf.job_description
candidate_title = tf.candidate_title
candidate_resume = tf.candidate_resume

ratio = (train_iter.dataset.fields['match_status'].vocab.freqs['0']/train_iter.dataset.fields['match_status'].vocab.freqs['1'])

net = TalentNetExperimental(job_title=job_title, job_description=job_description, candidate_title=candidate_title, candidate_resume=candidate_resume).to(device)
opt = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCELoss()

talent_fox_train(train_iter=train_iter, val_iter=val_iter, net=net, optimizer=opt, criterion=criterion, ratio=ratio, num_epochs=1000)

Device: cpu
Epoch 1: Train loss: 1.373,  Train accs total: 0.470, Train accs positive: 0.909
          Validation loss: 1.701, Validation accs total: 0.472, Validation accs positive: 0.909

Epoch 2: Train loss: 1.299,  Train accs total: 0.483, Train accs positive: 0.967
          Validation loss: 1.081, Validation accs total: 0.487, Validation accs positive: 0.909

Epoch 3: Train loss: 1.270,  Train accs total: 0.499, Train accs positive: 0.950
          Validation loss: 1.227, Validation accs total: 0.498, Validation accs positive: 0.873

Epoch 4: Train loss: 1.231,  Train accs total: 0.511, Train accs positive: 0.942
          Validation loss: 1.233, Validation accs total: 0.515, Validation accs positive: 0.745

Epoch 5: Train loss: 1.191,  Train accs total: 0.532, Train accs positive: 0.880
          Validation loss: 1.083, Validation accs total: 0.542, Validation accs positive: 0.545

Epoch 6: Train loss: 1.147,  Train accs total: 0.557, Train accs positive: 0.880
          Validat