# Loading packages and libraries

In [None]:
# Install TensorFlow Text into the kernel's environment (imported below as `text`).
# NOTE(review): no version is pinned, so the installed release may differ between runs.
%pip install tensorflow_text

In [None]:
# Install TensorFlow Addons into the kernel's environment (imported below as `tfa`).
# NOTE(review): no version is pinned, and no use of `tfa` is visible in this notebook — confirm it is needed.
%pip install tensorflow_addons

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import tensorflow_addons as tfa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Ask TensorFlow which physical GPUs it can see; nothing is printed when the list is empty.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    # Report how many physical GPUs were found and how many logical GPUs are visible now.
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPU


In [None]:
# NOTE(review): `tf.device(...)` only creates a device-scope context manager (see the repr in
# this cell's output). It is never entered with `with`, so this line does not appear to pin
# any ops. The previous cell also limits visibility to the first GPU, so 'GPU:2' is unlikely
# to exist — confirm the intent.
tf.device('/device:GPU:2')

<tensorflow.python.eager.context._EagerDeviceContext at 0x7f2560240e00>

In [None]:
# Load the labelled dataset from a path relative to the notebook's working directory.
data_df = pd.read_csv('label_raw.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
# NOTE(review): the rendered preview includes resume URLs; consider clearing the output before sharing.
data_df.head()

Unnamed: 0,Title,IsEnglish,SpecCat,ExpPeriod,Salary,Age,Gender,City,WorkType,WorkSchedule,...,EduLevel,N_Unis,LastUni,LastUni2,Top 10 Uni,URL,lastUniFaculty,lastUni2Faculty,DataAnalyst,DataScientist
0,BI аналитик,0.0,Продуктовый аналитик\nМаркетолог-аналитик\nАна...,32.0,,25.0,0.0,Москва,1.0,0.0,...,Высшее образование (Магистр),5.0,"Московский гуманитарно-экономический институт,...",Российский экономический университет им. Г.В. ...,0.0,https://hh.ru/resume/83da3b970003d5d5870039ed1...,"аспирантура, Экономика и управление народным х...","МШБиМЭ, Международные Корпоративные Финансы (н...",1.0,0.0
1,Программист-разработчик,0.0,"Программист, разработчик\n",15.0,,21.0,0.0,Москва,1.0,0.0,...,Высшее образование (Бакалавр),1.0,"Челябинский государственный университет, Челяб...",,0.0,https://hh.ru/resume/8606f37b000773de980039ed1...,"ИИТ, Программная инженерия",,1.0,0.0
2,"Ведущий аналитик, BI-аналитик, Аналитик данных...",0.0,"Программист, разработчик\nBI-аналитик, аналити...",78.0,,38.0,0.0,Пенза,1.0,0.0,...,Высшее образование,1.0,Пензенский Государственный Университет,,0.0,https://hh.ru/resume/e142604300013aa98d0039ed1...,"Факультет вычислительной техники, Специалист п...",,1.0,1.0
3,Разработчик BI,0.0,"Программист, разработчик\n",100.0,170000.0,29.0,1.0,Москва,1.0,0.0,...,Высшее образование (Магистр),2.0,Warsaw School of Economics (Master Degree),Warsaw School of Economics (Bachelor Degree),0.0,https://hh.ru/resume/cb3d9ca600020eb8d00039ed1...,Advanced analytics Big Data,"Finance and accounting, Banking",1.0,0.0
4,Аналитик,0.0,Аналитик\n,30.0,160000.0,24.0,0.0,Москва,1.0,0.0,...,Высшее образование (Магистр),3.0,Гжельский государственный художественно-промыш...,Российская академия народного хозяйства и госу...,0.0,https://hh.ru/resume/ee2448210005c2f0080039ed1...,"Экономика, Экономика",Экономический факультет,1.0,0.0


# Undersampling

In [None]:
# Build a class-balanced DataAnalyst dataset by undersampling the negative class.
df_da_1 = data_df[data_df['DataAnalyst']==1]
df_da_0 = data_df[data_df['DataAnalyst']==0]
# Draw as many negatives as there are positives; the fixed seed makes the draw repeatable.
df_da_0 = df_da_0.sample(len(df_da_1), random_state=42)

# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
df_da_undersampling = pd.concat([df_da_0, df_da_1])
# Shuffle so the two classes are interleaved, then discard the old index.
df_da_undersampling = df_da_undersampling.sample(frac=1, random_state=42).reset_index(drop=True)
# The label of the other task must not remain among the features of this one.
df_da_undersampling = df_da_undersampling.drop(['DataScientist'], axis=1)

# Split into the feature frame and the DataAnalyst target.
X_da_undersampling, y_da_undersampling = df_da_undersampling.drop(['DataAnalyst'], axis=1), df_da_undersampling['DataAnalyst']

  df_da_undersampling = df_da_0.append(df_da_1)


In [None]:
# Build a class-balanced DataScientist dataset by undersampling the negative class.
df_ds_1 = data_df[data_df['DataScientist']==1]
df_ds_0 = data_df[data_df['DataScientist']==0]
# Draw as many negatives as there are positives; the fixed seed makes the draw repeatable.
df_ds_0 = df_ds_0.sample(len(df_ds_1), random_state=42)

# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
df_ds_undersampling = pd.concat([df_ds_0, df_ds_1])
# Shuffle so the two classes are interleaved, then discard the old index.
df_ds_undersampling = df_ds_undersampling.sample(frac=1, random_state=42).reset_index(drop=True)
# The label of the other task must not remain among the features of this one.
df_ds_undersampling = df_ds_undersampling.drop(['DataAnalyst'], axis=1)

# Split into the feature frame and the DataScientist target.
X_ds_undersampling, y_ds_undersampling = df_ds_undersampling.drop(['DataScientist'], axis=1), df_ds_undersampling['DataScientist']

  df_ds_undersampling = df_ds_0.append(df_ds_1)


In [None]:
# Hold out 33% of the balanced DataAnalyst data; the fixed seed keeps the split reproducible.
X_train_da, X_test_da, y_train_da, y_test_da = train_test_split(X_da_undersampling, y_da_undersampling, test_size=0.33, random_state=42)

In [None]:
# Hold out 33% of the balanced DataScientist data; the fixed seed keeps the split reproducible.
X_train_ds, X_test_ds, y_train_ds, y_test_ds = train_test_split(X_ds_undersampling, y_ds_undersampling, test_size=0.33, random_state=42)

# Oversampling

In [None]:
# Feature frames and targets built from the complete (unbalanced) dataset.
# Both feature frames exclude both label columns; only the target differs per task.
X_da = data_df.drop(columns=['DataAnalyst', 'DataScientist'])
y_da = data_df['DataAnalyst']
X_ds = data_df.drop(columns=['DataAnalyst', 'DataScientist'])
y_ds = data_df['DataScientist']

# BERT feature extraction

In [None]:
def concating_features(features, df):
  """Merge several columns into a single ``text`` column.

  Missing values are first replaced with empty strings. For every row the
  values of ``features`` are then converted with ``str`` and joined in the
  given order, each one followed by ``' [SEP] '`` (so the merged string also
  ends with that separator).

  Args:
    features: column names to merge, in output order.
    df: frame containing those columns; it is not modified.

  Returns:
    A new frame with the ``features`` columns dropped and a ``text`` column
    holding the merged strings.
  """
  separator = ' [SEP] '
  filled = df.fillna("")

  # One merged string per row, built in row order.
  merged_rows = [
      ''.join(str(record[name]) + separator for name in features)
      for _, record in filled.iterrows()
  ]

  return filled.assign(text=merged_rows).drop(columns=features)

In [None]:
# Columns that are merged into one string per row and fed to the BERT classifier.
text_features = ['Description', 'LastWorkDesc', 'LastWorkDesc2']

In [None]:
# Keep only the text columns of the balanced DataScientist feature frame.
X_text_ds_undersampling = X_ds_undersampling[text_features]

In [None]:
# Build the single `text` column directly from the source frame instead of from
# `X_text_ds_undersampling` itself, so that re-running this cell does not fail on
# feature columns that were already dropped.
X_text_ds_undersampling = concating_features(text_features, X_ds_undersampling[text_features])

In [None]:
def Bert_model(X, y,
               preprocess_url="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
               encoder_url="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
               epochs=10,
               dropout_rate=0.1):
  """Build, compile and fit a binary text classifier on top of a TF Hub BERT encoder.

  Raw strings go through the preprocessing layer and then the encoder. The
  encoder's ``pooled_output`` passes through dropout and a single sigmoid
  unit. The model is compiled with Adam and binary cross-entropy and reports
  accuracy, precision and recall while fitting.

  NOTE(review): the default handles are English uncased checkpoints, while the
  dataset preview in this notebook shows Russian text. A multilingual
  preprocess/encoder pair can be supplied through ``preprocess_url`` and
  ``encoder_url``.
  NOTE(review): the hub layers are created without ``trainable=True``, so
  presumably only the dropout/dense head is trained — confirm against the
  tensorflow_hub documentation.

  Args:
    X: object exposing the input strings as ``X.text`` (e.g. a DataFrame
      with a ``text`` column).
    y: binary targets aligned with ``X.text``.
    preprocess_url: TF Hub handle of the text preprocessing model.
    encoder_url: TF Hub handle of the BERT encoder matching ``preprocess_url``.
    epochs: number of training epochs.
    dropout_rate: dropout rate applied to the pooled encoder output.

  Returns:
    The fitted ``tf.keras.Model``; ``predict`` yields one sigmoid score per
    input string.
  """
  bert_preprocess = hub.KerasLayer(preprocess_url)
  bert_encoder = hub.KerasLayer(encoder_url)

  # Scalar string input: each example is one raw text.
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessed_text = bert_preprocess(text_input)
  outputs = bert_encoder(preprocessed_text)

  pooled = tf.keras.layers.Dropout(dropout_rate, name="dropout")(outputs['pooled_output'])
  probability = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(pooled)

  model = tf.keras.Model(inputs=[text_input], outputs=[probability])

  metrics = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
  ]

  model.compile(optimizer='adam',
    loss='binary_crossentropy',
    metrics=metrics)

  model.fit(X.text, y, epochs=epochs)

  return model

In [None]:
# Train the DataScientist classifier on the merged text of the undersampled set.
# NOTE(review): this fits on the whole undersampled frame, not on X_train_ds / y_train_ds,
# so the train/test split made earlier is not used as a hold-out here.
model_text_ds = Bert_model(X_text_ds_undersampling, y_ds_undersampling)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import joblib

# Persist the fitted model to disk with joblib.
# NOTE(review): joblib pickles the object; Keras also has its own `model.save(...)` format,
# which may be the more portable choice for a model containing TF Hub layers — confirm
# before relying on this file.
filename = "bert_text_ds_model.joblib"
joblib.dump(model_text_ds, filename)

['bert_text_ds_model.joblib']

# Evaluation on the full dataset

In [None]:
# Merge the text columns of the complete dataset into a single `text` column for prediction.
test_data = concating_features(text_features, data_df[text_features])


In [None]:
# Display the merged text frame (pandas truncates the middle rows in the rendered output).
test_data

Unnamed: 0,text
0,Academic IELTS 8.0 [SEP] Работа над проектом I...
1,Имеется опыт в создании Android-приложений на ...
2,Ключевые знания и навыки:\nКачественный анализ...
3,"Энергичность, быстрая обучаемость, открытость...."
4,ОТВЕЧАЮ ТОЛЬКО В МЕССЕНДЖЕРАХ ИЛИ НА ПОЧТУ!!!\...
...,...
9557,"Семейное положение: женат, есть дети \r\nЖелае..."
9558,"Коммуникабельность, стрессоустойчивость, работ..."
9559,"Коммуникабельная, гибкая, целеустремленная, ис..."
9560,- Знаю участки работы универсального коммерчес...


In [None]:
# Score every row of the full dataset and collapse the model output to a 1-D array.
y_predicted = model_text_ds.predict(test_data.text).flatten()



In [None]:
# Convert sigmoid scores to hard 0/1 labels: strictly above 0.5 counts as positive.
check_y = (y_predicted > 0.5).astype(int)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred): ground-truth labels first, predictions second.
# NOTE(review): data_df also contains the rows the model was fitted on, so this is not a held-out score.
print(accuracy_score(data_df['DataScientist'], check_y))

0.6729763647772432


In [None]:
# scikit-learn metrics take (y_true, y_pred): ground-truth labels first, predictions second.
print(f1_score(data_df['DataScientist'], check_y))

0.03338485316846986


In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the two values
# trade places, so precision was reported as recall and vice versa.
print(precision_score(data_df['DataScientist'], check_y))
print(recall_score(data_df['DataScientist'], check_y))

0.4864864864864865
0.017285531370038413


In [None]:
import os

# Start from the existing predictions file when there is one. On a fresh run, fall back
# to an empty frame so the cell does not fail with FileNotFoundError.
results = pd.read_csv('result.csv') if os.path.exists('result.csv') else pd.DataFrame()

In [None]:
# Store the raw sigmoid scores (not the thresholded labels) in the `ds` column.
# NOTE(review): assumes `results` has one row per row of data_df, in the same order — confirm.
results['ds'] = y_predicted

In [None]:
# Write the updated table back to the same file it was read from, without the index column.
results.to_csv('result.csv', index=False)