In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras import Model
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPU


In [None]:
tf.device('/device:GPU:2')

<tensorflow.python.eager.context._EagerDeviceContext at 0x7f7bb5062e40>

In [None]:
data_df = pd.read_csv('label_preprocessed.csv')
data_df = data_df.drop(['URL'], axis=1)

In [None]:
data_df

Unnamed: 0,ExpPeriod,Salary,Age,Gender,WorkType,WorkSchedule,N_places,LastWorkDesc,LastWorkDesc2,Top 10 work,...,Description,Python,SQL,Power BI,N_Unis,Top 10 Uni,top_city,EduLevelCat,DataAnalyst,DataScientist
0,-1.048106,-0.032825,-0.851000,2.0,1.0,0.0,-0.311920,работ проект image реcогнитионобеспечен полнот...,работ отдел nka бизнес детск питанияформирован...,0.0,...,academic ielts 80,1.0,1.0,1.0,3.785271,0.0,1.0,2.0,1.0,0.0
1,-1.216159,-0.032825,-1.144948,2.0,1.0,0.0,-0.974429,доработк оптимизац сqлзапрос сбор витрин дан,работ big datapythonsqldbt yandex datalens exc...,0.0,...,имеет оп создан андроидприложен kotlin вебсерв...,1.0,1.0,0.0,-0.692166,0.0,1.0,1.0,1.0,0.0
2,-0.593374,-0.032825,0.104330,2.0,1.0,0.0,0.350588,работ аналитическ отчетностьюосновн этап работ...,администрирован систем управлен звонкамиразраб...,0.0,...,ключев знан навыкикачествен анализ дан разрабо...,1.0,1.0,1.0,-0.692166,0.0,0.0,1.0,1.0,1.0
3,-0.375894,-0.032825,-0.557052,3.0,1.0,0.0,-0.643175,девелопмент техподдержк bi сист приложен платф...,расчет автоматизац расчет заработн плат прем л...,0.0,...,энергичн быстр обучаем открыт хобб спорт путеш...,0.0,1.0,1.0,0.427193,0.0,1.0,2.0,1.0,0.0
4,-1.067877,-0.032825,-0.924487,2.0,1.0,0.0,-0.643175,проведен исследован отток клиент создан витрин...,выполнен adhoc запрос руководств подготовк отч...,0.0,...,отвеча мессенджер почтуинтерес анализ дан им о...,1.0,1.0,1.0,1.546553,0.0,1.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,0.167807,-0.032825,-0.263104,0.0,1.0,0.0,0.350588,написан бизнестребован согласн поставлен задач...,формирован ежедневн ежемесячн ежеквартальн отч...,0.0,...,семейн положен женат дет жела график работ 52,0.0,0.0,0.0,-0.692166,0.0,1.0,1.0,0.0,0.0
9558,0.909217,-0.032825,-0.263104,1.0,1.0,0.0,-1.305683,анализ сведен,,0.0,...,коммуникабельн стрессоустойчив работоспособн,0.0,0.0,0.0,-0.692166,0.0,1.0,1.0,0.0,0.0
9559,-0.148528,-0.032825,-0.263104,1.0,1.0,0.0,0.350588,обязанностиобработк входя звонк сообщен кандид...,мо обязан вход ежедневн обработк обзвон отклик...,0.0,...,коммуникабельн гибк целеустремлен исполнительн...,0.0,0.0,0.0,-0.692166,0.0,0.0,1.0,0.0,0.0
9560,1.156354,-0.032825,0.471765,1.0,1.0,0.0,1.013097,• подготовк отчетн мсфо иностра компан юрисдик...,организац работ подготовк управленческ отчетн ...,0.0,...,зна участк работ универсальн коммерческ банк в...,0.0,0.0,0.0,0.427193,0.0,1.0,2.0,0.0,0.0


# Preparing Data Text - Unbalanced

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

In [None]:
def concating_features(features, df):
  sen_w_feats = []
  df = df.fillna("")

  for (i, row) in df.iterrows():
    combined = ''

    for col_name in features:
      combined += (str(row[col_name]))
    
    sen_w_feats.append(combined)

  df['text'] = sen_w_feats
  df = df.drop(features, axis=1)
  return df

In [None]:
X = data_df.drop(['DataAnalyst', 'DataScientist'], axis=1)
y = data_df[['DataAnalyst', 'DataScientist']]

In [None]:
text_features = ['Description', 'LastWorkDesc', 'LastWorkDesc2']

In [None]:
data_full = concating_features(text_features, data_df)

In [None]:
X = concating_features(text_features, X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [None]:
tokenizer.fit_on_texts(X_train['text'].values)

In [None]:
X_train_text = tokenizer.texts_to_sequences(X_train['text'].values)
X_train_text = tf.keras.utils.pad_sequences(X_train_text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train_text.shape)

Shape of data tensor: (7171, 250)


In [None]:
X_test_text = tokenizer.texts_to_sequences(X_test['text'].values)
X_test_text = tf.keras.utils.pad_sequences(X_test_text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_test_text.shape)

Shape of data tensor: (2391, 250)


In [None]:
class_weights_da = class_weight.compute_class_weight(class_weight='balanced', classes = np.unique(y_train['DataAnalyst'].values), y=y_train['DataAnalyst'].values)
class_weights_ds = class_weight.compute_class_weight(class_weight='balanced', classes = np.unique(y_train['DataScientist'].values), y=y_train['DataScientist'].values)

In [None]:
y_da_train = tf.convert_to_tensor(y_train['DataAnalyst'].values, dtype=tf.int64)
y_ds_train = tf.convert_to_tensor(y_train['DataScientist'].values)

y_da_test = tf.convert_to_tensor(y_test['DataAnalyst'].values, dtype=tf.int64)
y_ds_test = tf.convert_to_tensor(y_test['DataScientist'].values)

In [None]:
X_num_train = tf.convert_to_tensor(X_train.drop(['text'], axis=1).values, dtype=tf.float64)
X_num_test = tf.convert_to_tensor(X_test.drop(['text'], axis=1).values, dtype=tf.float64)

# Undersampling

In [None]:
df_da_1 = data_full[data_full['DataAnalyst']==1]
df_da_0 = data_full[data_full['DataAnalyst']==0]
df_da_0 = df_da_0.sample(len(df_da_1))

df_da_undersampling = df_da_0.append(df_da_1)
df_da_undersampling = df_da_undersampling.sample(frac=1).reset_index(drop=True)
df_da_undersampling = df_da_undersampling.drop(['DataScientist'], axis=1)

X_da_undersampling, y_da_undersampling = df_da_undersampling.drop(['DataAnalyst'], axis=1), df_da_undersampling['DataAnalyst']

  df_da_undersampling = df_da_0.append(df_da_1)


In [None]:
df_ds_1 = data_full[data_full['DataScientist']==1]
df_ds_0 = data_full[data_full['DataScientist']==0]
df_ds_0 = df_ds_0.sample(len(df_ds_1))

df_ds_undersampling = df_ds_0.append(df_ds_1)
df_ds_undersampling = df_ds_undersampling.sample(frac=1).reset_index(drop=True)
df_ds_undersampling = df_ds_undersampling.drop(['DataAnalyst'], axis=1)

X_ds_undersampling, y_ds_undersampling = df_ds_undersampling.drop(['DataScientist'], axis=1), df_ds_undersampling['DataScientist']

  df_ds_undersampling = df_ds_0.append(df_ds_1)


In [None]:
X_full_text = tokenizer.texts_to_sequences(data_full['text'].values)
X_full_text = tf.keras.utils.pad_sequences(X_full_text, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_full_text.shape)

Shape of data tensor: (9562, 250)


In [None]:
X_da_text_under = tokenizer.texts_to_sequences(X_da_undersampling['text'].values)
X_da_text_under = tf.keras.utils.pad_sequences(X_da_text_under, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_da_text_under.shape)

Shape of data tensor: (1374, 250)


In [None]:
X_ds_text_under = tokenizer.texts_to_sequences(X_ds_undersampling['text'].values)
X_ds_text_under = tf.keras.utils.pad_sequences(X_ds_text_under, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_ds_text_under.shape)

Shape of data tensor: (222, 250)


In [None]:
y_da_undersampling = tf.convert_to_tensor(y_da_undersampling, dtype=tf.int64)
y_ds_undersampling = tf.convert_to_tensor(y_ds_undersampling, dtype=tf.int64)

y_full_da = tf.convert_to_tensor(data_full['DataAnalyst'].values, dtype=tf.int64)
y_full_ds = tf.convert_to_tensor(data_full['DataScientist'].values)

In [None]:
X_da_num_under = tf.convert_to_tensor(X_da_undersampling.drop(['text'], axis=1).values, dtype=tf.float64)
X_ds_num_under = tf.convert_to_tensor(X_ds_undersampling.drop(['text'], axis=1).values, dtype=tf.float64)
X_num_full = tf.convert_to_tensor(data_full.drop(['text', 'DataAnalyst', 'DataScientist'], axis=1).values, dtype=tf.float64)

# Model

In [None]:

nlp_input = Input(shape=(250, )) 
meta_input = Input(shape=(21,))
emb = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train_text.shape[1])(nlp_input) 
nlp_out = Bidirectional(LSTM(128))(emb) 
concat = tf.keras.layers.Concatenate(axis=1)([nlp_out, meta_input]) 
classifier = Dense(32, activation='relu')(concat) 
output = Dense(1, activation='sigmoid')(classifier) 
model = Model(inputs=[nlp_input , meta_input], outputs=[output])

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS)

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 250)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 250, 100)     5000000     ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 256)          234496      ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                              

## Unbalanced

In [None]:
model.fit([X_train_text, X_num_train], y_da_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b18f3e620>

In [None]:
y_pred_da = model.predict([X_test_text, X_num_test])



In [None]:
y_pred_da = y_pred_da.flatten()

In [None]:
check_da = np.where(y_pred_da > 0.5, 1, 0)

In [None]:
accuracy_score(check_da, y_da_test)

0.9268088665830196

In [None]:
f1_score(check_da, y_da_test)

0.3346007604562738

In [None]:
print(precision_score(check_da, y_da_test))
print(recall_score(check_da,y_da_test ))

0.23655913978494625
0.5714285714285714


In [None]:
model.fit([X_train_text, X_num_train], y_ds_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b1ff67190>

In [None]:
y_pred_ds = model.predict([X_test_text, X_num_test])
y_pred_ds = y_pred_da.flatten()



In [None]:
check_ds = np.where(y_pred_da > 0.5, 1, 0)

In [None]:
accuracy_score(check_ds, y_ds_test)

0.9657047260560435

In [None]:
f1_score(check_ds, y_ds_test)

0.22641509433962265

In [None]:
print(precision_score(check_ds, y_ds_test))
print(recall_score(check_ds,y_ds_test ))

0.41379310344827586
0.15584415584415584


## Undersampling

In [None]:
model.fit([X_da_text_under, X_da_num_under], y_da_undersampling, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b1f3361a0>

In [None]:
y_pred_da_under = model.predict([X_full_text, X_num_full])



In [None]:
y_pred_da_under = y_pred_da_under.flatten()
check_da_under = np.where(y_pred_da_under > 0.5, 1, 0)

In [None]:
accuracy_score(check_da_under, y_full_da)

0.8985567872829952

In [None]:
f1_score(check_da_under, y_full_da)

0.5858240819812126

In [None]:
print(precision_score(check_da_under, y_full_da))
print(recall_score(check_da_under, y_full_da))

0.9985443959243085
0.41450151057401813


In [None]:
model.fit([X_ds_text_under, X_ds_num_under], y_ds_undersampling, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b1f335c30>

In [None]:
y_pred_ds_under = model.predict([X_full_text, X_num_full])



In [None]:
y_pred_ds_under = y_pred_ds_under.flatten()
check_ds_under = np.where(y_pred_ds_under > 0.5, 1, 0)

In [None]:
accuracy_score(check_ds_under, y_full_ds)

0.9188454298263962

In [None]:
f1_score(check_ds_under, y_full_ds)

0.22244488977955915

In [None]:
print(precision_score(check_ds_under, y_full_ds))
print(recall_score(check_ds_under, y_full_ds))

1.0
0.12514092446448705


## Oversampling

In [None]:
X_vectorized = np.concatenate((X_train_text, X_train.drop(['text'], axis=1).values), axis=1)

In [None]:
X_vectorized.shape

(7171, 271)

In [None]:
sm = SMOTE(random_state=42)
X_da_oversampling, y_da_oversampling = sm.fit_resample(X_vectorized, y_da_train)
X_ds_oversampling, y_ds_oversampling = sm.fit_resample(X_vectorized, y_ds_train)

In [None]:
X_da_over_text = np.array([ele[:-21] for ele in X_da_oversampling])
X_ds_over_text = np.array([ele[:-21] for ele in X_ds_oversampling])

X_da_over_num = np.array([ele[250:] for ele in X_da_oversampling])
X_ds_over_num = np.array([ele[250:] for ele in X_ds_oversampling])

In [None]:
model.fit([X_da_over_text, X_da_over_num], y_da_oversampling, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b1f3cd3c0>

In [None]:
y_pred_da_over = model.predict([X_test_text, X_num_test])
y_pred_da_over = y_pred_da_over.flatten()
check_da_over = np.where(y_pred_da_over > 0.5, 1, 0)



In [None]:
check_da_over

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
accuracy_score(check_da_over, y_da_test)

0.8578000836470097

In [None]:
f1_score(check_da_over, y_da_test)

0.43521594684385384

In [None]:
print(precision_score(check_da_over, y_da_test))
print(recall_score(check_da_over, y_da_test))

0.7043010752688172
0.31490384615384615


In [None]:
model.fit([X_ds_over_text, X_ds_over_num], y_ds_oversampling, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7b1f3cee90>

In [None]:
y_pred_ds_over = model.predict([X_test_text, X_num_test])
y_pred_ds_over = y_pred_ds_over.flatten()
check_ds_over = np.where(y_pred_ds_over > 0.5, 1, 0)



In [None]:
accuracy_score(check_ds_over, y_ds_test)

0.9167712254286909

In [None]:
f1_score(check_ds_over, y_ds_test)

0.11555555555555556

In [None]:
print(precision_score(check_ds_over, y_ds_test))
print(recall_score(check_ds_over, y_ds_test))

0.4482758620689655
0.0663265306122449
