In [1]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras import metrics, regularizers
from keras.preprocessing import sequence
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#Load cleaned dataset
data = pd.read_csv('short_dataset_final.csv', header = 0, names = ['Job_Title', 'Description'])
#data = pd.read_csv('../../Results/Cleaned_JobsNonIT.csv', header = 0, names = ['Query', 'Description'])
data.columns

Index(['Job_Title', 'Description'], dtype='object')

In [3]:
data


Unnamed: 0,Job_Title,Description
0,Data Scientist,month
1,Data Scientist,study transform data science prototype seeking...
2,Data Scientist,work data engineer deploy model build data pip...
3,Data Scientist,data wrangling data cleansing experience uk wo...
4,Data Scientist,identifying pattern trend large data set suppo...
...,...,...
10495,UI UX Designer,joining team including senior developer ux ui ...
10496,UI UX Designer,boeing company currently seeking high performi...
10497,UI UX Designer,passionate creating highly functional beautifu...
10498,UI UX Designer,patientmd integrated mobile platform informati...


In [4]:
#Split the dataset to Training and Test subsets (90/10)
train, test = train_test_split(data, test_size = 0.1, random_state = 17) #random_state = None

train_descs = train['Description']
train_labels = train['Job_Title']
 
test_descs = test['Description']
test_labels = test['Job_Title']

In [5]:
total_jobs=pd.read_csv('job_list_final.csv',header=0,names=['Job_Titles'])

In [6]:
total_jobs

Unnamed: 0,Job_Titles
0,Data Scientist
1,Data Analyst
2,Database Administrator
3,Business Analyst
4,Machine Learning
5,Artificial Intelligence
6,Deep Learning
7,Big Data Engineer
8,Cloud Services Developer
9,Full Stack Developer


In [14]:
# Model Parameters
vocab_size = 1000

sequences_length = 1200

embedding_dimensionality = 64 #possibly low??
max_features = 2000 #equal to vocab_size

num_labels = len(train_labels.unique())
batch_size = 32
nb_epoch = 20

nof_filters = 200
kernel_size = 16

hidden_dims = 512

In [15]:
# Convert Texts to Numeric Vectors for Input
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(train_descs)

x_train = tokenizer.texts_to_sequences(train_descs)
x_test = tokenizer.texts_to_sequences(test_descs)

x_train = sequence.pad_sequences(x_train, maxlen = sequences_length, padding = 'post')
x_test = sequence.pad_sequences(x_test, maxlen = sequences_length, padding = 'post')

encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

In [16]:
model = Sequential()
model.add(Embedding(max_features, embedding_dimensionality, input_length = 1200))
#model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

model.add(Conv1D(nof_filters, kernel_size, padding='valid', activation='relu', strides = 1))
model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.3))
model.add(Activation('relu'))

model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', #'sgd', 'adam', 'RMSprop', 'Adagrad'
                   metrics = [metrics.categorical_accuracy])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1200, 64)          128000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1185, 200)         205000    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               102912    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
activation_1 (Activat

In [17]:
history = model.fit(x_train, y_train,
                    batch_size = batch_size,
                    epochs = nb_epoch,
                    verbose = True,
                    validation_split = 0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 7560 samples, validate on 1890 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
score = model.evaluate(x_test, y_test, batch_size = batch_size, verbose = True)
 
print('\nTest categorical_crossentropy:', score[0])
print('Categorical accuracy:', score[1])



Test categorical_crossentropy: 1.5374882443745932
Categorical accuracy: 0.7247619049889701


In [11]:
# summarize history for accuracy
import matplotlib.pyplot as plt

plt.plot(history.history['categorical_accuracy'])
plt.plot(history.history['val_categorical_accuracy'])
plt.title('model accuracy')
plt.ylabel('classification accuracy')
plt.xlabel('epochs')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

<Figure size 640x480 with 1 Axes>

In [19]:
model.save('recommend.h5')

In [20]:
from keras.models import load_model
#see here----------------------------
text_model=load_model('recommend.h5')

In [63]:
x_test

array([[808,  50, 173, ...,   0,   0,   0],
       [  4,   9,  80, ...,   0,   0,   0],
       [ 29, 958,  29, ...,   0,   0,   0],
       ...,
       [547,  38,  49, ...,   0,   0,   0],
       [764, 343, 105, ...,   0,   0,   0],
       [661,  42, 254, ...,   0,   0,   0]])

In [64]:
res=model.predict(x_test)

In [28]:
res

array([[1.6886569e-09, 5.8988559e-07, 4.0312923e-04, ..., 7.1793896e-05,
        4.4700514e-06, 6.1989781e-03],
       [1.5714272e-06, 4.9058631e-06, 2.0337593e-05, ..., 2.0316073e-09,
        2.1142739e-06, 4.9302521e-08],
       [2.8384868e-05, 1.4584553e-08, 7.9045464e-07, ..., 6.6800858e-05,
        1.0763492e-05, 1.8184998e-09],
       ...,
       [9.1106653e-05, 5.9715647e-05, 1.6784282e-06, ..., 9.5531675e-07,
        1.0373650e-05, 5.1941900e-09],
       [3.4031687e-05, 9.4583791e-01, 1.3826541e-10, ..., 1.4943292e-07,
        8.4634331e-07, 3.4342150e-04],
       [8.7444996e-03, 1.3082624e-05, 5.0020446e-08, ..., 5.8264559e-06,
        1.7407619e-06, 1.3709583e-05]], dtype=float32)

In [None]:
#---------------------see from here----------------

In [38]:
query=['machine learning']

In [39]:
query = tokenizer.texts_to_sequences(query)

#x_train = sequence.pad_sequences(x_train, maxlen = sequences_length, padding = 'post')
query = sequence.pad_sequences(query, maxlen = sequences_length, padding = 'post')

#encoder = LabelBinarizer()
#encoder.fit(train_labels)
#y_train = encoder.transform(train_labels)
#y_test = encoder.transform(test_labels)

In [40]:
res=model.predict(query)

In [41]:
len(res)

1

In [42]:
res

array([[0.10209953, 0.01700058, 0.0166914 , 0.01403608, 0.01123492,
        0.0252146 , 0.03481248, 0.06165478, 0.02760767, 0.14033528,
        0.01925051, 0.04019922, 0.02152535, 0.30851576, 0.07276127,
        0.01825023, 0.01392811, 0.01866319, 0.01753668, 0.00661737,
        0.0120649 ]], dtype=float32)

In [26]:
train_titles=list(train_labels)
title_indices=y_train.argmax(axis=-1)               
title_indices=list(title_indices)                  

In [27]:
result=[]
for i in range(0,22):           #this model had 22 jobs 
    for j in range(len(title_indices)):
        if(title_indices[j])==i:
            result.append(train_titles[j])
            break

In [28]:
result

['Artificial Intelligence',
 'Big Data Engineer',
 'Business Analyst',
 'Civil Engineer',
 'Cloud Services Developer',
 'Cybersecurity Engineer',
 'Data Analyst',
 'Data Scientist',
 'Database Administrator',
 'Deep Learning',
 'Electrical Engineer',
 'Electronics Engineer',
 'Full Stack Developer',
 'Machine Learning',
 'Mechanical Engineer',
 'Mechatronics Engineer',
 'Network Architect',
 'Robotics',
 'Software Developer',
 'UI UX Designer',
 'VLSI Engineer']

In [43]:
pred_array=[]
for l in range(len(res)):
    indices=[]
    pqr=list(res[l])
    for j in range(0,3):
        i=pqr.index(max(pqr))
        indices.append(result[i])
        pqr[i]=0;
    pred_array.append(indices)    

In [44]:
pred_array

[['Machine Learning', 'Deep Learning', 'Artificial Intelligence']]

In [None]:
#--------------------------------------till here--------------------------