In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

import seaborn as sns
import re

## Import and Cleaning Dataset

In [2]:
resumeDataSet = pd.read_csv('resume_dataset.csv' ,encoding='utf-8')
resumeDataSet['cleaned_resume'] = ''
resumeDataSet.head()

ParserError: ignored

In [5]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))

## Vectorize Word

In [6]:
requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500)
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)

print ("Feature completed .....")

X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=0, test_size=0.2)
print(X_train.shape)
print(X_test.shape)

Feature completed .....
(135, 1500)
(34, 1500)


## KNN

In [7]:
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

print("\n Classification report for classifier %s:\n%s\n" % (clf, metrics.classification_report(y_test, prediction)))

Accuracy of KNeighbors Classifier on training set: 0.88
Accuracy of KNeighbors Classifier on test set: 0.79

 Classification report for classifier OneVsRestClassifier(estimator=KNeighborsClassifier()):
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         1
                     Arts       0.00      0.00      0.00         1
       Automation Testing       1.00      0.50      0.67         2
               Blockchain       1.00      1.00      1.00         1
           Civil Engineer       1.00      1.00      1.00         1
             Data Science       1.00      1.00      1.00         3
                 Database       0.50      1.00      0.67         1
         DotNet Developer       1.00      1.00      1.00         4
   Electrical Engineering       1.00      0.33      0.50         3
                   Hadoop       1.00      1.00      1.00         2
       Health and fitness       1.00      0.67      0.80    

In [8]:
import pickle

with open('model_pkl', 'wb') as f:
  pickle.dump(clf, f)

In [9]:
with open('model_pkl', 'rb') as f:
  clf2 = pickle.load(f)

In [10]:
sample = requiredText[30]
sample

' Hard working Quick learnerEducation Details June 2014 to May 2017 LLB LAW Mumbai Maharashtra mumbai university January 2014 B Com Commerce Mumbai Maharashtra Mumbai university January 2011 HSC Maharashtra board January 2009 SSC Maharashtra boardAdvocateSkill Details Company Details company The vidishtra description '

In [11]:
feature = word_vectorizer.transform([sample])

In [12]:
clf.predict(feature)

array(['Arts'], dtype='<U25')

In [13]:
requiredTarget[30]

'Advocate'

In [14]:
with open('vectorizer_pkl', 'wb') as f:
  pickle.dump(word_vectorizer, f)

In [15]:
with open('vectorizer_pkl', 'rb') as f:
  v = pickle.load(f)

In [16]:
clf.predict(v.transform([sample]))[0]

'Arts'

## Model Development using Tensorflow

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GlobalAveragePooling1D

In [18]:
# Ambil data dan encode label
data = resumeDataSet['cleaned_resume']
labels = resumeDataSet['Category']

le = LabelEncoder()
labels = le.fit_transform(labels)

labels = to_categorical(labels, 25)

In [19]:
# split data
training_size = 135

# Split the data
training_data = data[0:training_size]
testing_data = data[training_size:]

# Split the labels
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [20]:
vocab_size = 10000
max_length = 500
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary
tokenizer.fit_on_texts(training_data)
word_index = tokenizer.word_index

# Generate and pad the training sequences
training_sequences = tokenizer.texts_to_sequences(training_data)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Generate and pad the testing sequences
testing_sequences = tokenizer.texts_to_sequences(testing_data)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert the labels lists into numpy arrays
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [21]:
training_padded[0]

array([  44,  253,  307,   73, 1117,  805, 2648, 1461,  609, 1462,   34,
         46,  181,  200,  141,  128,  308, 1270, 2033, 2649, 2650, 1698,
       2651, 3784,  570, 3785, 3786,  806,  740,  106,  571, 2652,  897,
        106, 1118,  467,  468, 2653, 1699, 1119,  807, 2654, 2655, 3787,
        687, 3788,   27, 2656,  114, 2034, 3789, 3790, 3791, 2657,  469,
       1700,  469, 3792, 2035, 1462, 3793,  637, 1271,  470, 3794,  195,
        377,  610,   24, 3795, 3796,   73, 1701,  638, 3797,  104, 1702,
        688, 1703,    2,  233,    4,  471,  128,   51,   14,   22,  100,
        995, 3798,  100,  995, 1704, 2036, 1705, 3799,   14,  181,   10,
        165,   15,  200,   10,  165,   15,   73,   10,  165,   64,   14,
          9, 2036, 1705, 1463,   13, 1120, 2658,    2, 3800,   65,  995,
        117, 2037,  215, 2659,  117, 2037,  215, 2038,    6,   11, 3801,
          3,  215,   76,    2, 1706,  254,    2,  409,   60,  266,  808,
          4,   11,   20, 2660,    6,  176,  410,  2

In [22]:
# Parameters
embedding_dim = 100
lstm_dim = 100
dense_dim = 100

# Model Definition with LSTM
model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(25, activation='softmax')
])

# Set the training parameters
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 bidirectional (Bidirectiona  (None, 200)              160800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 100)               20100     
                                                                 
 dense_1 (Dense)             (None, 25)                2525      
                                                                 
Total params: 1,183,425
Trainable params: 1,183,425
Non-trainable params: 0
_________________________________________________________________


In [23]:
NUM_EPOCHS = 50

# Train the model
history_lstm = model_lstm.fit(training_padded, training_labels, epochs=NUM_EPOCHS, validation_data=(testing_padded, testing_labels))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
prediksi = model_lstm.predict(testing_padded)



In [25]:
prediksi[0]

array([2.54785100e-05, 1.05231295e-04, 2.96918879e-04, 5.97009659e-07,
       9.39074438e-04, 8.20373818e-02, 5.14496758e-04, 1.43140424e-02,
       4.16411683e-02, 2.77302483e-08, 1.54275503e-09, 7.02387318e-02,
       6.90898742e-05, 3.04471115e-07, 8.19922483e-04, 2.25306638e-02,
       5.28381788e-05, 1.44867081e-05, 5.18996217e-07, 5.65519382e-04,
       1.07761263e-03, 6.15180075e-01, 8.87991413e-02, 1.23808121e-07,
       6.07765764e-02], dtype=float32)

In [31]:
model_lstm.predict(testing_padded[0].reshape(1, 500))



array([[2.5478532e-05, 1.0523138e-04, 2.9691856e-04, 5.9700960e-07,
        9.3907438e-04, 8.2037374e-02, 5.1449670e-04, 1.4314052e-02,
        4.1641124e-02, 2.7730193e-08, 1.5427548e-09, 7.0238613e-02,
        6.9089736e-05, 3.0447109e-07, 8.1992318e-04, 2.2530630e-02,
        5.2838273e-05, 1.4486679e-05, 5.1899667e-07, 5.6551909e-04,
        1.0776136e-03, 6.1518002e-01, 8.8799275e-02, 1.2380823e-07,
        6.0776625e-02]], dtype=float32)

In [32]:
model_lstm.save('model_lstm.h5')