## JOB SECTOR PREDICTION

#### LIBRARIES

In [245]:
import re
import pickle
from PyPDF2 import PdfReader
import numpy as np

#### EXTRACT TEXT FROM PDF

In [246]:
# FILE LOCATION
# Bind the path rather than an open handle: PdfReader accepts a path string and
# opens/closes the file itself, so no file descriptor is left open at module level.
file = 'Sales-Executive-Resume.pdf'

In [247]:
# PDF EXTRACTION
def extract_pdf(file_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        file_path: Handed straight to ``PdfReader``; a filesystem path or an
            open binary file object positioned at the start of the PDF.

    Returns:
        One string containing each page's extracted text in page order,
        with a single space between consecutive pages.
    """
    pdf_reader = PdfReader(file_path)
    # extract_text() already yields one string per page, so no per-page join is
    # needed; `or ''` keeps the outer join safe if a page yields nothing.
    return ' '.join(page.extract_text() or '' for page in pdf_reader.pages)

#### LOADING THE MODEL, VECTORIZER & LABELS

In [248]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must come from a trusted source.
def _unpickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Loaded in the same order as before: vectorizer, classifier, label mapping.
vec = _unpickle('vectorizer_model.pkl')
loaded_model = _unpickle('rf_model.pkl')
labels = _unpickle('labels.pkl')

#### TEXT PRE-PROCESSING

In [249]:
# Compiled once at import time. Raw strings are used throughout so that `\]`
# and `\s` reach the regex engine verbatim instead of relying on Python's
# deprecated handling of invalid string escapes.
_PUNCT_OR_NON_ASCII = re.compile(
    r'[%s]|[^\x00-\x7f]' % re.escape(r"""!"#$%&'()*+,-/.:;<=>?@[\]^_`{|}~""")
)
_WHITESPACE_RUN = re.compile(r'\s+')


def text_cleaning(Text):
    """Normalize raw text before it is passed to the vectorizer.

    Steps:
      1. Replace every ASCII punctuation character and every non-ASCII
         character with a space.
      2. Collapse each run of whitespace (including ``\\r\\n`` line breaks)
         into a single space.
      3. Lower-case the result.

    Leading/trailing spaces are not stripped, so a single space may remain at
    either end.

    Args:
        Text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Both replacements map to a space, so one combined pass gives the same
    # result as substituting punctuation and non-ASCII characters separately.
    Text = _PUNCT_OR_NON_ASCII.sub(' ', Text)
    # Line breaks are whitespace, so this also covers the former '\r\n' step.
    Text = _WHITESPACE_RUN.sub(' ', Text)
    return Text.lower()

#### PREDICTION

In [250]:
def prediction(data):
    """Print the five most probable labels for *data* and return their indices.

    The text is cleaned with ``text_cleaning``, vectorized with ``vec`` and
    scored with ``loaded_model.predict_proba``.

    Args:
        data: Raw text to classify.

    Returns:
        A 2-D integer array with one row, holding the column indices of the
        (up to) five highest probabilities, most probable first.
    """
    cleaned = text_cleaning(data)
    features = vec.transform([cleaned]).toarray()
    probabilities = loaded_model.predict_proba(features)

    # argsort is ascending: flip each row so the best score comes first, then
    # keep the leading five columns.
    classes = np.argsort(probabilities)[:, ::-1][:, :5]

    # `labels` maps name -> code; invert it to look names up by code.
    # NOTE(review): this treats a probability column index as the label code,
    # i.e. it assumes loaded_model.classes_ is 0..n-1 - confirm.
    label_map = {code: name for name, code in labels.items()}
    top5_predictions = [label_map[code] for code in classes[0]]

    print('\nTOP FIVE PREDICTED RESULTS\n -> ', top5_predictions)
    print('\nTOP MOST PREDICTED RESULTS\n -> ', top5_predictions[0])

    return classes

In [251]:
# Extract the text of all pages from the PDF referenced by `file`.
extract_pdf_data = extract_pdf(file)

In [252]:
# Prints the top-five labels; the returned index array is echoed as the cell output.
prediction(extract_pdf_data)


TOP FIVE PREDICTED RESULTS
 ->  ['Sales', 'Healthcare', 'Customer service', 'Retail', 'Marketing']

TOP MOST PREDICTED RESULTS
 ->  Sales


array([[24, 10,  4, 23, 17]], dtype=int64)