# Analyse job offer text
*created on blog by Charlie Greenbacker [@greenbacker](https://twitter.com/greenbacker)*

In [94]:
import nltk
from bs4 import BeautifulSoup
import re
import pandas as pd
import string

## Extracting text from HTML

In [106]:
from selenium import webdriver 
import os

url = "https://www.linkedin.com/jobs/view/892126521/"

PROJECT_ROOT = os.path.abspath(os.path.dirname('__file__'))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "chromedriver")

driver = webdriver.Chrome(executable_path = DRIVER_BIN)

driver.get(url)

driver.implicitly_wait(20) # seconds
element = driver.find_element_by_xpath('//*[@id="summary-detail"]/div/div/div/div[1]/div')


elementHTML = element.get_attribute('innerHTML')
elementHTML = BeautifulSoup(elementHTML)

text = elementHTML.get_text()
text = text.replace('\n', ' ')
text = text.replace('(', ' ')
text = text.replace(')', ' ')
text = text.replace('/', ' ')
text = re.sub(' +',' ', text)

#input('Press ENTER to close the automated browser')

driver.quit()
print('done')
print('---------------------------------------------')
print('---------------Job Description---------------')
print('---------------------------------------------')
print(text)

done
---------------------------------------------
---------------Job Description---------------
---------------------------------------------
Job descriptionOur Data Science team is divided into six key areas: Discovery, Supply Chain, Retail, Customer Care, Marketing and Core. We have a highly innovative team working on interesting and difficult problems, leveraging cutting edge technology to build solutions and customer facing products. Our team have a direct impact on shaping the future of how customers shop on ASOS, and help shape key business decisions and activities.And yeah… we are pretty cutting edge with the technology we use here: GPU computing with Tensorflow and Keras. Distributed systems with pySpark and Databricks. Cloud computing with Azure: Docker, Kubernetes, data lake, CosmosDB.The team have recently also published papers, check them out here: Customer Lifetime Value Prediction with Embeddings, SIGKDD 17. Generalising Random Forest Parameter Optimisation to Include St

## Frequency Analysis

In [107]:
#download nltk attribute if needed
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/roberthommes/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

#### tokenization

In [124]:
filtered_text = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

#### stopwords

In [125]:
from nltk.corpus import stopwords

filtered_text = [word for word in filtered_text if word not in stopwords.words('english')]

#### punctuation

In [127]:
filtered_text = [''.join(c for c in s if c not in string.punctuation) for s in filtered_text]
filtered_text = [s for s in filtered_text if s]

#### lowercase

In [128]:
filtered_text = [word.lower() for word in filtered_text]

#### Word Stemming

In [129]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
filtered_text_stem = [stemmer.stem(t) for t in filtered_text]

#### Lemmatization

In [130]:
lemmatizer = nltk.WordNetLemmatizer()

filtered_text_lem = [lemmatizer.lemmatize(t) for t in filtered_text]

#### NLTK Frequency Distributions

In [136]:
#nltkText = [filtered_text_stem, filtered_text_lem]

fdist = nltk.FreqDist(filtered_text_lem)

fdistDF = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
fdistDF = fdistDF.rename(columns={'index':'word', 0:'count'})
fdistDF = fdistDF.sort_values(by='count', ascending=False)
print('-----------------head words-----------------')
print(fdistDF.head(10).reset_index(drop=True))
print('-----------------tail words-----------------')
print(fdistDF.tail(10).reset_index(drop=True))

-----------------head words-----------------
         word  count
0        team      6
1    customer      5
2        data      5
3  technology      5
4         you      4
5     product      4
6     problem      4
7        edge      3
8     cutting      3
9      highly      3
-----------------tail words-----------------
            word  count
0    proposition      1
1         search      1
2         engine      1
3          offer      1
4          broad      1
5       exposure      1
6      requiring      1
7          close      1
8  collaboration      1
9     conference      1


## Analyze document words with data science job descripiton database

To indentify what skills are specific for this job vacancy in particulair we have to create a data science job description database. Using this database we can identify what is specific about this job with the steps below.

#### Building a Term-Document Matrix

Use scikit-learn's <code>TfidfVectorizer</code> class to construct a [term-document matrix](http://en.wikipedia.org/wiki/Document-term_matrix) containing the TF-IDF score for each word in each document in the data science job description database. In essence, the rows of this sparse matrix correspond to documents in the corpus, the columns represent each word in the vocabulary of the corpus, and each cell contains the TF-IDF value for a given word in a given document.

#### TF-IDF Scores

Now that we've built the term-document matrix, we can explore its contents:

In [None]:
from random import randint

feature_names = tfidf.get_feature_names()
print 'TDM contains ' + str(len(feature_names)) + ' terms and ' + str(tdm.shape[0]) + ' documents'

print 'first term: ' + feature_names[0]
print 'last term: ' + feature_names[len(feature_names) - 1]

for i in range(0, 4):
    print 'random term: ' + feature_names[randint(1,len(feature_names) - 2)]

#### Generating the Summary

That's all we'll need to produce a summary for any document in the corpus. In the example code below, we start by randomly selecting an article from the data science job description database. We iterate through the article, calculating a score for each sentence by summing the TF-IDF values for each word appearing in the sentence. We normalize the sentence scores by dividing by the number of tokens in the sentence (to avoid bias in favor of longer sentences). Then we sort the sentences by their scores, and return the highest-scoring sentences as our summary. The number of sentences returned corresponds to roughly 20% of the overall length of the article.

In [None]:
import math
from __future__ import division

article_id = randint(0, tdm.shape[0] - 1)
article_text = reuters.raw(reuters.fileids()[article_id])

sent_scores = []
for sentence in nltk.sent_tokenize(article_text):
    score = 0
    sent_tokens = tokenize_and_stem(sentence)
    for token in (t for t in sent_tokens if t in feature_names):
        score += tdm[article_id, feature_names.index(token)]
    sent_scores.append((score / len(sent_tokens), sentence))

summary_length = int(math.ceil(len(sent_scores) / 5))
sent_scores.sort(key=lambda sent: sent[0], reverse=True)

print '*** SUMMARY ***'
for summary_sentence in sent_scores[:summary_length]:
    print summary_sentence[1]

print '\n*** ORIGINAL ***'
print article_text