In [69]:
import pandas as pd
import numpy as np
df = pd.read_csv('../jobs.csv')
df.columns

Index(['Job Title', 'Job Description', 'Job Type', 'Categories', 'Location',
       'City', 'State', 'Country', 'Zip Code', 'Address', 'Salary From',
       'Salary To', 'Salary Period', 'Apply Url', 'Apply Email', 'Employees',
       'Industry', 'Company Name', 'Employer Email', 'Employer Website',
       'Employer Phone', 'Employer Logo', 'Companydescription',
       'Employer Location', 'Employer City', 'Employer State',
       'Employer Country', 'Employer Zip Code', 'Uniq Id', 'Crawl Timestamp'],
      dtype='object')

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
df = df[['Job Title','Job Description']]

In [72]:
df[:300]

Unnamed: 0,Job Title,Job Description
0,Shift Manager,"<div id=""jobDescriptionText"" class=""jobsearch-..."
1,Operations Support Manager,"<div id=""jobDescriptionText"" class=""jobsearch-..."
2,Senior Product Manager - Data,"<div id=""jobDescriptionText"" class=""jobsearch-..."
3,Part-Time Office Concierge,"<div id=""jobDescriptionText"" class=""jobsearch-..."
4,Print & Marketing Associate,"<div id=""jobDescriptionText"" class=""jobsearch-..."
...,...,...
295,GMIT,"<div id=""jobDescriptionText"" class=""jobsearch-..."
296,"Manager, Business Optimization","<div id=""jobDescriptionText"" class=""jobsearch-..."
297,Logistics and Dispatcher Coordinator,"<div id=""jobDescriptionText"" class=""jobsearch-..."
298,Senior DevOps Engineer,"<div id=""jobDescriptionText"" class=""jobsearch-..."


In [73]:
import re
# as per recommendation from @freylis, compile once only
HTML_CLEANR = re.compile('<.*?>') 
BLANK_CLEANR = re.compile('\n')

def cleanhtml(raw_html):
  cleantext = re.sub(HTML_CLEANR, '', raw_html)
  #cleantext = re.sub('\n+', ' ',raw_html)
  return cleantext

def remove(s):
    # Pattern for matching whitespaces
    pattern=r"\s+"

    # Using re.sub() function to remove whitespaces
    s = re.sub(pattern, " ", s)
    return s

In [74]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from functools import reduce
 
ps = PorterStemmer()

stop_words = set(stopwords.words('english'))

In [75]:
df.loc[2,'Job Description']

'<div id="jobDescriptionText" class="jobsearch-jobDescriptionText">Product Manager - Unique opportunity with a mobile marketing leader\n<br><br>\nVibes helps brands unlock new revenue by arming them with the technology and expertise they need to succeed in mobile marketing.\n<br><br>\nWe are a group of passionate technologists, intent on solving hard problems in the evolving world of mobile engagement. We focus on collaboration and continuous improvement of our products and ourselves.\n<br><br>\nYou are a Product Manager with a passion for data and creating amazing customer experiences to connect people to the brands they love. You have experience delivering successful, data informed SaaS experiences and are excited to own our Data experience and continue making it the key differentiator of our platform. You are energized by setting a vision, experimenting to validate or invalidate assumptions, and leading a product experience team to build software that will interact with millions of 

In [78]:
s = cleanhtml(df.loc[2,'Job Description'])
s = remove(s)
s.strip()

words = word_tokenize(s)
stemmed_sentence = reduce(lambda x, y: x + " " + ps.stem(y), words, "")
words = word_tokenize(stemmed_sentence)
' '.join(words)

s

"Product Manager - Unique opportunity with a mobile marketing leader Vibes helps brands unlock new revenue by arming them with the technology and expertise they need to succeed in mobile marketing. We are a group of passionate technologists, intent on solving hard problems in the evolving world of mobile engagement. We focus on collaboration and continuous improvement of our products and ourselves. You are a Product Manager with a passion for data and creating amazing customer experiences to connect people to the brands they love. You have experience delivering successful, data informed SaaS experiences and are excited to own our Data experience and continue making it the key differentiator of our platform. You are energized by setting a vision, experimenting to validate or invalidate assumptions, and leading a product experience team to build software that will interact with millions of people via their phones. Responsibilities Define data strategy for our platform Manage product deve

In [79]:
for i in range(len(df)):
    s = cleanhtml(df.loc[i,'Job Description'])
    s = remove(s)
    s.strip()
    
    words = word_tokenize(s)
#     stemmed_sentence = reduce(lambda x, y: x + " " + ps.stem(y), words, "")
#     words = word_tokenize(stemmed_sentence)
    filtered_sentence = [w for w in words if not w.lower() in stop_words]
    
    df.loc[i,'Job Description'] = ' '.join(filtered_sentence)


In [80]:
df.loc[2,'Job Description']

"Product Manager - Unique opportunity mobile marketing leader Vibes helps brands unlock new revenue arming technology expertise need succeed mobile marketing . group passionate technologists , intent solving hard problems evolving world mobile engagement . focus collaboration continuous improvement products . Product Manager passion data creating amazing customer experiences connect people brands love . experience delivering successful , data informed SaaS experiences excited Data experience continue making key differentiator platform . energized setting vision , experimenting validate invalidate assumptions , leading product experience team build software interact millions people via phones . Responsibilities Define data strategy platform Manage product development strategy lean hypotheses & amp ; experimentation , defining customer experiences journeys , owning managing product backlog Lead team take potential data , models , outputs integrate current platform experience potential ne

In [81]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Job Description']).toarray()
vocab = tfidf.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}
feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(X_tfidf, columns = feature_names)
idx = X_tfidf.argsort(axis=1)

In [82]:
tfidf_max2 = idx[:,-2:]

In [83]:
print ([[(reverse_vocab.get(item), X_tfidf[i, item])  for item in row] 
                     for i, row in enumerate(tfidf_max2) ])

[[('guest', 0.25332621432566116), ('gm', 0.25637564878519087)], [('ability', 0.16858466136763411), ('restaurant', 0.4020204988382898)], [('vibes', 0.3564793327079397), ('data', 0.3703716636710829)], [('office', 0.3442380838581034), ('agents', 0.41892226987662323)], [('print', 0.3799754850936955), ('copy', 0.41091082624901526)], [('risk', 0.48552517601231554), ('cyber', 0.6481255469833854)], [('nutritional', 0.2957297713165688), ('gnc', 0.45691607578490395)], [('bank', 0.3944401918562681), ('mortgage', 0.419924831280332)], [('property', 0.24980481166536395), ('prg', 0.43132327691750943)], [('cannabis', 0.3138122532280034), ('gti', 0.5465968299726927)], [('copywriter', 0.24369819045420005), ('hcp', 0.2444904438126804)], [('event', 0.17224697858726679), ('production', 0.23138792968092026)], [('sally', 0.3212129904831611), ('beauty', 0.578263393924879)], [('cost', 0.2376443939162537), ('l3harris', 0.5442988534434979)], [('hewitt', 0.29884871146670966), ('tax', 0.5679904824497791)], [('sand