# Looking for implied requirements

Here we take a job posting, preprocess it, and then look for related terms that are not included in the job posting. The most highly correlated terms should be implied relationships to the job posting.

In [237]:
import pandas as pd
related = pd.read_csv('related.csv', header=0, index_col=0)
# convert nan to 0
related = related.fillna(0)
related

Unnamed: 0_level_0,401k 401k,ability,able,accenture,access,access control,access management,accommodation accenture,accommodation needs,accommodation requests,...,work environment,work experience,working,world's,writing,years,years experience,york,youll,zscaler
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
401k 401k,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
ability,18.972563,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
able,16.303895,17.212258,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
accenture,0.000000,21.112659,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
access,9.615545,14.269546,11.638579,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
years,0.000000,21.582796,20.658280,0.000000,0.000000,19.377498,14.886757,0.000000,0.000000,0.000000,...,34.542314,100.000000,100.000000,0.000000,100.000000,0.000000,0.000000,0.000000,0.0,0.0
years experience,12.909682,13.372802,14.518554,25.765128,19.467647,15.689718,20.706053,20.858609,20.858609,23.273310,...,57.037997,79.189302,43.688725,38.713423,40.108565,61.954849,0.000000,0.000000,0.0,0.0
york,28.054374,16.350340,16.607327,0.000000,12.273082,9.071091,31.200129,0.000000,0.000000,20.905279,...,0.000000,94.274969,64.583333,35.973061,0.000000,100.000000,100.000000,0.000000,0.0,0.0
youll,28.930031,15.810552,5.087428,16.264580,35.015035,12.268378,8.285645,14.699020,14.699020,0.000000,...,33.678756,45.967405,26.531863,100.000000,8.584640,15.187276,98.722951,50.507511,0.0,0.0


In [238]:
from util import Preprocessor, TextBlobTokenizer

In [239]:
import pickle
with open('vectorizer.bin', 'rb') as f:
 vectorizer = pickle.load(f, fix_imports=True)

vectorizer.transform(['test'])

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [240]:
vec = None
with open('job_posting.txt', 'r') as f:
  vec = vectorizer.transform([f.read()])


df_count = pd.DataFrame(vec.todense(), columns=vectorizer.get_feature_names_out())
df_count

Unnamed: 0,401k 401k,ability,able,accenture,access,access control,access management,accommodation accenture,accommodation needs,accommodation requests,...,work experience,working,world's,writing,years,years experience,york,youll,your,zscaler
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
# Extract the top 30 terms, if any are zero exclude those. (Just for comparison)
top_30 = df_count.iloc[0].sort_values()[-30:]
top_terms = top_30.iloc[top_30.to_numpy().nonzero()]
top_terms

experience                    0.143395
minimum                       0.145004
cybersecurity                 0.151618
collaborate                   0.165789
architecture                  0.187443
internal                      0.199719
enterprise                    0.209358
governance                    0.213252
consulting                    0.215812
minimum qualifications        0.216142
investigate                   0.217483
direct                        0.229350
compliance management         0.231114
sr                            0.245047
sensitive data                0.246871
technical experience          0.247495
identity access management    0.250738
operations                    0.285771
position                      0.412578
Name: 0, dtype: float64

In [242]:
df_count.iloc[0]

401k 401k           0.0
ability             0.0
able                0.0
accenture           0.0
access              0.0
                   ... 
years experience    0.0
york                0.0
youll               0.0
your                0.0
zscaler             0.0
Name: 0, Length: 1000, dtype: float64

In [243]:
# Get the top n most related terms based on the relatedness matrix.

# For each term in our top selection, find the highest scoring neighbors

# Create an empty series that contains all of our terms
neighbors = df_count.copy()
neighbors[:] = 0

for term, val  in top_terms.items():
  neighbors += related[term]*val

# Extract out non-zero values
neighbors = neighbors.squeeze().sort_values(ascending=False)
neighbors = neighbors[neighbors!=0]

neighbors

visa                115.257117
years               112.924005
test                111.038742
risk compliance     108.229650
technical skills    108.196635
                       ...    
or                         NaN
were                       NaN
who                        NaN
will                       NaN
your                       NaN
Name: 0, Length: 928, dtype: float64

In [244]:
# Print out the top twenty related terms
neighbors[:20]

visa                     115.257117
years                    112.924005
test                     111.038742
risk compliance          108.229650
technical skills         108.196635
texas                    102.174998
wireshark                100.848272
veteran                   99.590403
providing                 99.501364
willingness               97.624900
technical teams           97.544224
training                  97.075418
team player               97.020739
tuition reimbursement     96.326865
york                      94.323678
team environment          93.927456
sr.                       92.981114
technical expertise       91.392812
virginia                  91.243785
vulnerability             91.130342
Name: 0, dtype: float64