# Workshop

This is where I try out new ideas before moving them into a proper Python file.

## Step 1:

Load dataset

In [56]:
# Load in dataset

import pandas as pd

# The path is relative to the notebook's working directory.
csv_path = '../datasets/LinkedInCyberJobs.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,_id,title,company,location,role,description
0,8dae0e13c2d5314a61962f9e8216daa1,IT Compliance Analyst - University of North Te...,University of North Texas System,"Denton, TX",IT Compliance Analyst,UNT System Overview\nWelcome to the University...
1,acbd8b58af92fdd1a3c2d7a4902712e6,Senior Information Security Analyst/IT Auditor...,HITRUST Service Corp,"Frisco, TX",Senior Information Security Analyst/IT Auditor,Description:\nSummary:\nThe HITRUST Assurance ...
2,ff9a9dabe363b92f62d676d65017c9ba,Network Security Analyst - Addus Homecare - Fr...,Addus Homecare,"Frisco, TX",Network Security Analyst,Shift: Overnight including weekends\nPosition ...
3,602c6dff4ebf9350376092945b84ec59,"Threat Analyst (Remote, USA) - Optiv - Frisco, TX",Optiv,"Frisco, TX","Threat Analyst (Remote, USA)","At Optiv, we’re on a mission to help our clien..."
4,e111f13ba66761887fbd370a6835008c,Information Security Analyst - Remote - Conife...,Remote,Conifer Health Solutions,Information Security Analyst,As a part of the Tenet and Catholic Health Ini...
...,...,...,...,...,...,...
4498,6e5a3e93d4e34c6ab8af6935188ab1b6,Cyber Security Analyst (FS Polygraph) - Zachar...,Zachary Piper Solutions,"McLean, VA",Cyber Security Analyst (FS Polygraph),Zachary Piper Solutions is seeking a fully cle...
4499,eebe7608a6e1de391afa0d4356c99a25,Information Security Analyst - Pinnacle Group ...,Pinnacle Group Workplace Solutions Provider,"McLean, VA",Information Security Analyst,Title:\nInformation Security Analyst\nLocation...
4500,f3356f8496dc27545cf3599312759b47,Information Security Analyst - soho square sol...,soho square solutions,"McLean, VA",Information Security Analyst,Information Security Analyst\nGreetings from S...
4501,ebb1354fe1240c7638c25fa96bbb74e0,Supervisor - Vulnerability Management - TEGNA ...,Vulnerability Management,TEGNA,Supervisor,This position will be responsible for supervis...


In [68]:
from util import Preprocessor, TextBlobTokenizer

In [69]:
# Smoke-test the util helpers on one sentence: preprocess it, then tokenize the result.
tok = TextBlobTokenizer()
pp = Preprocessor()
t = pp("Hello there, general kenobi. we'll need a hundred security personel with engineering skills.")
blob = tok(t)
blob

['hello', 'general kenobi', 'need', 'security personel', 'engineering skills']

## Step 2:

Clean up the data
- Vectorize the data
  - Ignore words shorter than three letters
  - Lemmatize the words
  - Ignore words that occur less than n times
- Strip out stop words
- Strip out numbers

In [38]:
# Vectorize the descriptions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


# Build TF-IDF vectors for the job descriptions: keep the top 1000 features among
# terms that occur in at most 95% of the documents.
# NOTE(review): scikit-learn documents that a custom `preprocessor` overrides the
# built-in strip_accents/lowercase stage, so strip_accents='unicode' probably has no
# effect unless Preprocessor strips accents itself — confirm against util.
count_vec = TfidfVectorizer(
  strip_accents='unicode',
  max_features=1000,
  max_df=.95,
  tokenizer=TextBlobTokenizer(),
  preprocessor=Preprocessor(),
)
desc_vecs = count_vec.fit_transform(df['description'])
desc_vecs

<4503x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 191389 stored elements in Compressed Sparse Row format>

In [66]:
import pickle

# Persist the fitted vectorizer to disk so it can be reloaded without refitting.
with open('./vectorizer.bin', 'wb') as vec_file:
  vec_file.write(pickle.dumps(count_vec))

In [39]:
# One row per job description, one column per TF-IDF feature.
# toarray() yields a plain ndarray; todense() returns the legacy np.matrix type,
# which NumPy no longer recommends. The resulting frame is the same.
df_count = pd.DataFrame(desc_vecs.toarray(), columns=count_vec.get_feature_names_out())
df_count

Unnamed: 0,401k 401k,ability,able,accenture,access,access control,access management,accommodation accenture,accommodation needs,accommodation requests,...,work experience,working,world's,writing,years,years experience,york,youll,your,zscaler
0,0.0,0.091571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.055604,0.00000,0.0,0.0,0.0
1,0.0,0.064593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.103185,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
3,0.0,0.213498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
4,0.0,0.051701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4498,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
4499,0.0,0.062670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0
4500,0.0,0.059750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.41725,0.0,0.0,0.0
4501,0.0,0.240551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.146068,0.00000,0.0,0.0,0.0


In [40]:
from nltk.corpus import stopwords
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')


# Remove stopwords
# print(stopwords.words('english'))
# Build a new frame rather than dropping in place, so the frame displayed by the
# previous cell is never mutated behind the reader's back.
df_count = df_count.drop(columns=stopwords.words('english'), errors='ignore')

# Remove numbers
# df_count = df_count[df_count.columns.drop(df_count.filter(regex = '[0-9]').columns)]

# Remove underscores: drop every column whose name contains '_'
df_count = df_count.drop(columns=df_count.filter(regex='_').columns)

df_count

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,401k 401k,ability,able,accenture,access,access control,access management,accommodation accenture,accommodation needs,accommodation requests,...,work environment,work experience,working,world's,writing,years,years experience,york,youll,zscaler
0,0.0,0.091571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.055604,0.00000,0.0,0.0
1,0.0,0.064593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.103185,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
3,0.0,0.213498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
4,0.0,0.051701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.119894,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4498,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
4499,0.0,0.062670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0
4500,0.0,0.059750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.41725,0.0,0.0
4501,0.0,0.240551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.146068,0.00000,0.0,0.0


In [41]:
# count_vec.stop_words_
# Inspect the 30 highest-weighted phrases of one document (row 50), in ascending order
df_count.iloc[50].sort_values().tail(30)

senior management                  0.118044
security solutions                 0.119444
etc ..                             0.120775
wide range                         0.122168
tx                                 0.124234
oscp                               0.126949
software                           0.127116
log analysis                       0.127625
oral communication skills          0.130119
high level                         0.130681
high degree                        0.131062
gcia                               0.133454
vaccination status                 0.139147
technical skills                   0.140167
provide                            0.140660
network protocols                  0.141223
financial                          0.142879
utilize                            0.144036
criminal histories                 0.145240
covid-19                           0.148767
threat actors                      0.149521
remediation efforts                0.151724
threat analysis                 

## Step 3:

Compute "relatedness" for each word pair

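$$\alpha(w_1, w_2) = \frac{\sum_{v \in V} v[w_1] \cdot v[w_2]}{\left|\{x \in V \mid w_1, w_2 \in x\}\right|}$$

where $V$ is the set of document vectors, $v[w]$ is the weight of phrase $w$ in document $v$, and the denominator counts the documents that contain both $w_1$ and $w_2$.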

In [42]:
# Number of rows (job postings) in the loaded dataset
print(len(df))

4503


In [43]:
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm

def relatedness(df, threshold=0):
  """Score how strongly every pair of columns (phrases) co-occurs.

  For each unordered pair of columns the row-wise products are summed and the
  sum is divided by the number of rows whose product is positive, i.e. the mean
  product over the rows where both phrases carry weight.

  Args:
    df: DataFrame with one column per phrase and one row per document.
    threshold: a pair is recorded only when its summed product is strictly
      greater than this value (default 0, the original behaviour).

  Returns:
    A square float DataFrame whose rows are named "source" and whose columns
    are named "target". For a pair (word1, word2), in column order, only the
    cell at row word2 / column word1 is filled (rounded to 5 decimals); every
    other cell, including the diagonal, is NaN.
  """
  phrases = df.columns

  # An empty frame defaults to object dtype; float keeps the table numeric.
  df_out = pd.DataFrame(index=phrases, columns=phrases, dtype=float)
  df_out = df_out.rename_axis("source", axis=0).rename_axis("target", axis=1)

  # Stream the pairs instead of materialising them all in a list; passing
  # `total` keeps the progress bar accurate.
  n_pairs = len(phrases) * (len(phrases) - 1) // 2
  for word1, word2 in tqdm(combinations(phrases, 2), total=n_pairs):
    products = df[word1].multiply(df[word2])
    total = products.sum()
    count = products[products > 0].count()
    # count > 0 guards the division when a negative threshold is supplied.
    if total > threshold and count > 0:
      # .at writes straight into df_out; the chained form df_out[a][b] = ...
      # may write to a temporary and is a no-op under pandas Copy-on-Write.
      df_out.at[word2, word1] = round(total / count, 5)

  return df_out

# Score every pair of phrase columns in df_count. This is slow: the recorded run
# took about 2.5 minutes for 488,566 pairs.
relate = relatedness(df_count)


100%|██████████| 488566/488566 [02:37<00:00, 3103.08it/s]


In [44]:
# Normalize the weights
# weights = [float(i) for i in relate[2]]
# m = max(weights)
# weights = [(i/m)*100 for i in weights]
# thresh = 10
# weights = [i if i > thresh else 0 for i in weights]
# max(weights)
# Rescale so the largest value in each column becomes 100.
# NOTE(review): relate.max() is a per-column maximum, so every target column is
# scaled independently rather than against one global maximum — confirm intended.
relate = relate.div(relate.max(), axis='columns').mul(100)
relate

target,401k 401k,ability,able,accenture,access,access control,access management,accommodation accenture,accommodation needs,accommodation requests,...,work environment,work experience,working,world's,writing,years,years experience,york,youll,zscaler
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
401k 401k,,,,,,,,,,,...,,,,,,,,,,
ability,18.972563,,,,,,,,,,...,,,,,,,,,,
able,16.303895,17.212258,,,,,,,,,...,,,,,,,,,,
accenture,,21.112659,,,,,,,,,...,,,,,,,,,,
access,9.615545,14.269546,11.638579,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
years,,21.582796,20.65828,,,19.377498,14.886757,,,,...,34.542314,100.0,100.0,,100.0,,,,,
years experience,12.909682,13.372802,14.518554,25.765128,19.467647,15.689718,20.706053,20.858609,20.858609,23.27331,...,57.037997,79.189302,43.688725,38.713423,40.108565,61.954849,,,,
york,28.054374,16.35034,16.607327,,12.273082,9.071091,31.200129,,,20.905279,...,,94.274969,64.583333,35.973061,,100.0,100.0,,,
youll,28.930031,15.810552,5.087428,16.26458,35.015035,12.268378,8.285645,14.69902,14.69902,,...,33.678756,45.967405,26.531863,100.0,8.58464,15.187276,98.722951,50.507511,,


In [58]:
# Let pandas open the file itself: it writes UTF-8 and manages line endings.
# Writing the to_csv() string through a text-mode handle can double the carriage
# returns on Windows and depends on the locale's default encoding.
relate.to_csv('related.csv')
  

In [45]:
# Import library
from d3graph import d3graph, vec2adjmat
# Create the d3graph object with the slider and charge settings for this plot.
# NOTE(review): the recorded run logs "Slider range is set to [0, 100]" even though
# slider=[50,100] is passed here — confirm the argument is actually honoured.
d3 = d3graph(slider=[50,100], charge=1000)

# Set node properties
# TODO: no node properties are configured yet.

# Build the graph from the normalized relatedness table (presumably read as an
# adjacency matrix — verify against the d3graph docs) and write it to
# ./output.html, overwriting any previous file.
d3.graph(relate)
d3.show(figsize=(5000,5000), filepath='./output.html', overwrite=True)

[d3graph] INFO> Set directed=True to see the markers!
[d3graph] INFO> Keep only edges with weight>0
[d3graph] INFO> Number of unique nodes: 989
[d3graph] INFO> Slider range is set to [0, 100]
[d3graph] INFO> Write to path: [d:\projects\JobSearch\src\output.html]
[d3graph] INFO> File already exists and will be overwritten: [d:\projects\JobSearch\src\output.html]


<networkx.classes.digraph.DiGraph at 0x219e8466fb0>