# Workshop

This is where I'm trying out new ideas before I move them into a proper python file

## Step 1:

Load dataset

In [17]:
# Load in dataset

import pandas as pd

df = pd.read_csv('../datasets/LinkedInCyberJobs.csv')
df

Unnamed: 0,_id,title,company,location,role,description
0,8dae0e13c2d5314a61962f9e8216daa1,IT Compliance Analyst - University of North Te...,University of North Texas System,"Denton, TX",IT Compliance Analyst,UNT System Overview\nWelcome to the University...
1,acbd8b58af92fdd1a3c2d7a4902712e6,Senior Information Security Analyst/IT Auditor...,HITRUST Service Corp,"Frisco, TX",Senior Information Security Analyst/IT Auditor,Description:\nSummary:\nThe HITRUST Assurance ...
2,ff9a9dabe363b92f62d676d65017c9ba,Network Security Analyst - Addus Homecare - Fr...,Addus Homecare,"Frisco, TX",Network Security Analyst,Shift: Overnight including weekends\nPosition ...
3,602c6dff4ebf9350376092945b84ec59,"Threat Analyst (Remote, USA) - Optiv - Frisco, TX",Optiv,"Frisco, TX","Threat Analyst (Remote, USA)","At Optiv, we’re on a mission to help our clien..."
4,e111f13ba66761887fbd370a6835008c,Information Security Analyst - Remote - Conife...,Remote,Conifer Health Solutions,Information Security Analyst,As a part of the Tenet and Catholic Health Ini...
...,...,...,...,...,...,...
4498,6e5a3e93d4e34c6ab8af6935188ab1b6,Cyber Security Analyst (FS Polygraph) - Zachar...,Zachary Piper Solutions,"McLean, VA",Cyber Security Analyst (FS Polygraph),Zachary Piper Solutions is seeking a fully cle...
4499,eebe7608a6e1de391afa0d4356c99a25,Information Security Analyst - Pinnacle Group ...,Pinnacle Group Workplace Solutions Provider,"McLean, VA",Information Security Analyst,Title:\nInformation Security Analyst\nLocation...
4500,f3356f8496dc27545cf3599312759b47,Information Security Analyst - soho square sol...,soho square solutions,"McLean, VA",Information Security Analyst,Information Security Analyst\nGreetings from S...
4501,ebb1354fe1240c7638c25fa96bbb74e0,Supervisor - Vulnerability Management - TEGNA ...,Vulnerability Management,TEGNA,Supervisor,This position will be responsible for supervis...


## Step 2:

Clean up the data
- Vectorize the data
  - Ignore under three letter words
  - Lemmatize the words
  - Ignore words that occur less than n times
- Strip out stop words
- Strip out numbers

In [18]:
# Vectorize the descriptions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Make a tokenizer that lemmatizes as it goes
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize  
import re
# From here:  https://stackoverflow.com/questions/47423854/sklearn-adding-lemmatizer-to-countvectorizer
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        regex_num_ponctuation = '(\d+)|([^\w\s])'
        # Ignore ine and two letter words
        regex_little_words = r'(\b\w{1,3}\b)'
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) 
                if not re.search(regex_num_ponctuation, t) and not re.search(regex_little_words, t)]

# Words are stipped of their accents, and only terms that appear in at least n %
count_vec = TfidfVectorizer(strip_accents='unicode', min_df=0.1, tokenizer=LemmaTokenizer())
desc_vecs = count_vec.fit_transform(df.description)
desc_vecs

<4503x682 sparse matrix of type '<class 'numpy.float64'>'
	with 824901 stored elements in Compressed Sparse Row format>

In [19]:
df_count = pd.DataFrame(desc_vecs.todense(), columns=count_vec.get_feature_names_out())
df_count

Unnamed: 0,ability,able,about,access,accommodation,accordance,account,achieve,across,action,...,work,workforce,working,workplace,world,would,writing,written,year,your
0,0.135132,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.022150,0.025519,...,0.129179,0.000000,0.058119,0.0,0.053656,0.000000,0.000000,0.000000,0.073797,0.000000
1,0.110574,0.046029,0.039760,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.046979,0.000000,0.158524,0.0,0.000000,0.139899,0.000000,0.000000,0.024154,0.037171
2,0.058404,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.088236,...,0.074442,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.041212,0.000000,0.000000
3,0.192575,0.000000,0.000000,0.023585,0.000000,0.0,0.0,0.0,0.042087,0.000000,...,0.068182,0.000000,0.036811,0.0,0.000000,0.000000,0.000000,0.022648,0.014022,0.021579
4,0.064744,0.107805,0.000000,0.222023,0.154863,0.0,0.0,0.0,0.000000,0.000000,...,0.073353,0.000000,0.000000,0.0,0.000000,0.054609,0.000000,0.000000,0.037715,0.029019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4498,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.048324,...,0.027180,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.027949,0.043010
4499,0.022859,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.019424,0.000000,0.000000,0.0,0.000000,0.057844,0.049902,0.032261,0.079897,0.000000
4500,0.070216,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.059665,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.049547,0.184062,0.047208
4501,0.137876,0.000000,0.049577,0.000000,0.061836,0.0,0.0,0.0,0.090399,0.000000,...,0.000000,0.084512,0.039533,0.0,0.000000,0.000000,0.000000,0.000000,0.030119,0.000000


In [20]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


# Remove stopwords
# print(stopwords.words('english'))
df_count.drop(stopwords.words('english'), axis=1, errors='ignore', inplace=True)

# Remove numbers
df_count = df_count[df_count.columns.drop(df_count.filter(regex = '[0-9]').columns)]

# Remove underscores
df_count = df_count[df_count.columns.drop(df_count.filter(regex = '_').columns)]

df_count

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ability,able,access,accommodation,accordance,account,achieve,across,action,active,...,without,work,workforce,working,workplace,world,would,writing,written,year
0,0.135132,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.022150,0.025519,0.000000,...,0.000000,0.129179,0.000000,0.058119,0.0,0.053656,0.000000,0.000000,0.000000,0.073797
1,0.110574,0.046029,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.040604,0.046979,0.000000,0.158524,0.0,0.000000,0.139899,0.000000,0.000000,0.024154
2,0.058404,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.088236,0.000000,...,0.042893,0.074442,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.041212,0.000000
3,0.192575,0.000000,0.023585,0.000000,0.0,0.0,0.0,0.042087,0.000000,0.031472,...,0.023572,0.068182,0.000000,0.036811,0.0,0.000000,0.000000,0.000000,0.022648,0.014022
4,0.064744,0.107805,0.222023,0.154863,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.073353,0.000000,0.000000,0.0,0.000000,0.054609,0.000000,0.000000,0.037715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4498,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.048324,0.000000,...,0.000000,0.027180,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.027949
4499,0.022859,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.044831,...,0.000000,0.019424,0.000000,0.000000,0.0,0.000000,0.057844,0.049902,0.032261,0.079897
4500,0.070216,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.068852,...,0.000000,0.059665,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.049547,0.184062
4501,0.137876,0.000000,0.000000,0.061836,0.0,0.0,0.0,0.090399,0.000000,0.000000,...,0.000000,0.000000,0.084512,0.039533,0.0,0.000000,0.000000,0.000000,0.000000,0.030119


In [31]:
# count_vec.stop_words_
# Take a look at the top 30 words from a document
df_count.iloc[50].sort_values()[-30:]

creating        0.079045
responsible     0.079076
assist          0.081471
analyzing       0.084603
position        0.085144
sponsorship     0.085210
analysis        0.087375
duty            0.088461
regarding       0.090432
reporting       0.090817
life            0.091985
applicable      0.092198
pregnancy       0.092493
also            0.094552
professional    0.094604
future          0.095301
resource        0.102751
asset           0.105669
program         0.111141
business        0.113111
cloud           0.116332
integrity       0.118114
incident        0.124229
eligible        0.130974
client          0.136022
corporate       0.145373
federal         0.159494
monitoring      0.167168
information     0.179060
security        0.234266
Name: 50, dtype: float64

# Step 3

Compute "relatedness" for each word pair

$$\alpha(w_1, w_2) = \frac{\sum_{v \in V}{v[w_1]*v[w_2]}}{|\{x \in V | w_1, w_2 \in x \}|}$$

In [22]:
print(df.shape[0])

4503


In [38]:
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm

def relatedness(df):
  # returns a map of relatedness for every pair of words

  threshold = 0

  df_out = pd.DataFrame(index=df.columns, columns=df.columns)
  df_out.rename_axis("source", axis=0, inplace=True)
  df_out.rename_axis("target", axis=1, inplace=True)

  for (word1, word2) in tqdm(list(combinations(df.columns,2))):
    seq = df[word1].multiply(df[word2])
    sum = seq.sum()
    count = seq[seq > 0].count()
    # print(sum)
    if sum > threshold:
      df_out[word1][word2] = round(sum/count,5)

  return df_out

relate = relatedness(df_count)


100%|██████████| 206403/206403 [01:12<00:00, 2865.68it/s]


In [50]:
# Normalize the weights
# weights = [float(i) for i in relate[2]]
# m = max(weights)
# weights = [(i/m)*100 for i in weights]
# thresh = 10
# weights = [i if i > thresh else 0 for i in weights]
# max(weights)
relate = (relate/relate.max()) * 100
relate

target,ability,able,access,accommodation,accordance,account,achieve,across,action,active,...,without,work,workforce,working,workplace,world,would,writing,written,year
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ability,,,,,,,,,,,...,,,,,,,,,,
able,33.58999,,,,,,,,,,...,,,,,,,,,,
access,36.092397,33.901919,,,,,,,,,...,,,,,,,,,,
accommodation,38.883542,38.059701,54.004955,,,,,,,,...,,,,,,,,,,
accordance,29.643888,27.82516,29.562345,26.923077,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
world,25.697786,28.678038,25.598679,37.5,27.887324,25.617566,25.124131,34.398977,34.363853,31.25,...,74.509804,93.650794,96.790123,81.736527,100.0,,,,,
would,37.05486,31.769723,28.323699,55.251479,30.798122,37.968893,23.038729,34.2711,30.796671,33.958333,...,73.333333,100.0,88.888889,100.0,100.0,100.0,,,,
writing,33.58999,32.089552,27.085054,27.884615,27.605634,35.041171,25.620655,33.887468,28.180737,38.854167,...,74.117647,82.993197,75.802469,80.538922,88.56305,84.175084,100.0,,,
written,22.810395,22.921109,16.350124,18.639053,20.56338,21.866423,16.385303,21.355499,18.668252,21.25,...,46.666667,54.875283,52.592593,54.191617,57.184751,58.249158,67.251462,92.050209,,


In [51]:
# Import library
from d3graph import d3graph, vec2adjmat
# Initialization
d3 = d3graph(slider=[50,100], charge=1000)

# Set node properties

# Plot
d3.graph(relate)
# d3.set_node_properties(color='#FFFFFF')
d3.show()

[d3graph] INFO> Set directed=True to see the markers!
[d3graph] INFO> Keep only edges with weight>0
[d3graph] INFO> Number of unique nodes: 643
[d3graph] INFO> Slider range is set to [3, 100]
[d3graph] INFO> Write to path: [C:\Users\John\AppData\Local\Temp\tmpcfipqxhv\d3graph.html]


<networkx.classes.digraph.DiGraph at 0x228a623a380>