<a href="https://colab.research.google.com/github/Mai1902/landing/blob/main/skills_cluster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install sklearn

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
from sklearn import feature_extraction
from nltk.stem import PorterStemmer


In [106]:
# Load job description file
df = pd.read_csv('/dice_com-job_us_sample.csv')
im_df = pd.DataFrame(df, columns = ['company', 'employment', 'jobdescription', 'jobtitle', 'skills'])
data_dict = im_df.to_dict()
jd_content = [x for x in data_dict['jobdescription'].values()]

jd_content_sample = jd_content[:1000]


In [10]:
from nltk.corpus import stopwords

In [13]:
#initialize stop words and stemmer for text processing
stopWords = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [32]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
'''Method to tokenize and stemmized the text'''
def tokenize_and_stemmized(text):
  tokens = []

  #Add all tokenized word into list of token
  for sent in nltk.sent_tokenize(text):
    for word in nltk.word_tokenize(sent):
      tokens.append(word)
  filtered_tokens = []

  # Filter out token that only contain letter, + sign,underscore, and number
  for token in tokens:
    if re.search("^[A-Za-z0-9+_-]*$", token):
      filtered_tokens.append(token)
  stems = [stemmer.stem(t) for t in filtered_tokens]
  return stems, filtered_tokens

stemmed_and_filtered = []
filtered_only = []
for jd in jd_content:
  stemmed_and_filtered = tokenize_and_stemmized(jd)[0]
  filtered_only = tokenize_and_stemmized(jd)[1]


In [64]:
print(filtered_only[:20])

['Experience', 'in', 'ProgrammingDevelopment', 'experience', 'in', 'Win32', 'Programming', 'on', 'Win', 'must', 'have', 'Experience', 'using', 'debuggers', 'such', 'as', 'WinDbgExperience', 'on', 'Windows', 'kernel']


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# define vectorizer parameters
jd_forfit = [[jd for jd in jd_content_sample]]
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stemmized, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(jd_forfit)
print(tfidf_matrix.shape)

In [54]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
# Switch to LDA approach
from gensim import corpora, models, similarities


In [None]:
!pip install rake_nltk

In [92]:
from rake_nltk import Rake

In [107]:
# Get keyword only from jd_content
rake = Rake()
for jd in jd_content_sample:
  rake.extract_keywords_from_text(jd)
keywords = rake.get_ranked_phrases()


In [108]:
# remove stop words from list of tokenized and stemmed token
filtered_only = [[word for word in tokenize_and_stemmized(keys)[1]] for keys in keywords]

cleaned_jd = [[word for word in text if word not in stopWords] for text in filtered_only]




In [109]:
# Create a Gensim dictionary from the processced job description
dictionary = corpora.Dictionary(cleaned_jd)

# Remove the extreme vocab based on term frequency
dictionary.filter_extremes(no_below=1,no_above=0.8)

#convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(jd) for jd in cleaned_jd]

In [110]:
print(dictionary)

Dictionary(389 unique tokens: ['development', 'experience', 'initiatives', 'methodologies', 'modern']...)


In [111]:
# Create LDA model with 20 different clusters
lda = models.LdaModel(corpus, num_topics=20, 
                            id2word=dictionary, 
                            update_every=5, 
                            chunksize=10000, 
                            passes=100)
lda.show_topics()

[(18,
  '0.077*"understanding" + 0.042*"models" + 0.042*"use" + 0.042*"growing" + 0.021*"data" + 0.021*"business" + 0.021*"deep" + 0.021*"ideal" + 0.021*"day" + 0.021*"consists"'),
 (6,
  '0.038*"solutions" + 0.038*"architecture" + 0.038*"etc" + 0.038*"cutting" + 0.020*"solution" + 0.020*"based" + 0.020*"best" + 0.020*"possible" + 0.020*"janeiro" + 0.020*"digital"'),
 (13,
  '0.151*"experience" + 0.076*"team" + 0.031*"members" + 0.016*"extreme" + 0.016*"web" + 0.016*"aws" + 0.016*"restful" + 0.016*"home" + 0.016*"successful" + 0.016*"html5"'),
 (19,
  '0.108*"development" + 0.047*"experience" + 0.031*"practical" + 0.031*"product" + 0.031*"realize" + 0.016*"various" + 0.016*"one" + 0.016*"across" + 0.016*"online" + 0.016*"less"'),
 (5,
  '0.060*"boston" + 0.060*"deployment" + 0.030*"architecture" + 0.030*"models" + 0.030*"software" + 0.030*"leader" + 0.030*"backgrounds" + 0.030*"may" + 0.030*"wide" + 0.030*"www"'),
 (0,
  '0.051*"sql" + 0.051*"integration" + 0.051*"continuous" + 0.045*"

In [118]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,1]
for i in topic_words:
    print([str(word[0]) for word in i])
    print()

['delivery', 'new', 'drive', 'constantly', 'monitoring', 'improve', 'right', 'programming', 'postgresql', 'languages', 'project', 'execution', 'enhance', 'yet', 'problems', 'exciting', 'difficult', 'process', 'teammates', 'combination']

['knowledge', 'specifications', 'skills', 'technical', 'grow', 'get', 'object', 'opportunity', '5', 'general', 'oriented', 'programming', 'mobile', 'growth', 'information', 'important', 'ranging', 'user', 'infrastructure', 'setup']

['collaborate', 'working', 'good', 'directly', 'split', 'responsibilitiesworking', 'context', 'bulletproof', 'number', 'love', 'wields', 'also', 'problem', 'solving', 'solution', 'data', 'architecture', 'creative', 'implementation', 'delivery']

['work', 'project', 'edge', 'benefits', 'startups', 'available', 'using', 'specifics', 'responsibilities', 'following', 'inception', 'bonus', 'complete', 'including', 'package', 'chicago', 'health', 'delivered', 'choice', 'leverage']

['test', 'phases', 'automation', 'revel', 'optim

  
