<a href="https://colab.research.google.com/github/Mai1902/landing/blob/main/skills_cluster_ver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install sklearn

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
from sklearn import feature_extraction
from nltk.stem import PorterStemmer


In [None]:
# Load job description file
df = pd.read_csv('/dice_com-job_us_sample.csv')
im_df = pd.DataFrame(df, columns = ['company', 'employment', 'jobdescription', 'jobtitle', 'skills'])
data_dict = im_df.to_dict()
jd_content = [x for x in data_dict['jobdescription'].values()]
skills = [x for x in data_dict['skills'].values()]
skills_cleaned = []
for skill in skills:
  skill = str(skill)
  skill = skill.replace(',', '')
  skills_cleaned.append(skill)

jd_content_sample = jd_content[:10000]


In [10]:
from nltk.corpus import stopwords

In [13]:
#initialize stop words and stemmer for text processing
stopWords = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [32]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
'''Method to tokenize and stemmized the text'''
def tokenize_and_stemmized(text):
  tokens = []

  #Add all tokenized word into list of token
  for sent in nltk.sent_tokenize(text):
    for word in nltk.word_tokenize(sent):
      tokens.append(word)
  filtered_tokens = []

  # Filter out token that only contain letter, + sign,underscore, and number
  for token in tokens:
    if re.search("^[A-Za-z0-9+_-]*$", token):
      filtered_tokens.append(token)
  stems = [stemmer.stem(t) for t in filtered_tokens]
  return stems, filtered_tokens

stemmed_and_filtered = []
filtered_only = []
for jd in jd_content:
  stemmed_and_filtered = tokenize_and_stemmized(jd)[0]
  filtered_only = tokenize_and_stemmized(jd)[1]


In [64]:
print(filtered_only[:20])

['Experience', 'in', 'ProgrammingDevelopment', 'experience', 'in', 'Win32', 'Programming', 'on', 'Win', 'must', 'have', 'Experience', 'using', 'debuggers', 'such', 'as', 'WinDbgExperience', 'on', 'Windows', 'kernel']


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# define vectorizer parameters
jd_forfit = [[jd for jd in jd_content_sample]]
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stemmized, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(jd_forfit)
print(tfidf_matrix.shape)

In [54]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
# Switch to LDA approach
from gensim import corpora, models, similarities


In [None]:
!pip install rake_nltk

In [92]:
from rake_nltk import Rake

In [126]:
# Get keyword only from jd_content using Rake
rake = Rake()
def get_kw_rake(jd_content):
  for jd in jd_content:
    rake.extract_keywords_from_text(jd)
  keywords = rake.get_ranked_phrases()
  return keywords


In [None]:
!pip install keybert

In [135]:
from keybert import KeyBERT

In [143]:
# Get keyword only from jd_content using keyBert
bert = KeyBERT()
def get_kw_bert(jd_content):
    for jd in jd_content:
      keywords = bert.extract_keywords(jd, keyphrase_ngram_range=(3, 5), stop_words="english", top_n=20)
    results = []
    for scored_keywords in keywords:
        for keyword in scored_keywords:
              results.append(keyword)
    return results 

In [None]:
!pip install git+https://github.com/LIAAD/yake

In [146]:
import yake

In [148]:
# Get keyword only from jd_content using YAKE
def get_kw_yake(jd_content):
    for jd in jd_content:
      keywords = yake.KeywordExtractor(lan="en", n=3, windowsSize=3, top=20).extract_keywords(jd)
    results = []
    for scored_keywords in keywords:
        for keyword in scored_keywords:
            if isinstance(keyword, str):
                results.append(keyword) 
    return results 

In [210]:
#keywords = get_kw_rake(jd_content[:1000])
# remove stop words from list of tokenized and stemmed token
#filtered_only = [[word for word in tokenize_and_stemmized(keys)[1]] for keys in keywords]
filtered_only = [[word for word in tokenize_and_stemmized(skills)[1]] for skills in skills_cleaned[:500]]

cleaned_jd = [[word for word in text if word not in stopWords] for text in filtered_only]



In [211]:
# Create a Gensim dictionary from the processced job description
dictionary = corpora.Dictionary(cleaned_jd)

# Remove the extreme vocab based on term frequency
dictionary.filter_extremes(no_below=1,no_above=0.9)

#convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(jd) for jd in cleaned_jd]

In [212]:
print(dictionary)

Dictionary(1779 unique tokens: ['BELOW', 'SEE', 'accessment', 'administration', 'incident']...)


In [None]:
# Create LDA model with 20 different clusters
lda = models.LdaModel(corpus, num_topics=20, 
                            id2word=dictionary, 
                            update_every=5, 
                            chunksize=10000, 
                            passes=100)
lda.show_topics()

In [214]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,1]
for i in topic_words:
    print([str(word[0]) for word in i])
    print()

['-', 'Software', 'data', 'C++', 'MS', 'Technical', 'Experience', 'customers', 'Support', 'software', 'Engineer', 'Windows', 'Pre-Sales', 'engineering', 'Amazon', 'EE', 'Product', 'Customer', 'Linux', 'experience']

['Design', 'data', 'business', 'development', 'FBA', 'analysis', 'Spring', 'product', 'management', 'Hibernate', 'The', 'Amazon', 'recommendations', 'develop', 'including', 'global', 'System', 'project', 'pricing', 'projects']

['delivering', 'including', 'PGS', 'specializes', 'workforce', 'secure', 'reliable', 'rapidly', 'solutions', 'Federal', 'prime', 'marketplace', 'implemented', 'contractors', 'agencies', 'Wi', 'TAD', 'Government', 'MongoDB', 'SCRUM']

['Business', 'Analyst', 'Requirements', 'services', 'Supervision', 'Robotic', 'Office', 'MS', 'Data', 'J2EE', 'System', 'Application', 'IT', 'Marketing', 'NoSQL', 'wireless', 'CRM', 'Voip', 'Engineer', 'restful']

['Ruby', 'Rails', 'JavaScript', 'web', 'On', 'applications', '-', 'Electronics', 'Engineer', 'services', 'Em

  
