<a href="https://colab.research.google.com/github/Mai1902/landing/blob/main/skills_cluster_kmean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
from sklearn import feature_extraction
from nltk.stem import PorterStemmer


In [45]:
# Load job description file
df = pd.read_csv('/dice_com_techjob_post.csv')
im_df = pd.DataFrame(df, columns = ['company', 'employment', 'jobdescription', 'jobtitle', 'skills'])
data_dict = im_df.to_dict()
jd_content = [x for x in data_dict['jobdescription'].values()]
skills = [x for x in data_dict['skills'].values()]
skills_cleaned = []
for skill in skills:
  skill = str(skill)
  skill = skill.replace(',', '')
  skills_cleaned.append(skill)
jd_content_sample = jd_content[:10000]


['SEE BELOW', 'linux/unix network monitoring incident response systems administration security accessment', 'Enterprise Solutions Architecture business inteligence reports reporting', 'Please see job description', 'Configuration Management Developer Linux Management Process Engineering VMware', 'FICO AR AP Asset Management HAHA', 'Cisco DNS HTTP Networking Network Engineer Security Video VPN Wireless', '.NET  C# MVC RESTful web services HTTP AWS Azure OOP', 'C++ Developer Development JavaScript User Interface', '(See Job Description)', 'Openstack', 'Unix IAM Scripting knowledge OIM Windows Linux', 'Java OSS', 'Consulting Project Sales Sales Engineer', 'mobile device', '(See Job Description)', 'QA', 'Lawson Supply Chain', 'Desktop Hardware Network Software Systems Windows', '(See Job Description)', 'Analysis Analyst Application Business Analyst Business Requirements Excel IT PowerPoint Project', '(See Job Description)', 'System Testing Load/Performance Testing FIX.FAST C/C++ Java Window

In [6]:
from nltk.corpus import stopwords

In [None]:
#initialize stop words and stemmer for text processing
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
nltk.download('punkt')

In [56]:
'''Method to tokenize and stemmized the text'''
def tokenize_and_stemmized(text):
  tokens = []

  #Add all tokenized word into list of token
  for sent in nltk.sent_tokenize(text):
    for word in nltk.word_tokenize(sent):
      tokens.append(word)
  filtered_tokens = []

  # Filter out token that only contain letter, + sign,underscore, and number
  for token in tokens:
    if re.search("^[A-Za-z0-9+_-]*$", token):
      filtered_tokens.append(token)
  stems = [stemmer.stem(t) for t in filtered_tokens]
  return ' '.join(stems)

def filter_token(tokens):
   filtered_tokens = []
   for token in tokens:
    if re.search("^[A-Za-z0-9+_-]*$", token):
      filtered_tokens.append(token)
   return filtered_tokens



In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# prepare a data frame of only skills and job title to train
col = ['jobtitle', 'skills']
data_eval = im_df[col]

# Drop rows with missing data
data_eval.dropna(subset=['skills'], inplace=True)

data_forfit = data_eval['skills']
print(data_eval['skills'].head)


In [None]:
# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(sublinear_tf = True, max_df=0.9, max_features=200000,
                                 min_df=0.1, use_idf=True, stop_words= 'english',
                                 tokenizer = tokenize_and_stemmized, ngram_range=(1,2))

tfidf_matrix = tfidf_vectorizer.fit_transform(data_forfit)
print(tfidf_matrix.shape)

In [65]:
from sklearn.cluster import KMeans

In [74]:
# generate k-cluster

num_clusters = 20

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.predict(tfidf_matrix)

In [100]:
#add cluster name into the df
data_eval["ClusterName"] = clusters
data_eval.head(20)
# get top skills per cluster:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = tfidf_vectorizer.get_feature_names()
for i in range(num_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :num_clusters]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))


Cluster 0: t e c c t e c h e r h   r r c t   h t o c t u n l u r u t   t
Cluster 1: s t e s t r m c u   s m   t e n o t   d s e e s r   l   t
Cluster 2: g n n e n g e n g   e n   t s r o d e s c d e l m u p
Cluster 3: d   n d d n   n o r e c t l o d s m   o p b u r o r  
Cluster 4: l y n l y s y s   n t l n s t s   t   u s e u r b c o d
Cluster 5: r k w o n e t w k e t w k   e r o n o r t c s   n u m   s
Cluster 6: q l s q q s l l     s e r t o c e r m d p v b n c  
Cluster 7: e o r t n c s l p d u m f t   r   h n   d     c   s
Cluster 8: j j v v     j v e r s n o l c p g t   s s p d b e  
Cluster 9: w o w d o e o d r n d n s   w t c v e r m r e r   l p
Cluster 10: j e m n p r r o n g   m c t g m e c r c p j g   n o t   e t
Cluster 11: b   p t r p e s s c   j c r e d e j   d s b r e   o p t d s e
Cluster 12: w e e r v b   b s e r v r   w w s e   s c j c     j o t l
Cluster 13: x   x l n n u l   l e r t s p o c u n   u h d m   s
Cluster 14: p s p s p   o c h t r m e   p   s n l b d   c 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [94]:
# return the job title in each cluster:
dict_job = {}

for ind in data_eval.index:
  i = data_eval['ClusterName'][ind]
  if i not in dict_job.keys():
    dict_job[i] = []
    dict_job[i].append(data_eval['jobtitle'][ind])
  else:
    dict_job[i].append(data_eval['jobtitle'][ind])

for key in dict_job.keys():
  print(dict_job[key][:20])
  print('\n')



['AUTOMATION TEST ENGINEER', 'Sr. Systems Test Engineer (PERM)', 'Business Systems Analyst', 'Network Administrator - Cisco UCS, VMware, Citrix, Windows', 'VM Ware Consultant - VCP Certified', 'Support Engineer - Cloud Computing', 'Systems Administrator - data center operations w/windows & linux', 'Information Systems/Support Engineer- Direct Hire', 'Business Analyst', 'Adobe AME Architect', 'Mid-level Windows DevOps Engineer', 'Informatica developer', 'Systems Administrator - Windows, PowerShell, Azure', 'DATABASE CONSULTANT', 'Network Engineer', 'Data Power Admin', 'BigData Architect', 'Systems Administrator', 'Network Support / Server Support / Technical Support', 'Systems Lead']


['Information Security Engineer', 'Network Engineer', 'Messaging Administrator', 'Domestic Outsourcing Business Development Executive', 'Domestic Outsourcing Business Development Executive', 'Oracle Business Systems Analyst', 'Linux Engineer', 'Capacity Planning Engineer - 11350', 'Director of IS, Infrast

In [None]:
!pip install gensim

In [96]:
# Switch to LDA approach
from gensim import corpora, models, similarities


In [None]:
!pip install rake_nltk

In [98]:
from rake_nltk import Rake

In [99]:
# Get keyword only from jd_content using Rake
rake = Rake()
def get_kw_rake(jd_content):
  for jd in jd_content:
    rake.extract_keywords_from_text(jd)
  keywords = rake.get_ranked_phrases()
  return keywords


In [None]:
!pip install keybert

In [None]:
from keybert import KeyBERT

In [None]:
# Get keyword only from jd_content using keyBert
bert = KeyBERT()
def get_kw_bert(jd_content):
    for jd in jd_content:
      keywords = bert.extract_keywords(jd, keyphrase_ngram_range=(3, 5), stop_words="english", top_n=20)
    results = []
    for scored_keywords in keywords:
        for keyword in scored_keywords:
              results.append(keyword)
    return results 

In [None]:
!pip install git+https://github.com/LIAAD/yake

In [None]:
import yake

In [None]:
# Get keyword only from jd_content using YAKE
def get_kw_yake(jd_content):
    for jd in jd_content:
      keywords = yake.KeywordExtractor(lan="en", n=3, windowsSize=3, top=20).extract_keywords(jd)
    results = []
    for scored_keywords in keywords:
        for keyword in scored_keywords:
            if isinstance(keyword, str):
                results.append(keyword) 
    return results 

In [None]:
#keywords = get_kw_rake(jd_content[:1000])
# remove stop words from list of tokenized and stemmed token
#filtered_only = [[word for word in tokenize_and_stemmized(keys)[1]] for keys in keywords]
filtered_only = [[word for word in tokenize_and_stemmized(skills)[1]] for skills in skills_cleaned[:500]]

cleaned_jd = [[word for word in text if word not in stopWords] for text in filtered_only]



In [None]:
# Create a Gensim dictionary from the processced job description
dictionary = corpora.Dictionary(cleaned_jd)

# Remove the extreme vocab based on term frequency
dictionary.filter_extremes(no_below=1,no_above=0.9)

#convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(jd) for jd in cleaned_jd]

In [None]:
print(dictionary)

In [None]:
# Create LDA model with 20 different clusters
lda = models.LdaModel(corpus, num_topics=20, 
                            id2word=dictionary, 
                            update_every=5, 
                            chunksize=10000, 
                            passes=100)
lda.show_topics()

In [None]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,1]
for i in topic_words:
    print([str(word[0]) for word in i])
    print()

['-', 'Software', 'data', 'C++', 'MS', 'Technical', 'Experience', 'customers', 'Support', 'software', 'Engineer', 'Windows', 'Pre-Sales', 'engineering', 'Amazon', 'EE', 'Product', 'Customer', 'Linux', 'experience']

['Design', 'data', 'business', 'development', 'FBA', 'analysis', 'Spring', 'product', 'management', 'Hibernate', 'The', 'Amazon', 'recommendations', 'develop', 'including', 'global', 'System', 'project', 'pricing', 'projects']

['delivering', 'including', 'PGS', 'specializes', 'workforce', 'secure', 'reliable', 'rapidly', 'solutions', 'Federal', 'prime', 'marketplace', 'implemented', 'contractors', 'agencies', 'Wi', 'TAD', 'Government', 'MongoDB', 'SCRUM']

['Business', 'Analyst', 'Requirements', 'services', 'Supervision', 'Robotic', 'Office', 'MS', 'Data', 'J2EE', 'System', 'Application', 'IT', 'Marketing', 'NoSQL', 'wireless', 'CRM', 'Voip', 'Engineer', 'restful']

['Ruby', 'Rails', 'JavaScript', 'web', 'On', 'applications', '-', 'Electronics', 'Engineer', 'services', 'Em

  
