In [None]:
#initializing libraries
import numpy as np
import pandas as pd
import nltk
import re
import os

from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the scraped data: the tabular job metadata and the raw description text.
df_jobs = pd.read_csv('Indeed_data_small.csv')

# Descriptions in JD.txt are separated by the literal marker '\n BREAKS HERE'.
# The piece after the final marker is discarded (presumably the file ends with
# a marker, leaving an empty tail -- confirm against the scraper output).
with open('JD.txt') as jd_file:  # context manager closes the file handle
    job_description = jd_file.read().split('\n BREAKS HERE')
job_description = job_description[:-1]
print(len(job_description))
print(type(job_description[0]))

## Tokenizing and Lemmatizing

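Load the stop-word list and the stemmer/lemmatizer from the NLTK library. Stop words are words like "a", "the", or "in" which don't convey significant meaning. Stemming is the process of reducing a word to its root; the functions below use lemmatization instead, which maps each word to its dictionary form.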


In [None]:
# Fetch the English stop-word list that ships with NLTK.
english_stopwords = nltk.corpus.stopwords.words('english')

# Report how many stop words there are and preview the first ten.
print("We use {} stop-words from nltk library.".format(len(english_stopwords)))
print(english_stopwords[:10])

In [None]:
#Tokenization and Lemmatization using stopwords

from nltk.stem.snowball import SnowballStemmer
# NOTE(review): `stemmer` is not used by the two functions below (they
# lemmatize rather than stem); it is kept in case other cells reference it.
stemmer = SnowballStemmer("english")


def tokenization(text):
    """Split ``text`` into lower-cased word tokens with stop words removed.

    The text is split into sentences, then into words. Each word is
    lower-cased *before* the stop-word test, so capitalised stop words such
    as "The" are removed as well (the stop-word list is lower-case). Tokens
    that contain no ASCII letter (numbers, bare punctuation) are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    stopword_set = set(english_stopwords)  # set gives constant-time lookups
    kept_tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            token = word.lower()
            if token in stopword_set:
                continue
            # keep only tokens containing at least one letter
            if re.search('[a-zA-Z]', token):
                kept_tokens.append(token)
    return kept_tokens


def tokenization_and_lemmatization(text):
    """Tokenize ``text`` with :func:`tokenization` and lemmatize every token.

    Because the tokens come from :func:`tokenization` and each token yields
    exactly one lemma, the result is index-aligned with ``tokenization(text)``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One WordNet lemma per kept token, in document order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token) for token in tokenization(text)]

Use our defined functions to analyze (i.e. tokenize and lemmatize) our job descriptions.

In [None]:
# Flat token lists over the whole corpus. Both helpers apply the same filter
# and lemmatization is one-to-one, so position i of docs_lemmatize is the
# lemma of the token at position i of docs_tokenized.
docs_lemmatize = []
docs_tokenized = []
for description in job_description:
    # A str encode()/decode('utf-8') round trip yields an equal string, so the
    # descriptions are passed to the tokenizers directly.
    docs_lemmatize.extend(tokenization_and_lemmatization(description))
    docs_tokenized.extend(tokenization(description))

In [None]:
# Map every lemma to a surface token it was derived from, so that terms can be
# turned back into readable words later. When several tokens share a lemma,
# the last one seen wins.
# The two lists must be index-aligned; fail loudly instead of pairing lemmas
# with the wrong tokens.
assert len(docs_lemmatize) == len(docs_tokenized), \
    "docs_lemmatize and docs_tokenized must have the same length"
vocab_frame_dict = dict(zip(docs_lemmatize, docs_tokenized))
print(vocab_frame_dict)

Create a mapping from lemmatized words to the original tokenized words for result interpretation.

# TF-IDF

In [None]:
# Vectorizer configuration: unigrams only, custom tokenizer/lemmatizer, and
# document-frequency bounds of 0.2 (min_df) and 0.8 (max_df).
tfidf_model = TfidfVectorizer(
    tokenizer=tokenization_and_lemmatization,
    stop_words='english',
    ngram_range=(1, 1),
    use_idf=True,
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
)

# Learn the vocabulary and IDF weights from the job descriptions, then
# transform them into the document-term matrix.
tfidf_matrix = tfidf_model.fit_transform(job_description)

n_postings, n_terms = tfidf_matrix.shape
print(f"In total, there are {n_postings} job postings and {n_terms} terms.")

In [None]:
# Display the vectorizer's parameters as the cell output.
tfidf_model.get_params()

Save the terms identified by TF-IDF.

In [None]:
# Feature names (terms) of the fitted vectorizer; later cells index this array
# with column positions taken from the cluster centroids.
tf_selected_words = tfidf_model.get_feature_names_out()
print(tf_selected_words)

# K-means Clustering

In [None]:
#fitting clusters
from sklearn.cluster import KMeans

num_clusters = 4
# K-means starts from randomly chosen centroids; fixing the seed keeps the
# cluster ids and memberships stable across Restart & Run All.
km = KMeans(n_clusters=num_clusters, random_state=42)
km.fit(tfidf_matrix)
# One cluster id per row of tfidf_matrix, as a plain Python list.
clusters = km.labels_.tolist()

In [None]:
# Sanity check: number of cluster labels, followed by the labels themselves.
print(len(clusters))
print(clusters)

## Check K-means results

In [None]:
# Attach every posting's K-means cluster id to the jobs table and save it.
# Assumes row i of df_jobs corresponds to job_description[i] -- TODO confirm.
# pd.Series raises ValueError if the number of labels differs from the number
# of rows, so a length mismatch cannot pass silently.
df_jobs['cluster'] = pd.Series(np.array(clusters), index=df_jobs.index)
df_jobs.to_csv('kmeans_result.csv', encoding='utf-8')

In [None]:
# Encode the 'Search' column as integer codes: each listed string is replaced
# by its number (0, 1, 2). The nested dict limits the replacement to the
# 'Search' column. A later plotting cell uses these codes as list indices and
# colour keys, so label lists must follow the same order.
# NOTE(review): values of 'Search' not listed here would be left unchanged --
# confirm the column contains only these three strings.
cleanup_nums = {"Search":     {'Data+scientist': 0, 'Machine+learning engineer': 1, 'Data+analyst': 2},}
df_jobs.replace(cleanup_nums, inplace=True)
df_jobs.head()

In [None]:
# Number of postings assigned to each cluster, shown as a one-column table.
print("Number of jobs included in each cluster:")
df_jobs['cluster'].value_counts().to_frame()

In [None]:
# km.cluster_centers_ holds one row per cluster and one column per TF-IDF term;
# a larger value means the term matters more to that centroid. Sorting each row
# in descending order puts the most characteristic terms first.

print("<Document clustering result by K-means>")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

top_n_terms = 30  # number of leading centroid terms reported per cluster

cluster_keywords_summary = {}
for i in range(num_clusters):
    print("Cluster " + str(i) + " words:")
    cluster_keywords_summary[i] = []
    for term_idx in order_centroids[i, :top_n_terms]:
        term = str(tf_selected_words[term_idx])
        # Translate the lemma back to a surface token. The vectorizer
        # preprocesses text before tokenizing, so a term may be missing from
        # the mapping; fall back to the term itself rather than raising.
        keyword = vocab_frame_dict.get(term, term)
        cluster_keywords_summary[i].append(keyword)
        print(keyword + ", ")
    print()  # blank line; the bare name `print` (Python 2 style) did nothing

    cluster_jobs = df_jobs.loc[df_jobs.cluster == i, 'Title'].values.tolist()
    print("Cluster " + str(i) + " titles (" + str(len(cluster_jobs)) + " jobs): " )

In [None]:
# Sanity check: number of entries in the term ranking of cluster 1.
print(len(order_centroids[1,:]))

## Plot result

In [None]:
# Fit a 4-component PCA on the dense TF-IDF matrix and project the postings.
pca = decomposition.PCA(n_components=4)
tfidf_matrix_np = tfidf_matrix.toarray()
X = pca.fit(tfidf_matrix_np).transform(tfidf_matrix_np)

# Only the first two components serve as plot coordinates.
xs = X[:, 0]
ys = X[:, 1]

# One matplotlib colour code per cluster id.
cluster_colors = dict(enumerate(['g', 'b', 'r', 'y', 'k', 'm']))
# Legend text per cluster: its top keywords joined into one string.
cluster_names = {
    cluster_id: ", ".join(cluster_keywords_summary[cluster_id])
    for cluster_id in range(num_clusters)
}

In [None]:
%matplotlib inline 

#create data frame with PCA cluster results
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 
groups = df.groupby(clusters)

# set up plot
fig, ax = plt.subplots(figsize=(16, 10))
#Set color for each cluster/group
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=8, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')

ax.legend(numpoints=1,loc=4)  #show legend with only 1 point, position is right bottom.

plt.show() #show the plot

In [None]:
%matplotlib inline 
search_names = ['Machine+learning engineer', 'Data+scientist', 'Data+analyst']
#create data frame with PCA cluster results
search_num = df_jobs['Search'].tolist()
df_indeed = pd.DataFrame(dict(x=xs, y=ys, label=search_num)) 
groups2 = df_indeed.groupby(search_num)

# set up plot
cluster_colors2 = {0: 'g', 1: 'b', 2: 'r', 3: 'y', 4:'k',5:'m'}

fig, ax = plt.subplots(figsize=(16, 10))
#Set color for each cluster/group
for name, group in groups2:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=8, 
            label=search_names[name], color=cluster_colors2[name], 
            mec='none')

ax.legend(numpoints=1,loc=4)  #show legend with only 1 point, position is right bottom.

plt.show() #show the plot