# Environment

In [3]:
## use instance ml.m5.large for SentenceTransformers
!pip install -U sentence-transformers
!pip install kneed

[0m

In [2]:
import json
import pickle
import warnings
from collections import defaultdict
from functools import reduce

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
# from kneed import KneeLocator
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Use the 'iframe' renderer as plotly's default.
pio.renderers.default='iframe'
# Silence every Python warning for the session (this also hides deprecation notices).
warnings.filterwarnings(action = 'ignore')

# Setup

In [3]:
def setup():
    file_path = "Top 50 skills (By Year and ONET _ with NAICS filter).xlsx"
    new_columns = ['onet_parent','onet_child']
    df_1 = pd.read_excel(file_path, sheet_name='2010')
    df_1[new_columns] = df_1['onet'].str.split('-', 1, expand=True)
    df_2 = pd.read_excel(file_path, sheet_name='2011')
    df_2[new_columns] = df_2['onet'].str.split('-', 1, expand=True)
    df_3 = pd.read_excel(file_path, sheet_name='2012')
    df_3[new_columns] = df_3['onet'].str.split('-', 1, expand=True)
    df_4 = pd.read_excel(file_path, sheet_name='2013')
    df_4[new_columns] = df_4['onet'].str.split('-', 1, expand=True)
    df_5 = pd.read_excel(file_path, sheet_name='2014')
    df_5[new_columns] = df_5['onet'].str.split('-', 1, expand=True)
    df_6 = pd.read_excel(file_path, sheet_name='2015')
    df_6[new_columns] = df_6['onet'].str.split('-', 1, expand=True)
    df_7 = pd.read_excel(file_path, sheet_name='2016')
    df_7[new_columns] = df_7['onet'].str.split('-', 1, expand=True)
    df_8 = pd.read_excel(file_path, sheet_name='2017')
    df_8[new_columns] = df_8['onet'].str.split('-', 1, expand=True)
    df_9 = pd.read_excel(file_path, sheet_name='2018')
    df_9[new_columns] = df_9['onet'].str.split('-', 1, expand=True)
    df_10 = pd.read_excel(file_path, sheet_name='2019')
    df_10[new_columns] = df_10['onet'].str.split('-', 1, expand=True)
    df_11 = pd.read_excel(file_path, sheet_name='2020')
    df_11[new_columns] = df_11['onet'].str.split('-', 1, expand=True)
    df_12 = pd.read_excel(file_path, sheet_name='2021')
    df_12[new_columns] = df_12['onet'].str.split('-', 1, expand=True)
    return df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_12

In [6]:
def find_intersection(frames=None) -> list:
    """Return the ONET codes that occur in every given frame.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame, optional
        Frames with an ``onet`` column. When omitted, the twelve module-level
        yearly frames ``df_2010`` .. ``df_2021`` are used.

    Returns
    -------
    list
        Codes common to all frames, as produced by folding
        ``np.intersect1d`` over the ``onet`` columns.
    """
    if frames is None:
        frames = [df_2010, df_2011, df_2012, df_2013, df_2014, df_2015,
                  df_2016, df_2017, df_2018, df_2019, df_2020, df_2021]
    return list(reduce(np.intersect1d, [frame['onet'] for frame in frames]))

In [7]:
def find_union(frames=None) -> list:
    """Return every ONET code that occurs in at least one of the given frames.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame, optional
        Frames with an ``onet`` column. When omitted, the twelve module-level
        yearly frames ``df_2010`` .. ``df_2021`` are used.

    Returns
    -------
    list
        Distinct codes across all frames, as produced by folding
        ``np.union1d`` over the ``onet`` columns.
    """
    if frames is None:
        frames = [df_2010, df_2011, df_2012, df_2013, df_2014, df_2015,
                  df_2016, df_2017, df_2018, df_2019, df_2020, df_2021]
    return list(reduce(np.union1d, [frame['onet'] for frame in frames]))

In [4]:
# Load all yearly sheets once, then expose them three ways: as an ordered list,
# as individually named frames, and as a mapping keyed by year.
dfs = list(setup())
(df_2010, df_2011, df_2012, df_2013, df_2014, df_2015,
 df_2016, df_2017, df_2018, df_2019, df_2020, df_2021) = dfs

years = list(range(2010, 2022))
yearly_dfs = dict(zip(years, dfs))

# EDA

In [9]:
# Size of the union of ONET codes over all years.
print(f"Total number of ONET codes: {len(find_union())}")

Total number of ONET codes: 155


In [10]:
# ONET codes that are present in every yearly sheet.
intersection = find_intersection()
print(f"Number of ONET codes available in all years: {len(intersection)}")
print(intersection)
# in intersection: 15-1199.01

Number of ONET codes available in all years: 99
['13-1022.00', '13-1023.00', '13-1031.01', '13-1041.07', '13-1051.00', '13-1071.00', '13-1081.00', '13-1081.02', '13-1111.00', '13-1121.00', '13-1141.00', '13-1151.00', '13-1161.00', '13-1199.00', '13-1199.01', '13-1199.02', '13-1199.04', '13-1199.06', '13-2011.01', '13-2011.02', '13-2041.00', '13-2051.00', '13-2052.00', '13-2053.00', '13-2071.00', '13-2072.00', '13-2081.00', '13-2099.01', '13-2099.02', '13-2099.03', '13-2099.04', '15-1111.00', '15-1121.00', '15-1122.00', '15-1131.00', '15-1132.00', '15-1133.00', '15-1134.00', '15-1141.00', '15-1142.00', '15-1143.00', '15-1143.01', '15-1151.00', '15-1152.00', '15-1199.00', '15-1199.01', '15-1199.02', '15-1199.03', '15-1199.04', '15-1199.06', '15-1199.07', '15-1199.08', '15-1199.09', '15-1199.10', '15-1199.11', '15-1199.12', '15-2031.00', '15-2041.00', '15-2041.01', '27-1011.00', '27-1013.00', '27-1014.00', '27-1021.00', '27-1022.00', '27-1024.00', '27-1025.00', '27-1026.00', '27-1029.00',

In [11]:
# ONET codes that are missing from at least one year.
union = find_union()
print(set(union).difference(set(intersection)))
# not in intersection: 13-2031.00

{'13-1081.01', '27-2041.04', '27-2012.04', '41-9091.00', '27-4013.00', '27-1012.00', '41-3031.03', '15-2021.00', '13-1032.00', '27-2032.00', '13-1021.00', '41-9012.00', '13-1011.00', '41-2012.00', '13-2021.01', '13-1131.00', '13-1041.02', '27-2042.01', '27-2041.01', '41-4011.07', '41-2021.00', '27-2011.00', '27-2012.05', '13-2082.00', '15-1199.05', '27-2012.02', '41-9021.00', '27-2099.00', '13-1031.02', '27-2021.00', '27-2042.02', '13-1041.06', '13-2071.01', '41-3031.01', '41-3041.00', '13-1199.03', '27-3011.00', '27-3012.00', '15-1121.01', '41-9041.00', '13-1199.05', '27-1023.00', '15-2011.00', '41-3099.01', '13-2061.00', '15-2041.02', '13-2021.02', '41-2022.00', '13-1075.00', '27-1027.00', '27-3021.00', '13-1041.03', '13-1041.04', '13-1041.01', '13-2031.00', '27-2031.00'}


# CASE 1: ONET Codes 15-*

In [12]:
!rm -rf '15'
!mkdir '15'

K = range(2,30)
onet_parent = '15'
title = 'Trending analysis for the ONET Code 15-*'
onet_model = onet_parent+'/'+onet_parent+'_model.pkl'
# language_model = 'all-MiniLM-L6-v2'
language_model = 'all-mpnet-base-v2'
onet_k_value = onet_parent+'/'+onet_parent+'_k_value.png'
onet_cluster_data = onet_parent+'/'+onet_parent+'_cluster_data'
onet_bar_chart = onet_parent+'/'+onet_parent+'_bar_charts.html'
onet_line_chart = onet_parent+'/'+onet_parent+'_line_chart.html'
onet_bubble_chart = onet_parent+'/'+onet_parent+'_bubble_chart.html'

## Create Embeddings

In [13]:
## Generate list of skills
# Unique skills of the selected ONET family, one list per yearly frame (in `dfs` order).
skills_per_year = [
    frame.loc[frame['onet_parent'] == onet_parent, 'skill'].unique().tolist()
    for frame in dfs
]
# Union over all years. Sorted so the row order of the embeddings built from it
# is the same on every run instead of following arbitrary set order.
skills = sorted({skill for year_skills in skills_per_year for skill in year_skills})

In [14]:
## Create Embeddings
model = SentenceTransformer(language_model)
# Embeddings of the pooled skill list ...
skills_embeddings = model.encode(skills)
# ... and of each year's skill list, in the same order as skills_per_year.
skills_per_year_embeddings = [model.encode(year_skills) for year_skills in skills_per_year]

## k-means clustering

### Fit a Model

In [15]:
# Pick the number of clusters with the highest silhouette score over the candidates in K.
# distortions = []
# Reset the selection first so a value left over from an earlier run cannot be
# reported; silhouette scores lie in [-1, 1], so starting from -inf lets even a
# negative best score select its k.
number_of_clusters = None
score = float('-inf')
for k in K:
    # Fixed seed: the fit, and therefore the selected k, is reproducible.
    model = KMeans(n_clusters=k, init='k-means++', random_state=0)
    labels = model.fit_predict(skills_embeddings)
#     distortions.append(model.inertia_)
    new_score = silhouette_score(skills_embeddings, labels, metric='euclidean')
    if new_score > score:
        number_of_clusters = k
        score = new_score

print("Ideal K value based on silhouette score is {} with a score of {}".format(number_of_clusters, score))
# Alternative (disabled): elbow method on the distortions via KneeLocator.
# kn = KneeLocator(list(K), distortions, S=1.0, curve='convex', direction='decreasing')
# number_of_clusters = kn.knee
# print("Ideal K value based on elbow method = {}".format(number_of_clusters))
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.plot(K, distortions, 'bx-')
# plt.vlines(number_of_clusters, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
# plt.savefig(onet_k_value, bbox_inches='tight')

Ideal K value based on silhouette score is 24 with a score of 0.04645084962248802


### Metrics

In [16]:
# Fit the final model with the selected number of clusters. The fixed seed makes
# the cluster assignment, and the score printed below, reproducible.
model = KMeans(n_clusters=number_of_clusters, random_state=0)
cluster_assignment = model.fit_predict(skills_embeddings)

# Group the skill strings by the cluster id they were assigned to.
clustered_skills = {}
for skill, cluster_id in zip(skills, cluster_assignment):
    clustered_skills.setdefault(cluster_id, []).append(skill)

score = silhouette_score(skills_embeddings, model.labels_, metric='euclidean')
print('Silhouette Score: %.3f' % score)

Silhouette Score: 0.041


### Save Output

In [17]:
# Write the silhouette score and the skills of every cluster as real JSON.
# (The previous hand-built string used single quotes and bare integer keys,
# so the resulting .json file could not be parsed by a JSON reader.)
cluster_data = {'silhouette_score': float(score)}
for key in sorted(clustered_skills):
    # JSON object keys must be strings; the cluster ids are integers.
    cluster_data[str(key)] = clustered_skills[key]
with open(onet_cluster_data+".json", 'w') as f:
    json.dump(cluster_data, f)

In [18]:
# Persist the fitted model, then load it back from the file just written.
with open(onet_model, "wb") as f:
    f.write(pickle.dumps(model))
with open(onet_model, "rb") as f:
    model = pickle.loads(f.read())

### Predict each year individually

In [19]:
# Assign each year's skills to the fitted clusters and sum the `count` column per cluster.
# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for year, year_skills, year_embeddings in zip(years, skills_per_year, skills_per_year_embeddings):
    print("Predicting for year {}".format(year))
    cluster_assignment = model.predict(year_embeddings)

    clustered_skills = {}
    for skill, cluster_id in zip(year_skills, cluster_assignment):
        clustered_skills.setdefault(cluster_id, []).append(skill)

    # Only rows of the ONET family under analysis contribute to the counts, so the
    # same skill listed under another family does not inflate this family's total.
    year_df = yearly_dfs[year]
    family_df = year_df[year_df['onet_parent'] == onet_parent]

    for key in sorted(clustered_skills):
        count = family_df.loc[family_df['skill'].isin(clustered_skills[key]), 'count'].sum()
        rows.append({'year': year, 'skills': ', '.join(clustered_skills[key]), 'cluster': key,
                     # 1-based cluster position used as the plotting coordinate.
                     'cluster_centers': key+1,
                     # First two coordinates of THIS cluster's centre (one scalar per row).
                     'cluster_centers_x': model.cluster_centers_[key, 0],
                     'cluster_centers_y': model.cluster_centers_[key, 1],
                     'cluster_skill_count': count})

df = pd.DataFrame(rows, columns=['year', 'skills', 'cluster',
                                 'cluster_centers', 'cluster_centers_x', 'cluster_centers_y',
                                 'cluster_skill_count'])
df = df.astype({'year': int, 'cluster': int, 'cluster_skill_count': int})

Predicting for year 2010
Predicting for year 2011
Predicting for year 2012
Predicting for year 2013
Predicting for year 2014
Predicting for year 2015
Predicting for year 2016
Predicting for year 2017
Predicting for year 2018
Predicting for year 2019
Predicting for year 2020
Predicting for year 2021


In [20]:
# index=False: the row index carries no information and would otherwise come
# back as an extra 'Unnamed: 0' column when the file is read again.
df.to_csv(onet_cluster_data+".csv", index=False)

### Visualizations

In [6]:
# Reload the per-year cluster table from disk.
df = pd.read_csv(f"{onet_cluster_data}.csv")

In [None]:
# Animated bubble chart: clusters sit on the diagonal (x and y are both the
# cluster position), bubble size is the skill count, one frame per year.
fig = px.scatter(
    data_frame=df,
    title=title,
    x='cluster_centers',
    y='cluster_centers',
    color='cluster',
    size='cluster_skill_count',
    animation_frame='year',
    animation_group='cluster',
    hover_name="cluster",
    hover_data=dict(year=False, cluster_centers=False, cluster=False, skills=True),
    labels=dict(cluster_skill_count='Count', skills='Skills', cluster_centers="Cluster"),
    log_x=False,
    range_x=[0, number_of_clusters+1],
    range_y=[0, number_of_clusters+1],
)
fig.update(layout_coloraxis_showscale=True)
# fig.show()
fig.write_html(onet_bubble_chart)

In [23]:
# One bar chart per year: cluster id on x, summed skill count on y.
labels = {'cluster_skill_count': 'Count', 'cluster': "Cluster"}
figures = [
    px.bar(df[df['year'] == year], x='cluster', y='cluster_skill_count', title=str(year), labels=labels)
    for year in range(2010, 2022)
]

# The yearly charts go on a 6x2 grid, filled row by row.
fig = make_subplots(rows=6, cols=2, subplot_titles=[str(year) for year in range(2010, 2022)])

for i, figure in enumerate(figures):
    # divmod turns the flat index into 0-based (row, col); plotly grid cells are 1-based.
    r, c = divmod(i, 2)
    for trace in figure.data:
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=r + 1, col=c + 1)

fig.update_layout(height=1500, width=1500, title_text=title)
fig.update_xaxes(tickmode='linear')
# fig.show()
fig.write_html(onet_bar_chart)

In [24]:
# Line chart of the skill count per cluster over the years, one line per cluster.
labels = dict(cluster_skill_count='Count', cluster="Cluster")

fig = (
    px.line(df, x='year', y='cluster_skill_count', color='cluster', labels=labels)
    .update_layout(height=500, width=1500, title_text=title)
    .update_xaxes(tickmode='linear')
)
fig.write_html(onet_line_chart)

# CASE 2: ONET Codes 13-*

In [25]:
!rm -rf '13'
!mkdir '13'

K = range(2,30)
onet_parent = '13'
title = 'Trending analysis for the ONET Code 13-*'
onet_model = onet_parent+'/'+onet_parent+'_model.pkl'
# language_model = 'all-MiniLM-L6-v2'
language_model = 'all-mpnet-base-v2'
onet_k_value = onet_parent+'/'+onet_parent+'_k_value.png'
onet_cluster_data = onet_parent+'/'+onet_parent+'_cluster_data'
onet_bar_chart = onet_parent+'/'+onet_parent+'_bar_charts.html'
onet_line_chart = onet_parent+'/'+onet_parent+'_line_chart.html'
onet_bubble_chart = onet_parent+'/'+onet_parent+'_bubble_chart.html'

## Create Embeddings

In [26]:
## Generate list of skills
# Unique skills of the selected ONET family, one list per yearly frame (in `dfs` order).
skills_per_year = [
    frame.loc[frame['onet_parent'] == onet_parent, 'skill'].unique().tolist()
    for frame in dfs
]
# Union over all years. Sorted so the row order of the embeddings built from it
# is the same on every run instead of following arbitrary set order.
skills = sorted({skill for year_skills in skills_per_year for skill in year_skills})

In [27]:
## Create Embeddings
model = SentenceTransformer(language_model)
# Embeddings of the pooled skill list ...
skills_embeddings = model.encode(skills)
# ... and of each year's skill list, in the same order as skills_per_year.
skills_per_year_embeddings = [model.encode(year_skills) for year_skills in skills_per_year]

## k-means clustering

### Fit a Model

In [28]:
# Pick the number of clusters with the highest silhouette score over the candidates in K.
# distortions = []
# Reset the selection first so a value left over from an earlier run cannot be
# reported; silhouette scores lie in [-1, 1], so starting from -inf lets even a
# negative best score select its k.
number_of_clusters = None
score = float('-inf')
for k in K:
    # Fixed seed: the fit, and therefore the selected k, is reproducible.
    model = KMeans(n_clusters=k, init='k-means++', random_state=0)
    labels = model.fit_predict(skills_embeddings)
#     distortions.append(model.inertia_)
    new_score = silhouette_score(skills_embeddings, labels, metric='euclidean')
    if new_score > score:
        number_of_clusters = k
        score = new_score

print("Ideal K value based on silhouette score is {} with a score of {}".format(number_of_clusters, score))
# Alternative (disabled): elbow method on the distortions via KneeLocator.
# kn = KneeLocator(list(K), distortions, S=1.0, curve='convex', direction='decreasing')
# number_of_clusters = kn.knee
# print("Ideal K value based on elbow method = {}".format(number_of_clusters))
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.plot(K, distortions, 'bx-')
# plt.vlines(number_of_clusters, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
# plt.savefig(onet_k_value, bbox_inches='tight')

Ideal K value based on silhouette score is 24 with a score of 0.04817359894514084


### Metrics

In [29]:
# Fit the final model with the selected number of clusters. The fixed seed makes
# the cluster assignment, and the score printed below, reproducible.
model = KMeans(n_clusters=number_of_clusters, random_state=0)
cluster_assignment = model.fit_predict(skills_embeddings)

# Group the skill strings by the cluster id they were assigned to.
clustered_skills = {}
for skill, cluster_id in zip(skills, cluster_assignment):
    clustered_skills.setdefault(cluster_id, []).append(skill)

score = silhouette_score(skills_embeddings, model.labels_, metric='euclidean')
print('Silhouette Score: %.3f' % score)

Silhouette Score: 0.046


### Save Output

In [30]:
# Write the silhouette score and the skills of every cluster as real JSON.
# (The previous hand-built string used single quotes and bare integer keys,
# so the resulting .json file could not be parsed by a JSON reader.)
cluster_data = {'silhouette_score': float(score)}
for key in sorted(clustered_skills):
    # JSON object keys must be strings; the cluster ids are integers.
    cluster_data[str(key)] = clustered_skills[key]
with open(onet_cluster_data+".json", 'w') as f:
    json.dump(cluster_data, f)

In [31]:
# Persist the fitted model, then load it back from the file just written.
with open(onet_model, "wb") as f:
    f.write(pickle.dumps(model))
with open(onet_model, "rb") as f:
    model = pickle.loads(f.read())

### Predict each year individually

In [32]:
# Assign each year's skills to the fitted clusters and sum the `count` column per cluster.
# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for year, year_skills, year_embeddings in zip(years, skills_per_year, skills_per_year_embeddings):
    print("Predicting for year {}".format(year))
    cluster_assignment = model.predict(year_embeddings)

    clustered_skills = {}
    for skill, cluster_id in zip(year_skills, cluster_assignment):
        clustered_skills.setdefault(cluster_id, []).append(skill)

    # Only rows of the ONET family under analysis contribute to the counts, so the
    # same skill listed under another family does not inflate this family's total.
    year_df = yearly_dfs[year]
    family_df = year_df[year_df['onet_parent'] == onet_parent]

    for key in sorted(clustered_skills):
        count = family_df.loc[family_df['skill'].isin(clustered_skills[key]), 'count'].sum()
        rows.append({'year': year, 'skills': ', '.join(clustered_skills[key]), 'cluster': key,
                     # 1-based cluster position used as the plotting coordinate.
                     'cluster_centers': key+1,
                     # First two coordinates of THIS cluster's centre (one scalar per row).
                     'cluster_centers_x': model.cluster_centers_[key, 0],
                     'cluster_centers_y': model.cluster_centers_[key, 1],
                     'cluster_skill_count': count})

df = pd.DataFrame(rows, columns=['year', 'skills', 'cluster',
                                 'cluster_centers', 'cluster_centers_x', 'cluster_centers_y',
                                 'cluster_skill_count'])
df = df.astype({'year': int, 'cluster': int, 'cluster_skill_count': int})

Predicting for year 2010
Predicting for year 2011
Predicting for year 2012
Predicting for year 2013
Predicting for year 2014
Predicting for year 2015
Predicting for year 2016
Predicting for year 2017
Predicting for year 2018
Predicting for year 2019
Predicting for year 2020
Predicting for year 2021


In [33]:
# index=False: the row index carries no information and would otherwise come
# back as an extra 'Unnamed: 0' column when the file is read again.
df.to_csv(onet_cluster_data+".csv", index=False)

### Visualizations

In [8]:
# Reload the per-year cluster table from disk.
df = pd.read_csv(f"{onet_cluster_data}.csv")

In [9]:
# Animated bubble chart: clusters sit on the diagonal (x and y are both the
# cluster position), bubble size is the skill count, one frame per year.
fig = px.scatter(
    data_frame=df,
    title=title,
    x='cluster_centers',
    y='cluster_centers',
    color='cluster',
    size='cluster_skill_count',
    animation_frame='year',
    animation_group='cluster',
    hover_name="cluster",
    hover_data=dict(year=False, cluster_centers=False, cluster=False, skills=True),
    labels=dict(cluster_skill_count='Count', skills='Skills', cluster_centers="Cluster"),
    log_x=False,
    range_x=[0, number_of_clusters+1],
    range_y=[0, number_of_clusters+1],
)
fig.update(layout_coloraxis_showscale=True)
# fig.show()
fig.write_html(onet_bubble_chart)

In [10]:
# One bar chart per year: cluster id on x, summed skill count on y.
labels = {'cluster_skill_count': 'Count', 'cluster': "Cluster"}
figures = [
    px.bar(df[df['year'] == year], x='cluster', y='cluster_skill_count', title=str(year), labels=labels)
    for year in range(2010, 2022)
]

# The yearly charts go on a 6x2 grid, filled row by row.
fig = make_subplots(rows=6, cols=2, subplot_titles=[str(year) for year in range(2010, 2022)])

for i, figure in enumerate(figures):
    # divmod turns the flat index into 0-based (row, col); plotly grid cells are 1-based.
    r, c = divmod(i, 2)
    for trace in figure.data:
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=r + 1, col=c + 1)

fig.update_layout(height=1500, width=1500, title_text=title)
fig.update_xaxes(tickmode='linear')
# fig.show()
fig.write_html(onet_bar_chart)

In [11]:
# Line chart of the skill count per cluster over the years, one line per cluster.
labels = dict(cluster_skill_count='Count', cluster="Cluster")

fig = (
    px.line(df, x='year', y='cluster_skill_count', color='cluster', labels=labels)
    .update_layout(height=500, width=1500, title_text=title)
    .update_xaxes(tickmode='linear')
)
fig.write_html(onet_line_chart)

# CASE 3: ONET Codes 41-*

In [37]:
!rm -rf '41'
!mkdir '41'

K = range(2,30)
onet_parent = '41'
title = 'Trending analysis for the ONET Code 41-*'
onet_model = onet_parent+'/'+onet_parent+'_model.pkl'
# language_model = 'all-MiniLM-L6-v2'
language_model = 'all-mpnet-base-v2'
onet_k_value = onet_parent+'/'+onet_parent+'_k_value.png'
onet_cluster_data = onet_parent+'/'+onet_parent+'_cluster_data'
onet_bar_chart = onet_parent+'/'+onet_parent+'_bar_charts.html'
onet_line_chart = onet_parent+'/'+onet_parent+'_line_chart.html'
onet_bubble_chart = onet_parent+'/'+onet_parent+'_bubble_chart.html'

## Create Embeddings

In [38]:
## Generate list of skills
# Unique skills of the selected ONET family, one list per yearly frame (in `dfs` order).
skills_per_year = [
    frame.loc[frame['onet_parent'] == onet_parent, 'skill'].unique().tolist()
    for frame in dfs
]
# Union over all years. Sorted so the row order of the embeddings built from it
# is the same on every run instead of following arbitrary set order.
skills = sorted({skill for year_skills in skills_per_year for skill in year_skills})

In [39]:
## Create Embeddings
model = SentenceTransformer(language_model)
# Embeddings of the pooled skill list ...
skills_embeddings = model.encode(skills)
# ... and of each year's skill list, in the same order as skills_per_year.
skills_per_year_embeddings = [model.encode(year_skills) for year_skills in skills_per_year]

## k-means clustering

### Fit a Model

In [40]:
# Pick the number of clusters with the highest silhouette score over the candidates in K.
# distortions = []
# Reset the selection first so a value left over from an earlier run cannot be
# reported; silhouette scores lie in [-1, 1], so starting from -inf lets even a
# negative best score select its k.
number_of_clusters = None
score = float('-inf')
for k in K:
    # Fixed seed: the fit, and therefore the selected k, is reproducible.
    model = KMeans(n_clusters=k, init='k-means++', random_state=0)
    labels = model.fit_predict(skills_embeddings)
#     distortions.append(model.inertia_)
    new_score = silhouette_score(skills_embeddings, labels, metric='euclidean')
    if new_score > score:
        number_of_clusters = k
        score = new_score

print("Ideal K value based on silhouette score is {} with a score of {}".format(number_of_clusters, score))
# Alternative (disabled): elbow method on the distortions via KneeLocator.
# kn = KneeLocator(list(K), distortions, S=1.0, curve='convex', direction='decreasing')
# number_of_clusters = kn.knee
# print("Ideal K value based on elbow method = {}".format(number_of_clusters))
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.plot(K, distortions, 'bx-')
# plt.vlines(number_of_clusters, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
# plt.savefig(onet_k_value, bbox_inches='tight')

Ideal K value based on silhouette score is 22 with a score of 0.04604271799325943


### Metrics

In [41]:
# Fit the final model with the selected number of clusters. The fixed seed makes
# the cluster assignment, and the score printed below, reproducible.
model = KMeans(n_clusters=number_of_clusters, random_state=0)
cluster_assignment = model.fit_predict(skills_embeddings)

# Group the skill strings by the cluster id they were assigned to.
clustered_skills = {}
for skill, cluster_id in zip(skills, cluster_assignment):
    clustered_skills.setdefault(cluster_id, []).append(skill)

score = silhouette_score(skills_embeddings, model.labels_, metric='euclidean')
print('Silhouette Score: %.3f' % score)

Silhouette Score: 0.041


### Save Output

In [42]:
# Write the silhouette score and the skills of every cluster as real JSON.
# (The previous hand-built string used single quotes and bare integer keys,
# so the resulting .json file could not be parsed by a JSON reader.)
cluster_data = {'silhouette_score': float(score)}
for key in sorted(clustered_skills):
    # JSON object keys must be strings; the cluster ids are integers.
    cluster_data[str(key)] = clustered_skills[key]
with open(onet_cluster_data+".json", 'w') as f:
    json.dump(cluster_data, f)

In [43]:
# Persist the fitted model, then load it back from the file just written.
with open(onet_model, "wb") as f:
    f.write(pickle.dumps(model))
with open(onet_model, "rb") as f:
    model = pickle.loads(f.read())

### Predict each year individually

In [44]:
# Assign each year's skills to the fitted clusters and sum the `count` column per cluster.
# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for year, year_skills, year_embeddings in zip(years, skills_per_year, skills_per_year_embeddings):
    print("Predicting for year {}".format(year))
    cluster_assignment = model.predict(year_embeddings)

    clustered_skills = {}
    for skill, cluster_id in zip(year_skills, cluster_assignment):
        clustered_skills.setdefault(cluster_id, []).append(skill)

    # Only rows of the ONET family under analysis contribute to the counts, so the
    # same skill listed under another family does not inflate this family's total.
    year_df = yearly_dfs[year]
    family_df = year_df[year_df['onet_parent'] == onet_parent]

    for key in sorted(clustered_skills):
        count = family_df.loc[family_df['skill'].isin(clustered_skills[key]), 'count'].sum()
        rows.append({'year': year, 'skills': ', '.join(clustered_skills[key]), 'cluster': key,
                     # 1-based cluster position used as the plotting coordinate.
                     'cluster_centers': key+1,
                     # First two coordinates of THIS cluster's centre (one scalar per row).
                     'cluster_centers_x': model.cluster_centers_[key, 0],
                     'cluster_centers_y': model.cluster_centers_[key, 1],
                     'cluster_skill_count': count})

df = pd.DataFrame(rows, columns=['year', 'skills', 'cluster',
                                 'cluster_centers', 'cluster_centers_x', 'cluster_centers_y',
                                 'cluster_skill_count'])
df = df.astype({'year': int, 'cluster': int, 'cluster_skill_count': int})

Predicting for year 2010
Predicting for year 2011
Predicting for year 2012
Predicting for year 2013
Predicting for year 2014
Predicting for year 2015
Predicting for year 2016
Predicting for year 2017
Predicting for year 2018
Predicting for year 2019
Predicting for year 2020
Predicting for year 2021


In [45]:
# index=False: the row index carries no information and would otherwise come
# back as an extra 'Unnamed: 0' column when the file is read again.
df.to_csv(onet_cluster_data+".csv", index=False)

### Visualizations

In [12]:
# Reload the per-year cluster table from disk.
df = pd.read_csv(f"{onet_cluster_data}.csv")

In [13]:
# Animated bubble chart: clusters sit on the diagonal (x and y are both the
# cluster position), bubble size is the skill count, one frame per year.
fig = px.scatter(
    data_frame=df,
    title=title,
    x='cluster_centers',
    y='cluster_centers',
    color='cluster',
    size='cluster_skill_count',
    animation_frame='year',
    animation_group='cluster',
    hover_name="cluster",
    hover_data=dict(year=False, cluster_centers=False, cluster=False, skills=True),
    labels=dict(cluster_skill_count='Count', skills='Skills', cluster_centers="Cluster"),
    log_x=False,
    range_x=[0, number_of_clusters+1],
    range_y=[0, number_of_clusters+1],
)
fig.update(layout_coloraxis_showscale=True)
# fig.show()
fig.write_html(onet_bubble_chart)

In [14]:
# One bar chart per year: cluster id on x, summed skill count on y.
labels = {'cluster_skill_count': 'Count', 'cluster': "Cluster"}
figures = [
    px.bar(df[df['year'] == year], x='cluster', y='cluster_skill_count', title=str(year), labels=labels)
    for year in range(2010, 2022)
]

# The yearly charts go on a 6x2 grid, filled row by row.
fig = make_subplots(rows=6, cols=2, subplot_titles=[str(year) for year in range(2010, 2022)])

for i, figure in enumerate(figures):
    # divmod turns the flat index into 0-based (row, col); plotly grid cells are 1-based.
    r, c = divmod(i, 2)
    for trace in figure.data:
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=r + 1, col=c + 1)

fig.update_layout(height=1500, width=1500, title_text=title)
fig.update_xaxes(tickmode='linear')
# fig.show()
fig.write_html(onet_bar_chart)

In [15]:
# Line chart of the skill count per cluster over the years, one line per cluster.
labels = dict(cluster_skill_count='Count', cluster="Cluster")

fig = (
    px.line(df, x='year', y='cluster_skill_count', color='cluster', labels=labels)
    .update_layout(height=500, width=1500, title_text=title)
    .update_xaxes(tickmode='linear')
)
fig.write_html(onet_line_chart)

# CASE 4: ONET Codes 27-*

In [49]:
!rm -rf '27'
!mkdir '27'

K = range(2,30)
onet_parent = '27'
title = 'Trending analysis for the ONET Code 27-*'
onet_model = onet_parent+'/'+onet_parent+'_model.pkl'
# language_model = 'all-MiniLM-L6-v2'
language_model = 'all-mpnet-base-v2'
onet_k_value = onet_parent+'/'+onet_parent+'_k_value.png'
onet_cluster_data = onet_parent+'/'+onet_parent+'_cluster_data'
onet_bar_chart = onet_parent+'/'+onet_parent+'_bar_charts.html'
onet_line_chart = onet_parent+'/'+onet_parent+'_line_chart.html'
onet_bubble_chart = onet_parent+'/'+onet_parent+'_bubble_chart.html'

## Create Embeddings

In [50]:
## Generate list of skills
# Unique skills of the selected ONET family, one list per yearly frame (in `dfs` order).
skills_per_year = [
    frame.loc[frame['onet_parent'] == onet_parent, 'skill'].unique().tolist()
    for frame in dfs
]
# Union over all years. Sorted so the row order of the embeddings built from it
# is the same on every run instead of following arbitrary set order.
skills = sorted({skill for year_skills in skills_per_year for skill in year_skills})

In [51]:
## Create Embeddings
model = SentenceTransformer(language_model)
# Embeddings of the pooled skill list ...
skills_embeddings = model.encode(skills)
# ... and of each year's skill list, in the same order as skills_per_year.
skills_per_year_embeddings = [model.encode(year_skills) for year_skills in skills_per_year]

## k-means clustering

### Fit a Model

In [52]:
# Pick the number of clusters with the highest silhouette score over the candidates in K.
# distortions = []
# Reset the selection first so a value left over from an earlier run cannot be
# reported; silhouette scores lie in [-1, 1], so starting from -inf lets even a
# negative best score select its k.
number_of_clusters = None
score = float('-inf')
for k in K:
    # Fixed seed: the fit, and therefore the selected k, is reproducible.
    model = KMeans(n_clusters=k, init='k-means++', random_state=0)
    labels = model.fit_predict(skills_embeddings)
#     distortions.append(model.inertia_)
    new_score = silhouette_score(skills_embeddings, labels, metric='euclidean')
    if new_score > score:
        number_of_clusters = k
        score = new_score

print("Ideal K value based on silhouette score is {} with a score of {}".format(number_of_clusters, score))
# Alternative (disabled): elbow method on the distortions via KneeLocator.
# kn = KneeLocator(list(K), distortions, S=1.0, curve='convex', direction='decreasing')
# number_of_clusters = kn.knee
# print("Ideal K value based on elbow method = {}".format(number_of_clusters))
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.plot(K, distortions, 'bx-')
# plt.vlines(number_of_clusters, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
# plt.savefig(onet_k_value, bbox_inches='tight')

Ideal K value based on silhouette score is 29 with a score of 0.046428415924310684


### Metrics

In [53]:
# Fit the final model with the selected number of clusters. The fixed seed makes
# the cluster assignment, and the score printed below, reproducible.
model = KMeans(n_clusters=number_of_clusters, random_state=0)
cluster_assignment = model.fit_predict(skills_embeddings)

# Group the skill strings by the cluster id they were assigned to.
clustered_skills = {}
for skill, cluster_id in zip(skills, cluster_assignment):
    clustered_skills.setdefault(cluster_id, []).append(skill)

score = silhouette_score(skills_embeddings, model.labels_, metric='euclidean')
print('Silhouette Score: %.3f' % score)

Silhouette Score: 0.046


### Save Output

In [54]:
import json  # local to this cell: the notebook's import cell does not bring in json

# Persist the silhouette score and the cluster -> skills mapping as real JSON.
# The previous hand-built string used single quotes and bare integer keys, so
# the resulting ".json" file could not be read back with a JSON parser.
cluster_data = {'silhouette_score': float(score)}
for key in sorted(clustered_skills):
    # Cluster ids are numpy integers, which json cannot use as keys; store them
    # as plain decimal strings (JSON object keys are always strings).
    cluster_data[str(int(key))] = clustered_skills[key]
with open(onet_cluster_data + ".json", 'w') as f:
    json.dump(cluster_data, f)

In [55]:
# Write the fitted model to `onet_model`, then immediately read it back so the
# rest of the notebook works with the object exactly as it was stored on disk.
with open(onet_model, "wb") as model_file:
    model_file.write(pickle.dumps(model))
with open(onet_model, "rb") as model_file:
    model = pickle.loads(model_file.read())

### Predict each year individually

In [56]:
# Assign each year's skills to the fitted clusters and build one row per
# (year, cluster): the member skills, the cluster's position, and the summed
# 'count' of those skills in that year's frame.
# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append inside the loop is quadratic and no longer exists in pandas >= 2.0.
cluster_columns = ['year', 'skills', 'cluster',
                   'cluster_centers', 'cluster_centers_x', 'cluster_centers_y',
                   'cluster_skill_count']
records = []

for year, year_skills, year_embeddings in zip(years, skills_per_year, skills_per_year_embeddings):
    print("Predicting for year {}".format(year))
    if len(year_skills) == 0:
        # Nothing to assign for this year; predict() cannot handle an empty batch.
        continue
    cluster_assignment = model.predict(year_embeddings)

    clustered_skills = {}
    for skill, cluster_id in zip(year_skills, cluster_assignment):
        clustered_skills.setdefault(cluster_id, []).append(skill)

    year_df = yearly_dfs[year]
    for key in sorted(clustered_skills):
        # Skills within a cluster are distinct, so a single membership mask gives
        # the same total as summing the per-skill lookups one at a time.
        count = year_df.loc[year_df['skill'].isin(clustered_skills[key]), 'count'].sum()
        records.append({
            'year': year,
            'skills': ', '.join(clustered_skills[key]),
            'cluster': key,
            'cluster_centers': key + 1,  # 1-based cluster position used as the chart coordinate
            # First two coordinates of THIS cluster's centre (previously the whole
            # column of all centres was stored in every row).
            'cluster_centers_x': model.cluster_centers_[key, 0],
            'cluster_centers_y': model.cluster_centers_[key, 1],
            'cluster_skill_count': count,
        })

df = pd.DataFrame(records, columns=cluster_columns)

df.year = df.year.astype(int)
df.cluster = df.cluster.astype(int)
df.cluster_skill_count = df.cluster_skill_count.astype(int)

Predicting for year 2010
Predicting for year 2011
Predicting for year 2012
Predicting for year 2013
Predicting for year 2014
Predicting for year 2015
Predicting for year 2016
Predicting for year 2017
Predicting for year 2018
Predicting for year 2019
Predicting for year 2020
Predicting for year 2021


In [57]:
# Store the per-year cluster table next to the other artefacts of this group.
df.to_csv(f"{onet_cluster_data}.csv")

### Visualizations

In [16]:
# Reload the saved per-year cluster table so the charts are drawn from the file on disk.
df = pd.read_csv(f"{onet_cluster_data}.csv")

In [17]:
# Animated bubble chart: one frame per year, one bubble per cluster placed on
# the diagonal at its 1-based cluster position and sized by its skill count.
bubble_axis_max = number_of_clusters + 1
bubble_hover = {'year': False, 'cluster_centers': False, 'cluster': False, 'skills': True}
bubble_labels = {'cluster_skill_count': 'Count', 'skills': 'Skills', 'cluster_centers': "Cluster"}

fig = px.scatter(
    df,
    x='cluster_centers', y='cluster_centers',
    animation_frame='year', animation_group='cluster',
    size='cluster_skill_count',
    color='cluster',
    hover_name="cluster",
    hover_data=bubble_hover,
    labels=bubble_labels,
    log_x=False,
    range_x=[0, bubble_axis_max],
    range_y=[0, bubble_axis_max],
    title=title,
)
fig.update_layout(coloraxis_showscale=True)
# fig.show()
fig.write_html(onet_bubble_chart)

In [18]:
labels = {'cluster_skill_count': 'Count', 'cluster': "Cluster"}

# One bar chart (skill count per cluster) for every year present in the data,
# laid out on a two-column grid. Years come from the frame itself rather than a
# hard-coded 2010-2021 list, so the cell keeps working when the year range changes.
bar_years = sorted(df['year'].unique())
n_cols = 2
# Ceiling division; at least one row so make_subplots still accepts an empty frame.
n_rows = max(1, -(-len(bar_years) // n_cols))

figures = [
    px.bar(df[df['year'] == year], x='cluster', y='cluster_skill_count',
           title=str(year), labels=labels)
    for year in bar_years
]

fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=[str(year) for year in bar_years])

for i, figure in enumerate(figures):
    # Fill the grid row by row: index 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
    r, c = divmod(i, n_cols)
    for trace in figure.data:
        # add_trace(row=, col=) replaces the deprecated append_trace.
        fig.add_trace(trace, row=r + 1, col=c + 1)

# 250 px per row reproduces the former fixed 1500 px height for the 6-row layout.
fig.update_layout(height=250 * n_rows, width=1500, title_text=title)
fig.update_xaxes(tickmode='linear')
# fig.show()
fig.write_html(onet_bar_chart)

In [19]:
# Line chart of each cluster's skill count over the years, one line per cluster.
labels = dict(cluster_skill_count='Count', cluster="Cluster")

fig = px.line(df, x='year', y='cluster_skill_count', color='cluster', labels=labels)

# Both update_* calls return the figure, so they can be chained.
fig.update_layout(height=500, width=1500, title_text=title).update_xaxes(tickmode='linear')
fig.write_html(onet_line_chart)