In [None]:
import plotly
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

%matplotlib inline 
pio.renderers.default='iframe'

In [None]:
def draw_graph(df, title, y_axis = 'cluster_skill_count'):
    """Draw one line per cluster of ``y_axis`` against ``year``.

    Args:
        df: DataFrame with ``year`` and ``cluster`` columns plus the column
            named by ``y_axis``.
        title: Text shown as the (centred) figure title.
        y_axis: Name of the column plotted on the y axis.

    Returns:
        The plotly figure, so a notebook cell ending in this call displays it.
    """
    axis_labels = {
        'cluster_skill_count': 'Count',
        'cluster': 'Cluster',
        'year': 'Year',
        'natural_log': 'ln(Count)',
        'percentage': 'Percentage per Year',
    }
    # update_layout runs after px.line, so its width=900 replaces the
    # width=800 passed to px.line.
    layout_options = dict(
        height=400,
        width=900,
        title_text=title,
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
    )
    return (
        px.line(df, x='year', y=y_axis, color='cluster', labels=axis_labels, width=800, height=400)
        .update_layout(**layout_options)
        .update_xaxes(tickmode='linear')
        .update_traces(line=dict(width=4))
    )

In [None]:
def draw_subplots(df, title, years=None):
    """Draw one bar chart per year showing each cluster's percentage.

    The subplots are laid out two per row, in the order given by ``years``.
    Every cluster gets the same colour value in every subplot, derived from
    the clusters actually present in ``df`` (the previous hard-coded list of
    seven colour values did not match callers passing 3, 4 or 8 clusters).

    Args:
        df: DataFrame with ``year``, ``cluster`` and ``percentage`` columns.
        title: Category name, used in the title "Trending Analysis for <title>".
        years: Iterable of years to plot, one subplot each. Defaults to
            2010 through 2020 inclusive.

    Returns:
        The plotly figure containing all subplots.
    """
    if years is None:
        years = range(2010, 2021)
    years = list(years)

    n_cols = 2
    n_rows = max(1, -(-len(years) // n_cols))  # ceiling division, at least one row

    # Stable colour value per cluster (1..K in sorted order), so a cluster keeps
    # its colour even in a year where another cluster is missing.
    cluster_names = sorted(df['cluster'].unique(), key=str)
    color_index = {name: position for position, name in enumerate(cluster_names, start=1)}
    # Keep the colour range non-degenerate when there is only one cluster.
    color_max = max(len(color_index), 2)

    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=[str(year) for year in years])
    for position, year in enumerate(years):
        year_df = df[df['year'] == year]
        bar = go.Bar(
            x=year_df['cluster'],
            y=year_df['percentage'],
            marker=dict(
                color=[color_index[name] for name in year_df['cluster']],
                cmin=1,
                cmax=color_max,
            ),
        )
        fig.add_trace(bar, row=position // n_cols + 1, col=position % n_cols + 1)

    fig.update_yaxes(range = [0, 100], tick0 = 0, dtick = 20)
    # 250px per row reproduces the original 1500px height for the default 6 rows.
    fig.update_layout(height=250 * n_rows, width=1000, title_text=f"Trending Analysis for {title}",
                      template="plotly_white", title_x=0.5, showlegend=False)
    return fig

# Combinations of 9, 12, 16

In [None]:
# Clusters 9, 12 and 16, all relabelled as one "Computational Skills" series.
replace_values = dict.fromkeys([9, 12, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([9, 12, 16])]
    .replace({"cluster": replace_values})
    # Sum the counts of the merged clusters per year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 9 and 12, both relabelled as one "Computational Skills" series.
replace_values = dict.fromkeys([9, 12], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([9, 12])]
    .replace({"cluster": replace_values})
    # Sum the counts of the merged clusters per year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 12 and 16, both relabelled as one "Computational Skills" series.
replace_values = dict.fromkeys([12, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([12, 16])]
    .replace({"cluster": replace_values})
    # Sum the counts of the merged clusters per year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 9 and 16, both relabelled as one "Computational Skills" series.
replace_values = dict.fromkeys([9, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([9, 16])]
    .replace({"cluster": replace_values})
    # Sum the counts of the merged clusters per year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

# Cluster 5 vs Combinations of 9, 12, 16

In [None]:
# Cluster 5 (journalism) against clusters 9, 12, 16 (merged) - raw counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12, 16])]
    .replace({"cluster": replace_values})
    # One summed count per (year, label) pair.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9, 12, 16 - normalized
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = df.loc[df['cluster'].isin([5, 9, 12, 16])]
df = df.replace({"cluster": replace_values})
df = df.groupby( [ "year", "cluster"])['cluster_skill_count'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)

# Min-max scale the counts with the minimum and maximum taken over ALL rows.
# Computed on the whole column at once instead of assigning onto per-cluster
# slices (which raised SettingWithCopyWarning) and re-concatenating them.
# NOTE(review): the other "normalized" cells in this notebook scale each
# cluster with its own min/max - confirm the global scaling here is intended.
min_val = df['cluster_skill_count'].min()
max_val = df['cluster_skill_count'].max()
df['normalized_score'] = (df['cluster_skill_count'] - min_val)/(max_val-min_val)

# Store the score under 'cluster_skill_count' so draw_graph's default y axis picks it up.
df = df.groupby(["year", "cluster"])['normalized_score'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Cluster 5 against clusters 9, 12, 16 (merged) - natural log of the counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # Only the natural log is plotted.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

> (Per-skill count for a year / count of all skills for that year) × 100 = percentage of the skill occurring in that year


In [None]:
# Cluster 5 against clusters 9, 12, 16 (merged) - percentage of each year's total.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
)

labels = {
    'cluster_skill_count': 'Count',
    'cluster': 'Cluster',
    'year': 'Year',
    'natural_log': 'ln(Count)',
    'percentage': 'Percentage per Year',
}
fig = (
    px.line(df, x='year', y='percentage', color='cluster', labels=labels, width=800, height=400)
    .update_layout(
        height=400,
        width=900,
        title_text="Comparison of the Count of Journalism Skills vs. Computational Skills",
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
        template="plotly_white",
    )
    .update_xaxes(tickmode='linear')
    .update_yaxes(range=[0, 100])
    .update_traces(line=dict(width=4))
)
fig

In [None]:
# Cluster 5 (journalism) against clusters 9, 12 (merged) - raw counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12])]
    .replace({"cluster": replace_values})
    # One summed count per (year, label) pair.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9, 12 - normalized
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills'}
df = df.loc[df['cluster'].isin([5, 9, 12])]
df = df.replace({"cluster": replace_values})
df = df.groupby( [ "year", "cluster"])['cluster_skill_count'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)

# Min-max scale every cluster with its OWN minimum and maximum.
# transform() broadcasts the per-cluster min/max back onto the rows, which
# replaces the per-cluster loop that assigned onto filtered slices (raising
# SettingWithCopyWarning) and grew a frame with concat.
per_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = per_cluster.transform('min')
max_val = per_cluster.transform('max')
df['normalized_score'] = (df['cluster_skill_count'] - min_val)/(max_val-min_val)

# Store the score under 'cluster_skill_count' so draw_graph's default y axis picks it up.
df = df.groupby(["year", "cluster"])['normalized_score'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Cluster 5 against clusters 9, 12 (merged) - natural log of the counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # Only the natural log is plotted.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# Cluster 5 against clusters 9, 12 (merged) - percentage of each year's total.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
)

labels = {
    'cluster_skill_count': 'Count',
    'cluster': 'Cluster',
    'year': 'Year',
    'natural_log': 'ln(Count)',
    'percentage': 'Percentage per Year',
}
fig = (
    px.line(df, x='year', y='percentage', color='cluster', labels=labels, width=800, height=400)
    .update_layout(
        height=400,
        width=900,
        title_text="Comparison of the Count of Journalism Skills vs. Computational Skills",
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
        template="plotly_white",
    )
    .update_xaxes(tickmode='linear')
    .update_yaxes(range=[0, 100])
    .update_traces(line=dict(width=4))
)
fig

In [None]:
# Cluster 5 (journalism) against clusters 9, 16 (merged) - raw counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 16])]
    .replace({"cluster": replace_values})
    # One summed count per (year, label) pair.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9 16 - normalized
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 16 : 'Computational Skills' }
df = df.loc[df['cluster'].isin([5, 9, 16])]
df = df.replace({"cluster": replace_values})
df = df.groupby( [ "year", "cluster"])['cluster_skill_count'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)

# Min-max scale every cluster with its OWN minimum and maximum.
# transform() broadcasts the per-cluster min/max back onto the rows, which
# replaces the per-cluster loop that assigned onto filtered slices (raising
# SettingWithCopyWarning) and grew a frame with concat.
per_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = per_cluster.transform('min')
max_val = per_cluster.transform('max')
df['normalized_score'] = (df['cluster_skill_count'] - min_val)/(max_val-min_val)

# Store the score under 'cluster_skill_count' so draw_graph's default y axis picks it up.
df = df.groupby(["year", "cluster"])['normalized_score'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Cluster 5 against clusters 9, 16 (merged) - natural log of the counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # Only the natural log is plotted.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# Cluster 5 against clusters 9, 16 (merged) - percentage of each year's total.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
)

labels = {
    'cluster_skill_count': 'Count',
    'cluster': 'Cluster',
    'year': 'Year',
    'natural_log': 'ln(Count)',
    'percentage': 'Percentage per Year',
}
fig = (
    px.line(df, x='year', y='percentage', color='cluster', labels=labels, width=800, height=400)
    .update_layout(
        height=400,
        width=900,
        title_text="Comparison of the Count of Journalism Skills vs. Computational Skills",
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
        template="plotly_white",
    )
    .update_xaxes(tickmode='linear')
    .update_yaxes(range=[0, 100])
    .update_traces(line=dict(width=4))
)
fig

In [None]:
# Cluster 5 (journalism) against clusters 12, 16 (merged) - raw counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 12, 16])]
    .replace({"cluster": replace_values})
    # One summed count per (year, label) pair.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 12 16 - normalized
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
replace_values = { 5 : 'Journalism Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = df.loc[df['cluster'].isin([5, 12, 16])]
df = df.replace({"cluster": replace_values})
df = df.groupby( [ "year", "cluster"])['cluster_skill_count'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)

# Min-max scale every cluster with its OWN minimum and maximum.
# transform() broadcasts the per-cluster min/max back onto the rows, which
# replaces the per-cluster loop that assigned onto filtered slices (raising
# SettingWithCopyWarning) and grew a frame with concat.
per_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = per_cluster.transform('min')
max_val = per_cluster.transform('max')
df['normalized_score'] = (df['cluster_skill_count'] - min_val)/(max_val-min_val)

# Store the score under 'cluster_skill_count' so draw_graph's default y axis picks it up.
df = df.groupby(["year", "cluster"])['normalized_score'].sum().to_frame(name = 'cluster_skill_count').reset_index().sort_values(by='year', ascending=False)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Cluster 5 against clusters 12, 16 (merged) - natural log of the counts.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 12, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # Only the natural log is plotted.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# Cluster 5 against clusters 12, 16 (merged) - percentage of each year's total.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([12, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 12, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
)

labels = {
    'cluster_skill_count': 'Count',
    'cluster': 'Cluster',
    'year': 'Year',
    'natural_log': 'ln(Count)',
    'percentage': 'Percentage per Year',
}
fig = (
    px.line(df, x='year', y='percentage', color='cluster', labels=labels, width=800, height=400)
    .update_layout(
        height=400,
        width=900,
        title_text="Comparison of the Count of Journalism Skills vs. Computational Skills",
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
        template="plotly_white",
    )
    .update_xaxes(tickmode='linear')
    .update_yaxes(range=[0, 100])
    .update_traces(line=dict(width=4))
)
fig

# Final Visualizations

## Percentage Comparison of Cluster 5 vs 9, 12, 15, 16

In [None]:
# Cluster 5 (journalism) against clusters 9, 12, 15, 16 (merged), as a
# percentage of each year's total skill count.
replace_values = {
    5: 'Journalism Skills',
    **dict.fromkeys([9, 12, 15, 16], 'Computational Skills'),
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 9, 12, 15, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
)

labels = {'year': 'Year<br>', 'percentage': 'Percentage per Year'}
fig = (
    px.line(df, x='year', y='percentage', color='cluster', labels=labels, width=800, height=400)
    .update_layout(
        height=400,
        width=900,
        title_text="Comparison of the Count of Journalism Skills vs. Computational Skills",
        title_x=0.5,
        legend=dict(orientation="h"),
        legend_title="",
        template="plotly_white",
    )
    # Half a year of padding on each side of the 2010-2020 x range.
    .update_xaxes(tickmode='linear', range=[2009.5, 2020.5])
    .update_yaxes(range=[0, 100])
    .update_traces(line=dict(width=4))
)
fig

## ONET 27 (Arts and Media): Cluster 1 - Marketing vs Cluster 5 - Journalism Skills vs Cluster 6 - Industry Expertise vs Cluster 9 - Web Skills vs Cluster 12 - Programming vs Cluster 15 - Data Science vs Cluster 16 - Analytics

> Notes:
- Each corresponding cluster across the x axis is a different color
- Has axis labels
- Title should be: Trending Analysis for < category >

In [None]:
# ONET 27: yearly percentage of seven named clusters, one bar chart per year.
replace_values = {
    1: 'Marketing',
    5: 'Journalism Skills',
    6: 'Industry Expertise',
    9: 'Web Skills',
    12: 'Programming',
    15: 'Data Science',
    16: 'Analytics',
}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([1, 5, 6, 9, 12, 15, 16])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # The totals file supplies the `count` column used as the denominator below.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
    # Same cluster order within every year, so bars line up across subplots.
    .sort_values(by=['year', 'cluster'], ascending=True)
)

draw_subplots(df, "Arts and Media")

## ONET 15 (Computer Related Occupations) : Cluster 1 - Computational Skills vs Cluster 3 - Advertising vs Cluster 7 - Office Management vs Cluster 15 - Automation Skills

In [None]:
# ONET 15: yearly percentage of four named clusters, one bar chart per year.
replace_values = {
    1: 'Computational Skills',
    3: 'Advertising',
    7: 'Office Management',
    15: 'Automation Skills',
}
df = (
    pd.read_csv("15/15_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([1, 3, 7, 15])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # NOTE(review): the totals are read from the working directory, not from
    # the "15/" folder - confirm this file is the right denominator for ONET 15.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
    # Same cluster order within every year, so bars line up across subplots.
    .sort_values(by=['year', 'cluster'], ascending=True)
)

draw_subplots(df, "Computer Related Occupations")

## ONET 13 (Business Specialists) : Cluster 3: Sales vs Cluster 4: Computational Skills vs Cluster 15: Marketing

In [None]:
# ONET 13: yearly percentage of three named clusters, one bar chart per year.
replace_values = {
    3: 'Sales',
    4: 'Computational Skills',
    15: 'Marketing',
}
df = (
    pd.read_csv("13/13_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([3, 4, 15])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # NOTE(review): the totals are read from the working directory, not from
    # the "13/" folder - confirm this file is the right denominator for ONET 13.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
    # Same cluster order within every year, so bars line up across subplots.
    .sort_values(by=['year', 'cluster'], ascending=True)
)

draw_subplots(df, "Business Specialists")

## ONET 41: (Sales and Marketing) : Cluster 5: Leadership vs Cluster 6: Management vs Cluster 7: Cloud Computing vs Cluster 9: Marketing vs Cluster 10: Social Media vs Cluster 14: Computational Skills vs Cluster 16: Advertising vs Cluster 18: Media and Sales

In [None]:
# ONET 41: yearly percentage of eight named clusters, one bar chart per year.
replace_values = {
    5: 'Leadership',
    6: 'Management',
    7: 'Cloud Computing',
    9: 'Marketing',
    10: 'Social Media',
    14: 'Computational Skills',
    16: 'Advertising',
    18: 'Media and Sales',
}
df = (
    pd.read_csv("41/41_cluster_data.csv", index_col=0)
    .loc[lambda frame: frame['cluster'].isin([5, 6, 7, 9, 10, 14, 16, 18])]
    .replace({"cluster": replace_values})
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .reset_index()
    .sort_values(by='year', ascending=False)
    # NOTE(review): the totals are read from the working directory, not from
    # the "41/" folder - confirm this file is the right denominator for ONET 41.
    .merge(pd.read_csv("total_skill_count_per_year.csv"), on=['year'])
    .assign(percentage=lambda frame: (frame['cluster_skill_count'] / frame['count']) * 100)
    # Same cluster order within every year, so bars line up across subplots.
    .sort_values(by=['year', 'cluster'], ascending=True)
)

draw_subplots(df, "Sales and Marketing")