In [None]:
import plotly
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px

%matplotlib inline 
pio.renderers.default='iframe'

In [None]:
def draw_graph(df, title, y_axis = 'cluster_skill_count', template=None, y_range=None):
    """Draw one line per cluster showing ``y_axis`` over the years.

    Args:
        df: DataFrame with 'year' and 'cluster' columns plus the column
            named by ``y_axis``.
        title: Text shown centred above the plot.
        y_axis: Name of the column plotted on the y-axis.
        template: Optional Plotly template name (e.g. "plotly_white").
            The default template is kept when None.
        y_range: Optional ``[low, high]`` pair fixing the y-axis range.
            The axis is autoscaled when None.

    Returns:
        The configured Plotly figure, so callers can tweak it further or let
        the notebook display it as the cell's last expression.
    """
    # Axis / legend captions for every column this notebook plots.
    labels = {
        'cluster_skill_count': 'Count',
        'cluster': 'Cluster',
        'year': 'Year',
        'natural_log': 'ln(Count)',
        'percentage': 'Percentage per Year',
    }
    fig = px.line(df, x='year', y=y_axis, color='cluster', labels=labels)
    # The figure size is set once here (900x400); px.line used to receive a
    # conflicting width=800 that this call silently overrode.
    fig.update_layout(height=400, width=900, title_text=title, title_x=0.5,
                      legend=dict(orientation="h"), legend_title="")
    if template is not None:
        fig.update_layout(template=template)
    # One tick per year instead of Plotly's automatic tick spacing.
    fig.update_xaxes(tickmode='linear')
    if y_range is not None:
        fig.update_yaxes(range=y_range)
    fig.update_traces(line=dict(width=4))
    return fig

# Combinations of 9, 12, 16

In [None]:
# Clusters 9, 12 and 16 pooled under a single "Computational Skills" label.
replace_values = dict.fromkeys([9, 12, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # Sum the skill counts of the pooled clusters for every year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 9 and 12 pooled under a single "Computational Skills" label.
replace_values = dict.fromkeys([9, 12], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # Sum the skill counts of the pooled clusters for every year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 12 and 16 pooled under a single "Computational Skills" label.
replace_values = dict.fromkeys([12, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # Sum the skill counts of the pooled clusters for every year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

In [None]:
# Clusters 9 and 16 pooled under a single "Computational Skills" label.
replace_values = dict.fromkeys([9, 16], 'Computational Skills')
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # Sum the skill counts of the pooled clusters for every year.
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Trend of Computational Skills")

# Cluster 5 vs Combinations of 9, 12, 16

In [None]:
# Raw yearly counts: Journalism (cluster 5) vs. Computational (clusters 9 + 12 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 12, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9, 12, 16 - normalized
# Journalism (cluster 5) vs. Computational (clusters 9 + 12 + 16) yearly counts,
# min-max scaled to [0, 1].
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)

# NOTE(review): the minimum and maximum are taken over BOTH skill groups here,
# so the curves keep the same shape as the raw counts (only the y-axis is
# rescaled). The other normalized cells in this notebook scale each group by
# its own min/max - confirm that the global range is intended for this one.
min_val = df['cluster_skill_count'].min()
max_val = df['cluster_skill_count'].max()
# Vectorised scaling on the whole frame: no per-cluster slices are mutated
# (which raises SettingWithCopyWarning) and no frame is grown with concat in
# a loop. If every count is identical the ratio is 0/0 = NaN; show it as 0.
df = df.assign(
    cluster_skill_count=((df['cluster_skill_count'] - min_val) / (max_val - min_val)).fillna(0)
)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Natural-log view: Journalism (cluster 5) vs. Computational (clusters 9 + 12 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 12, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
    # np.log2 / np.log10 would give base-2 / base-10 variants of this column.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

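Percentage of a skill group in a given year: the skill's count for that year divided by the count of all skills for that year, times 100.

$$\text{percentage} = \frac{\text{count of the skill in that year}}{\text{count of all skills in that year}} \times 100$$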


In [None]:
# cluster 5 vs 9, 12, 16 - percentage
# Journalism (cluster 5) vs. Computational (clusters 9 + 12 + 16), expressed as
# a percentage of the per-year 'count' read from count_per_year.csv.
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
# The merge brings in the 'count' column used as the denominator.
# NOTE(review): presumably this is the total skill count of the year - confirm.
df = pd.merge(df, pd.read_csv("count_per_year.csv"), on=['year'])
df['percentage'] = (df['cluster_skill_count']/df['count'])*100

# Reuse the shared plotting helper instead of repeating its body, then apply
# the two settings specific to percentage plots: white template, fixed 0-100 axis.
fig = draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "percentage")
fig.update_layout(template="plotly_white")
fig.update_yaxes(range = [0, 100])

In [None]:
# Raw yearly counts: Journalism (cluster 5) vs. Computational (clusters 9 + 12).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 12], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9, 12 - normalized
# Journalism (cluster 5) vs. Computational (clusters 9 + 12) yearly counts,
# each skill group min-max scaled to [0, 1] using its own minimum and maximum.
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills'}
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)

# transform() broadcasts each group's min/max back onto its own rows, so the
# scaling is a single vectorised expression: no filtered slice is assigned to
# (which raises SettingWithCopyWarning) and no frame is grown with concat in
# a loop.
counts_by_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = counts_by_cluster.transform('min')
max_val = counts_by_cluster.transform('max')
# A group whose count never changes gives 0/0 = NaN; show it as a flat 0 line.
df = df.assign(
    cluster_skill_count=((df['cluster_skill_count'] - min_val) / (max_val - min_val)).fillna(0)
)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Natural-log view: Journalism (cluster 5) vs. Computational (clusters 9 + 12).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 12], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
    # np.log2 / np.log10 would give base-2 / base-10 variants of this column.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# cluster 5 vs 9, 12 - percentage
# Journalism (cluster 5) vs. Computational (clusters 9 + 12), expressed as a
# percentage of the per-year 'count' read from count_per_year.csv.
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 12 : 'Computational Skills'}
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
# The merge brings in the 'count' column used as the denominator.
# NOTE(review): presumably this is the total skill count of the year - confirm.
df = pd.merge(df, pd.read_csv("count_per_year.csv"), on=['year'])
df['percentage'] = (df['cluster_skill_count']/df['count'])*100

# Reuse the shared plotting helper instead of repeating its body, then apply
# the two settings specific to percentage plots: white template, fixed 0-100 axis.
fig = draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "percentage")
fig.update_layout(template="plotly_white")
fig.update_yaxes(range = [0, 100])

In [None]:
# Raw yearly counts: Journalism (cluster 5) vs. Computational (clusters 9 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 9, 16 - normalized
# Journalism (cluster 5) vs. Computational (clusters 9 + 16) yearly counts,
# each skill group min-max scaled to [0, 1] using its own minimum and maximum.
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)

# transform() broadcasts each group's min/max back onto its own rows, so the
# scaling is a single vectorised expression: no filtered slice is assigned to
# (which raises SettingWithCopyWarning) and no frame is grown with concat in
# a loop.
counts_by_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = counts_by_cluster.transform('min')
max_val = counts_by_cluster.transform('max')
# A group whose count never changes gives 0/0 = NaN; show it as a flat 0 line.
df = df.assign(
    cluster_skill_count=((df['cluster_skill_count'] - min_val) / (max_val - min_val)).fillna(0)
)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Natural-log view: Journalism (cluster 5) vs. Computational (clusters 9 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([9, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
    # np.log2 / np.log10 would give base-2 / base-10 variants of this column.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# cluster 5 vs 9, 16 - percentage
# Journalism (cluster 5) vs. Computational (clusters 9 + 16), expressed as a
# percentage of the per-year 'count' read from count_per_year.csv.
replace_values = { 5 : 'Journalism Skills', 9 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
# The merge brings in the 'count' column used as the denominator.
# NOTE(review): presumably this is the total skill count of the year - confirm.
df = pd.merge(df, pd.read_csv("count_per_year.csv"), on=['year'])
df['percentage'] = (df['cluster_skill_count']/df['count'])*100

# Reuse the shared plotting helper instead of repeating its body, then apply
# the two settings specific to percentage plots: white template, fixed 0-100 axis.
fig = draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "percentage")
fig.update_layout(template="plotly_white")
fig.update_yaxes(range = [0, 100])

In [None]:
# Raw yearly counts: Journalism (cluster 5) vs. Computational (clusters 12 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([12, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills")

In [None]:
# cluster 5 vs 12, 16 - normalized
# Journalism (cluster 5) vs. Computational (clusters 12 + 16) yearly counts,
# each skill group min-max scaled to [0, 1] using its own minimum and maximum.
replace_values = { 5 : 'Journalism Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)

# transform() broadcasts each group's min/max back onto its own rows, so the
# scaling is a single vectorised expression: no filtered slice is assigned to
# (which raises SettingWithCopyWarning) and no frame is grown with concat in
# a loop.
counts_by_cluster = df.groupby('cluster')['cluster_skill_count']
min_val = counts_by_cluster.transform('min')
max_val = counts_by_cluster.transform('max')
# A group whose count never changes gives 0/0 = NaN; show it as a flat 0 line.
df = df.assign(
    cluster_skill_count=((df['cluster_skill_count'] - min_val) / (max_val - min_val)).fillna(0)
)
draw_graph(df, "Comparison of the Normalized Count of Journalism Skills vs. Computational Skills")

In [None]:
# Natural-log view: Journalism (cluster 5) vs. Computational (clusters 12 + 16).
replace_values = {5: 'Journalism Skills', **dict.fromkeys([12, 16], 'Computational Skills')}
df = (
    pd.read_csv("27/27_cluster_data.csv", index_col=0)
    # Keep only the clusters that have a label in the mapping.
    .loc[lambda frame: frame['cluster'].isin(list(replace_values))]
    .replace({"cluster": replace_values})
    # One summed count per (year, skill group).
    .groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
    # np.log2 / np.log10 would give base-2 / base-10 variants of this column.
    .assign(natural_log=lambda frame: np.log(frame['cluster_skill_count']))
)
draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "natural_log")

In [None]:
# cluster 5 vs 12, 16 - percentage
# Journalism (cluster 5) vs. Computational (clusters 12 + 16), expressed as a
# percentage of the per-year 'count' read from count_per_year.csv.
replace_values = { 5 : 'Journalism Skills', 12 : 'Computational Skills', 16 : 'Computational Skills' }
df = pd.read_csv("27/27_cluster_data.csv", index_col=0)
df = df.loc[df['cluster'].isin(list(replace_values))]
df = df.replace({"cluster": replace_values})
df = (
    df.groupby(["year", "cluster"])['cluster_skill_count']
    .sum()
    .to_frame(name='cluster_skill_count')
    .reset_index()
    .sort_values(by='year', ascending=False)
)
# The merge brings in the 'count' column used as the denominator.
# NOTE(review): presumably this is the total skill count of the year - confirm.
df = pd.merge(df, pd.read_csv("count_per_year.csv"), on=['year'])
df['percentage'] = (df['cluster_skill_count']/df['count'])*100

# Reuse the shared plotting helper instead of repeating its body, then apply
# the two settings specific to percentage plots: white template, fixed 0-100 axis.
fig = draw_graph(df, "Comparison of the Count of Journalism Skills vs. Computational Skills", "percentage")
fig.update_layout(template="plotly_white")
fig.update_yaxes(range = [0, 100])