In [8]:
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from glob import glob



In [9]:
def linear_scoring(path='/home/sami/FLAIR/Results/LINEAR/LinearScores.csv'):
    """Normalise the embedding labels of the linear-model score table on disk.

    Reads the CSV at ``path``, renames the embedding label ``'Fasttext_Wiki'``
    to ``'Fasttext'`` (every other value is left untouched), writes the table
    back to the *same* file without the index, and returns it.  Running the
    function again on an already-normalised file changes nothing.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the score CSV.  It must contain an ``Embedding`` column.
        Defaults to the project's ``LinearScores.csv``.

    Returns
    -------
    pandas.DataFrame
        The score table with normalised ``Embedding`` values.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    KeyError
        If the CSV has no ``Embedding`` column.
    """
    scores = pd.read_csv(path)
    # Exact-value replacement (not a substring replace): only cells equal to
    # 'Fasttext_Wiki' are rewritten.
    scores['Embedding'] = scores['Embedding'].replace('Fasttext_Wiki', 'Fasttext')
    # The input file is overwritten on purpose so later readers of the CSV see
    # the normalised labels.
    scores.to_csv(path, index=False)
    return scores
# Executed when the cell runs: reads LinearScores.csv, rewrites it in place with
# the 'Fasttext_Wiki' label renamed to 'Fasttext', and keeps the frame in `ls`.
ls = linear_scoring() 
def transformer_scoring(results_dir='/home/sami/FLAIR/Results/TRANSFORMER', n_experiments=5):
    """Merge the per-experiment transformer score files into one table.

    Collects every file matching ``<results_dir>/E<idx>/*scores.csv`` for
    ``idx`` in ``range(n_experiments)``, concatenates them, writes the result
    to ``<results_dir>/TransformerScores.csv`` (without the index) and returns
    it.

    Parameters
    ----------
    results_dir : str or path-like, optional
        Directory holding the ``E0``, ``E1``, ... sub-directories.  Defaults to
        the project's transformer results directory.
    n_experiments : int, optional
        Number of ``E<idx>`` sub-directories to scan, starting at ``E0``.
        Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        All score rows, ordered by experiment directory and then by file name.
        The original per-file row indices are kept, so the index may contain
        duplicates.

    Raises
    ------
    ValueError
        If no score file is found.
    """
    # glob() returns matches in arbitrary (filesystem) order; sorting keeps the
    # merged table identical from run to run.
    score_files = [
        file
        for idx in range(n_experiments)
        for file in sorted(glob(f'{results_dir}/E{idx}/*scores.csv'))
    ]
    if not score_files:
        # pd.concat would raise ValueError as well, but without saying where
        # the files were expected.
        raise ValueError(
            f"No '*scores.csv' files found under {results_dir}/E0 .. E{n_experiments - 1}"
        )
    transformer_scores = pd.concat([pd.read_csv(file) for file in score_files])
    transformer_scores.to_csv(f'{results_dir}/TransformerScores.csv', index=False)
    return transformer_scores
# Executed when the cell runs: merges the per-experiment '*scores.csv' files into
# TransformerScores.csv (the file the plotting cell reads) and keeps the frame in `ts`.
ts = transformer_scoring()

In [10]:
def get_experiment(experiment:str, path='/home/sami/FLAIR/Data/Experiment_Frame.csv'):
    """Return the rows of the experiment frame that take part in ``experiment``.

    Parameters
    ----------
    experiment : str
        Name of a column in the experiment frame.  A row is kept when its value
        in this column is not missing (not NaN).
    path : str or path-like, optional
        Location of the experiment-frame CSV.  Defaults to the project's
        ``Experiment_Frame.csv``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns; the row index of the CSV is kept.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    KeyError
        If ``experiment`` is not a column of the frame.
    """
    frame = pd.read_csv(path)
    participates = frame[experiment].notna()
    return frame[participates]
def parse_doc_type(doc_type):
    """Translate a raw document-type code into its display label.

    Known codes are ``'SOCIAL_MEDIA'``, ``'PROTOKOLL'``, ``'NEWSPAPER'``,
    ``'MANIFESTO'`` and ``'TALKSHOW'``.  Any other value yields ``None``.
    """
    code_to_label = (
        ('SOCIAL_MEDIA', 'Twitter'),
        ('PROTOKOLL', 'Protocol'),
        ('NEWSPAPER', 'Newspaper'),
        ('MANIFESTO', 'Manifesto'),
        ('TALKSHOW', 'Talkshow'),
    )
    # Compare with == (rather than hashing into a dict) so that any input type
    # is handled exactly as a plain chain of equality tests would handle it.
    for code, label in code_to_label:
        if doc_type == code:
            return label
    return None

In [11]:
# Colour used for each model name, shared by both trace helpers.  The six-colour
# palette is reused for the transformer models, so e.g. 'LogReg' and 'DistilBert'
# have the same colour.  'Bert-Base' and 'Bert' are both accepted spellings.
_MODEL_COLORS = {
    'LogReg': '#E69F00',
    'SVM': '#56B4E9',
    'XGBoost': '#009E73',
    'AdaBoost': '#F0E442',
    'RandomForest': '#0072B2',
    'NaiveBayes': '#D55E00',
    'DistilBert': '#E69F00',
    'Bert-Base': '#56B4E9',
    'Bert': '#56B4E9',
    'Electra': '#009E73',
    'GottBert': '#F0E442',
}


def add_trace_linear(df, fig, embedding, model, row, col):
    """Add the F1-per-experiment trace of one embedding/model pair to ``fig``.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table with ``Embedding``, ``Model``, ``Experiment`` and ``F1``
        columns.
    fig : plotly figure
        Figure created with ``make_subplots``; it is modified in place.
    embedding : str
        Value of the ``Embedding`` column to select.
    model : str
        Value of the ``Model`` column to select; must be a key of
        ``_MODEL_COLORS``.
    row, col : int
        Subplot cell (1-based) that receives the trace.

    Returns
    -------
    The same ``fig`` object.

    Raises
    ------
    KeyError
        If ``model`` has no colour assigned.
    """
    selected = df[(df.Embedding == embedding) & (df.Model == model)]
    fig.add_trace(
        go.Scatter(
            x=selected.Experiment,
            y=selected.F1,
            marker=dict(color=_MODEL_COLORS[model]),
            # Every embedding panel repeats the same models, so only the 'BoW'
            # panel contributes legend entries ...
            showlegend=embedding == 'BoW',
            # ... and grouping by model makes that single entry toggle the
            # model's traces in all panels at once.
            legendgroup=model,
            name=model,
        ),
        row=row,
        col=col,
    )
    return fig


def add_trace_transformer(df, fig, model, row, col):
    """Add the F1-per-experiment trace of one transformer model to ``fig``.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table with ``Model``, ``Experiment`` and ``F1`` columns.
    fig : plotly figure
        Figure created with ``make_subplots``; it is modified in place.
    model : str
        Value of the ``Model`` column to select; must be a key of
        ``_MODEL_COLORS``.
    row, col : int
        Subplot cell (1-based) that receives the trace.

    Returns
    -------
    The same ``fig`` object.

    Raises
    ------
    KeyError
        If ``model`` has no colour assigned.
    """
    selected = df[df.Model == model]
    fig.add_trace(
        go.Scatter(
            x=selected.Experiment,
            y=selected.F1,
            # Markers get a thin black outline in the transformer panel.
            marker=dict(color=_MODEL_COLORS[model], line=dict(width=1, color='black')),
            name=model,
        ),
        row=row,
        col=col,
    )
    return fig

In [12]:
# Figure: F1 per experiment for every embedding/model combination.
# Rows 1-2 hold one panel per embedding (linear models); row 3 is a single
# full-width panel for the transformer models.
RESULTS_DIR = '/home/sami/FLAIR/Results'
VISUALS_DIR = '/home/sami/FLAIR/Visuals'

# (value of the `Embedding` column, subplot title), in row-major panel order.
EMBEDDING_PANELS = [
    ('BoW', 'BoW'),
    ('Tf-idf', 'Tf-idf'),
    ('Word2Vec', 'Word2Vec'),
    ('GloVe', 'GloVe'),
    ('Fasttext', 'Fasttext'),
    ('SentEmbeddings', 'Sentence Transformer'),
]
# Trace order inside each panel (this is also the legend order).
LINEAR_MODELS = ['LogReg', 'SVM', 'XGBoost', 'AdaBoost', 'RandomForest', 'NaiveBayes']
TRANSFORMER_MODELS = ['DistilBert', 'Bert', 'GottBert', 'Electra']
N_COLS = 3
TRANSFORMER_ROW, TRANSFORMER_COL = 3, 1

fig = make_subplots(
    rows=3,
    cols=N_COLS,
    specs=[[{}, {}, {}], [{}, {}, {}], [{"colspan": 3}, None, None]],
    horizontal_spacing=.05,
    vertical_spacing=.03,
    row_heights=[.3, .3, .4],
    subplot_titles=[title for _, title in EMBEDDING_PANELS] + ["Transformer Models"],
)
fig.update_layout(template='plotly_white')

# Linear models: one panel per embedding, one trace per model.
linear_scores = pd.read_csv(f'{RESULTS_DIR}/LINEAR/LinearScores.csv').sort_values('Experiment')
for panel_idx, (embedding, _title) in enumerate(EMBEDDING_PANELS):
    # Map the running panel index onto 1-based (row, col) grid coordinates.
    panel_row, panel_col = divmod(panel_idx, N_COLS)
    for model in LINEAR_MODELS:
        fig = add_trace_linear(linear_scores, fig, embedding, model,
                               row=panel_row + 1, col=panel_col + 1)

# Transformer models: sorted by experiment like the linear scores, so each line
# is drawn in experiment order regardless of the row order in the CSV.
transformer_scores = pd.read_csv(
    f'{RESULTS_DIR}/TRANSFORMER/TransformerScores.csv'
).sort_values('Experiment')
for model in TRANSFORMER_MODELS:
    fig = add_trace_transformer(transformer_scores, fig, model,
                                row=TRANSFORMER_ROW, col=TRANSFORMER_COL)

# Axes: x tick labels only on the bottom (transformer) panel, an 'F1' title on
# the first panel of every row, and a denser y grid on the transformer panel.
fig.update_xaxes(showticklabels=False)
for axis_row in (1, 2, 3):
    fig.update_yaxes(title_text='F1', row=axis_row, col=1)
fig.update_yaxes(nticks=12, row=TRANSFORMER_ROW, col=TRANSFORMER_COL)
fig.update_xaxes(showticklabels=True, row=TRANSFORMER_ROW, col=TRANSFORMER_COL)

# Export: each static format goes to its own upper-case sub-directory
# (PNG/Scores.png, SVG/Scores.svg, ...), followed by the interactive HTML.
for ext in ('png', 'svg', 'jpg', 'pdf'):
    pio.write_image(fig, f'{VISUALS_DIR}/{ext.upper()}/Scores.{ext}',
                    scale=10, width=1080, height=800)
fig.write_html(f"{VISUALS_DIR}/HTML/Scores.html")
fig.show()