In [None]:
from trainer import Trainer

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import matplotlib.pyplot as plt

# Setup

In [None]:
# Inputs handed to Trainer(path, target) in the experiment loop:
# the column name used as the target, and the CSV file location.
target = "class"
path = "./data/iliad_sentiments.csv"

In [None]:
# Scenario label -> keyword flags forwarded to Trainer.prepare_data_to_train.
# Remove the SBERT row to run only the two TF-IDF scenarios.
# NOTE(review): the "SBERT com lematização" label reads "with lemmatization",
# yet its lemmatize flag is False -- confirm which of the two is intended.
scenarios = {
    label: {"lemmatize": lemmatize, "sbert": sbert}
    for label, lemmatize, sbert in (
        ("TF-IDF sem lematização", False, False),
        ("TF-IDF com lematização", True, False),
        ("SBERT com lematização", False, True),
    )
}
# Fixed seed so the tree-based estimators are built deterministically and the
# experiment can be reproduced after a kernel restart.
RANDOM_STATE = 42

# Display name -> unfitted scikit-learn estimator passed to Trainer.cross_validation.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),  # KNeighborsClassifier has no random_state parameter
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
}

# Labels of the dataset variants compared in the experiment loop, in the order
# they are run (and later used as box-plot tick labels).
data_operations = ["Sem modificação", "Sem narrador", "Junção de narrador e neutro"]

# Named handles for the individual labels, used when branching on the operation.
STANDARD, WITHOUT_NARRATOR, MERGE_NARRATOR_NEUTRAL = data_operations

# Passed as nro_folds to Trainer.cross_validation.
nro_folds = 5

# Trainer

In [None]:
# An earlier draft of the experiment loop used to sit here, disabled inside a
# string literal (which the notebook echoed back as cell output). It has been
# removed; the loop that fills `metrics` is the live implementation.

In [None]:
# Run cross-validation for every (classifier, scenario, data operation) triple.
# Results are stored as metrics[classifier][scenario][operation].
# NOTE(review): a fresh Trainer is built and its data re-prepared for every
# classifier, although the preparation only depends on scenario and operation.
# If Trainer.cross_validation does not mutate the trainer, the preparation
# could be hoisted out of the classifier loop -- confirm before changing.
metrics = {}

for classifier in classifiers:
    metrics[classifier] = {}

    for scenario in scenarios:
        metrics[classifier][scenario] = {}
        # Feature-extraction flags are the same for every data operation of a scenario.
        prepare_kwargs = {
            "lemmatize": scenarios[scenario]["lemmatize"],
            "sbert": scenarios[scenario]["sbert"],
        }

        for operation in data_operations:
            trainer = Trainer(path, target)

            # Apply the class-level modification (if any) before preparing the data.
            if operation == WITHOUT_NARRATOR:
                trainer.remove_class(["narrator"])
            elif operation == MERGE_NARRATOR_NEUTRAL:
                trainer.merge_class(["narrator"], "neutral")
            elif operation != STANDARD:
                # Fail loudly instead of cross-validating on unprepared data.
                raise ValueError(f"Unknown data operation: {operation!r}")

            trainer.prepare_data_to_train(**prepare_kwargs)

            metrics[classifier][scenario][operation] = trainer.cross_validation(model=classifiers[classifier], nro_folds=nro_folds)
            print(f"{classifier} : {scenario} : {operation}")
            print(metrics[classifier][scenario][operation])
            print("=======================")


In [None]:
import matplotlib.pyplot as plt

def compare_boxplot(dict_metrics, labels, width=22, height=10):
    """Draw one figure of box plots per model.

    Each figure is a grid with one row per scenario and one column per metric.
    Every subplot holds one box per data operation, in the iteration order of
    ``dict_metrics[model][scenario]``.

    Parameters
    ----------
    dict_metrics : dict
        Nested mapping ``model -> scenario -> data_operation -> metric -> values``.
        Each ``values`` entry is handed to ``Axes.boxplot`` as the data of one
        box (presumably the per-fold scores -- confirm against the trainer).
    labels : sequence of str
        Tick labels for the boxes, one per data operation, in the same order
        in which the data operations appear in ``dict_metrics``.
    width, height : number
        Figure size, passed to ``plt.subplots`` as ``figsize=(width, height)``.

    Returns
    -------
    None
        Figures are created with ``plt.subplots``; nothing is returned.
    """
    # Regroup to model -> scenario -> metric -> [values of each data operation].
    grouped = {}
    for model, per_scenario in dict_metrics.items():
        grouped[model] = {}
        for scenario, per_operation in per_scenario.items():
            per_metric = grouped[model][scenario] = {}
            for scores in per_operation.values():
                for metric, values in scores.items():
                    per_metric.setdefault(metric, []).append(values)

    for model, per_scenario in grouped.items():
        n_rows = len(per_scenario)
        # The column count is taken from the first scenario of the model.
        n_cols = len(next(iter(per_scenario.values()))) if per_scenario else 0
        if n_rows == 0 or n_cols == 0:
            # Nothing to draw for this model; an empty grid cannot be created.
            continue

        # squeeze=False keeps `axes` two-dimensional even for a single row or
        # column, so axes[row][col] is always valid.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(width, height), squeeze=False)
        fig.suptitle(model)

        for row, per_metric in enumerate(per_scenario.values()):
            for col, (metric, boxes) in enumerate(per_metric.items()):
                cell = axes[row][col]
                # NOTE: Matplotlib 3.9 renamed boxplot's `labels` to `tick_labels`;
                # `labels` is kept here so older Matplotlib versions keep working.
                cell.boxplot(list(boxes), labels=labels, showmeans=True)
                cell.set_title(metric)


In [None]:
# One figure per classifier; the data-operation labels name the boxes.
compare_boxplot(dict_metrics=metrics, labels=data_operations)

# Visualizations 

In [None]:
from visualizations import compare_boxplot

In [None]:
# `labels` is not defined at notebook level (it raised NameError on a fresh run);
# the box labels are the data operations, as in the earlier call.
# NOTE(review): assumes visualizations.compare_boxplot accepts the same
# (dict_metrics, labels) arguments as the function defined in this notebook -- confirm.
compare_boxplot(metrics, data_operations)

In [None]:


# NOTE(review): called without arguments. The compare_boxplot defined earlier in
# this notebook requires dict_metrics and labels; unless the version imported
# from `visualizations` provides defaults, this raises TypeError -- confirm or
# remove this cell.
compare_boxplot()