# Packages:

In [1]:
# Choose the project root: the mounted Google Drive folder when running in
# Colab, otherwise the current working directory.
try:
  from google.colab import drive
  # IPython shell escape; shows the GPU status when the tool is available.
  !nvidia-smi
  drive.mount('/content/drive')
  path = 'drive/MyDrive/Thesis/'
except:
  # NOTE(review): this bare except also swallows Drive mount failures and
  # keyboard interrupts, not only a missing google.colab module - consider
  # narrowing it.
  path = './'

/bin/bash: nvidia-smi: command not found
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Packages for loading data:
from os import walk
import itertools
import json
import re
import pickle

# Packages for effective data storage / math utils:
import pandas as pd
import numpy as np

# Packages for plotting:
import seaborn as sns
import matplotlib.pyplot as plt

# Packages for modeling:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Packages for performance:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Misc.:
import time
import multiprocessing

# Fixed random seed and the CPU count reported by multiprocessing.
# NOTE(review): neither value is referenced in the cells visible here.
seed = 101
cores = multiprocessing.cpu_count()

## Model Evaluation:

In [3]:
def vec_path_getter(
    vecpath : str,
    contains : str,
):
    """Return the paths of the files directly inside ``vecpath`` whose name contains ``contains``.

    Args:
        vecpath: Directory to list; sub-directories are not searched.
        contains: Substring a file name must contain to be kept.

    Returns:
        Sorted list of ``vecpath + "/" + file_name`` strings. The list is
        empty when the directory does not exist or holds no matching file.
    """
    # Only the first triple produced by walk() is consumed, i.e. the top-level
    # directory; the default triple covers a directory that yields nothing.
    _, _, filenames = next(walk(vecpath), (None, None, []))

    # Sorted so the result does not depend on the file-system listing order.
    return sorted(vecpath + "/" + file for file in filenames if contains in file)

# Discover the vectorised feature files and group them per dataset: one list
# per dataset name, holding the train and test feature paths of that dataset.
vec_x_paths = vec_path_getter(
    path + "ECHR_Dataset_vec",
    "x"
    )
pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'
# Match every path once. Files that contain an "x" but are not *_train_x /
# *_test_x feature files give None and are skipped rather than raising
# AttributeError on .group().
_path_matches = [(re.search(pattern, vec_file), vec_file) for vec_file in vec_x_paths]
# Sorted for a reproducible order (a set has no defined iteration order).
unique_datasets = sorted({match.group(0) for match, _ in _path_matches if match})
# Group on the exact dataset name so that one name being a substring of
# another cannot mix the files of two datasets.
vec_x_paths = [
    [vec_file for match, vec_file in _path_matches if match and match.group(0) == name]
    for name in unique_datasets
    ]
# Explicit selection of the datasets to evaluate; this replaces the list that
# was discovered automatically. The paths are built from `path`, so they follow
# the project root chosen in the first cell: with path = 'drive/MyDrive/Thesis/'
# they are exactly the strings that used to be hard-coded here, and with the
# local fallback './' they point at the local copy instead.
vec_dataset_names = [
    # Currently disabled:
    #'tfidf_bi',
    #'tfidf_uni',
    #'bow_bi',
    #'bow_uni',
    'bow_uni_lda',
    'bow_bi_lda',
    'bow_uni_tsvd',
    'bow_bi_tsvd',
    'tfidf_uni_lda',
    'tfidf_bi_lda',
    'tfidf_uni_tsvd',
    'tfidf_bi_tsvd',
    'w2v',
    'd2v',
    'glove',
    ]
# One [train path, test path] pair per dataset name.
vec_x_paths = [
    [f"{path}ECHR_Dataset_vec/{name}_train_x.parquet.gzip",
     f"{path}ECHR_Dataset_vec/{name}_test_x.parquet.gzip"]
    for name in vec_dataset_names
    ]

# Files of the cleaned dataset whose name contains "y" (presumably the label
# files), in alphabetical order.
y_paths = sorted(
    vec_path_getter(path + "ECHR_Dataset_clean", "y")
    )

# Estimator classes (not instances) whose saved models are evaluated; the class
# name is what the evaluation functions use to locate each model's pickle file.
classical_models = [
    LogisticRegression,
    GaussianNB,
    SVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    LGBMClassifier,
    KNeighborsClassifier,
    Perceptron,
    MLPClassifier,
    ]

In [4]:
def run_eval_on_models(models, datasets_paths):
    """Score the saved model of every (model, dataset) pair on the test split.

    For each combination the pickled estimator
    ``{path}ECHR_model/model_<model>__dataset_<dataset>.pkl`` is loaded and
    used to predict the dataset's test features. The function relies on the
    module-level ``path`` and ``y_paths``; the test labels are read from the
    first entry of ``y_paths`` that contains ``'test'``.

    Args:
        models: Estimator classes. The class name selects the pickle file and
            becomes the column label of the result tables.
        datasets_paths: List with one list of feature-file paths per dataset.
            The alphabetically first path of each inner list is used as the
            test split. The lists are not modified.

    Returns:
        Tuple ``(accuracy_df, f1_df, params_df)``: DataFrames indexed by
        dataset name with one column per model, holding the accuracy, the
        macro-averaged F1 score and the model's ``get_params()`` dict. A cell
        is NaN when the model could not be loaded or evaluated (the reason is
        printed). Three empty DataFrames are returned when ``models`` is empty.
    """
    y_test = pd.read_pickle([i for i in y_paths if 'test' in i][0])
    pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'

    # One Series per model is collected for each of the three result tables.
    accuracy_dfs = []
    f1_dfs = []
    params_dfs = []

    for model in models:
        # Resolved outside the dataset loop so the name is defined even when
        # datasets_paths is empty.
        temp_model_name = getattr(model, '__name__', type(model).__name__)

        # Per-model results, keyed by dataset name.
        accuracy_dict = {}
        f1_dict = {}
        params_dict = {}
        for datasets in datasets_paths:
            # "..._test_x..." sorts before "..._train_x...". A sorted copy is
            # used so the caller's list is not reordered.
            test_path = sorted(datasets)[0]
            temp_data_name = re.search(pattern, test_path).group(0)
            print(f"Assesing model {temp_model_name} on dataset {temp_data_name}")

            # Read the test features of this dataset:
            x_test = pd.read_parquet(test_path)

            # Load the saved model - if it exists - and score it:
            try:
                filename = f"{path}ECHR_model/model_{temp_model_name}__dataset_{temp_data_name}.pkl"
                # NOTE: unpickling can execute arbitrary code; only load model
                # files from a trusted source.
                with open(filename, 'rb') as model_file:
                    loaded_model = pickle.load(model_file)
                y_pred = loaded_model.predict(x_test)
                params = loaded_model.get_params()
                accuracy = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='macro')
            except Exception as err:
                # Best effort: keep going, but say why this cell stays empty.
                print(f"  -> no result for {temp_model_name} on {temp_data_name}: {err!r}")
                params = np.nan
                accuracy = np.nan
                f1 = np.nan

            # Store the results in the dictionaries:
            accuracy_dict[temp_data_name] = accuracy
            f1_dict[temp_data_name] = f1
            params_dict[temp_data_name] = params

        # Add the results for this model to the lists:
        accuracy_dfs.append(pd.Series(accuracy_dict, name=temp_model_name))
        f1_dfs.append(pd.Series(f1_dict, name=temp_model_name))
        params_dfs.append(pd.Series(params_dict, name=temp_model_name))

    # pd.concat raises on an empty list, so handle "no models" explicitly.
    if not accuracy_dfs:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Put the per-model Series side by side (one column per model):
    accuracy_df = pd.concat(accuracy_dfs, axis=1)
    f1_df = pd.concat(f1_dfs, axis=1)
    params_df = pd.concat(params_dfs, axis=1)

    return accuracy_df, f1_df, params_df

In [5]:
# Evaluate the saved model of every listed classifier on every selected dataset.
accuracy_df, f1_df, params_df = run_eval_on_models(classical_models, vec_x_paths)

Assesing model LogisticRegression on dataset bow_uni_lda
Assesing model LogisticRegression on dataset bow_bi_lda
Assesing model LogisticRegression on dataset bow_uni_tsvd
Assesing model LogisticRegression on dataset bow_bi_tsvd
Assesing model LogisticRegression on dataset tfidf_uni_lda
Assesing model LogisticRegression on dataset tfidf_bi_lda
Assesing model LogisticRegression on dataset tfidf_uni_tsvd
Assesing model LogisticRegression on dataset tfidf_bi_tsvd
Assesing model LogisticRegression on dataset w2v
Assesing model LogisticRegression on dataset d2v
Assesing model LogisticRegression on dataset glove
Assesing model GaussianNB on dataset bow_uni_lda
Assesing model GaussianNB on dataset bow_bi_lda
Assesing model GaussianNB on dataset bow_uni_tsvd
Assesing model GaussianNB on dataset bow_bi_tsvd
Assesing model GaussianNB on dataset tfidf_uni_lda
Assesing model GaussianNB on dataset tfidf_bi_lda
Assesing model GaussianNB on dataset tfidf_uni_tsvd
Assesing model GaussianNB on dataset t

In [6]:
# Keep four decimals in the accuracy table.
accuracy_df = accuracy_df.round(4)

In [17]:
# Accuracy of the MLP classifier per dataset, as a plain array.
np.array(accuracy_df["MLPClassifier"])

array([0.6533, 0.6618, 0.8033, 0.8103, 0.559 , 0.5845,    nan, 0.8274,
       0.7837, 0.7707, 0.5083])

In [None]:
# Display the macro-F1 table; for a LaTeX export use round(f1_df, 2).to_latex().
f1_df

Unnamed: 0,LogisticRegression,GaussianNB,SVC,DecisionTreeClassifier,RandomForestClassifier,LGBMClassifier,KNeighborsClassifier,Perceptron,MLPClassifier
bow_uni_lda,0.629107,0.606237,0.64808,0.606118,0.680722,0.677392,0.656768,0.391938,0.653132
bow_bi_lda,0.632657,0.61502,0.644406,0.628494,0.693356,0.675454,0.657628,0.3416,0.661806
bow_uni_tsvd,0.799799,0.483662,,0.604563,0.736363,0.807273,,0.758577,0.803311
bow_bi_tsvd,0.815802,0.477966,,0.580973,,,,,
tfidf_uni_lda,0.490672,0.475455,0.441789,0.54079,0.54509,0.555754,0.552757,0.329633,0.537633
tfidf_bi_lda,0.355211,0.355211,0.354404,0.568615,0.572215,0.580776,0.565721,0.336993,0.583678
tfidf_uni_tsvd,0.804773,0.637258,0.793617,0.620462,0.705967,0.758041,0.704351,0.777175,
tfidf_bi_tsvd,0.82975,0.620551,0.821846,0.594429,,0.769671,,,
w2v,0.754122,0.639037,0.729441,0.649842,0.746471,0.77614,0.719969,0.726031,0.783153
d2v,0.733562,0.615813,0.778111,0.575373,0.724648,0.760647,0.676388,0.659547,0.770446


In [None]:
def run_eval_on_models2(models, datasets_paths, req_params):
    """Print selected hyper-parameters of the saved model of every (model, dataset) pair.

    The pickled estimator
    ``{path}ECHR_model/model_<model>__dataset_<dataset>.pkl`` is loaded
    (``path`` is the module-level project root) and the entries of its
    ``get_params()`` named in ``req_params`` are printed.

    Args:
        models: Estimator classes; the class name selects the pickle file.
        datasets_paths: List with one list of feature-file paths per dataset;
            only the dataset name is extracted from them. The lists are not
            modified.
        req_params: Names of the parameters to print.

    Returns:
        None. For every pair a header line, the parameter dict (or ``nan``
        when the model or a requested parameter is unavailable, together with
        the reason) and a blank line are printed.
    """
    pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'

    for model in models:
        temp_model_name = getattr(model, '__name__', type(model).__name__)
        for datasets in datasets_paths:
            # A sorted copy is used so the caller's list is not reordered.
            first_path = sorted(datasets)[0]
            temp_data_name = re.search(pattern, first_path).group(0)
            print(f"Assesing model {temp_model_name} on dataset {temp_data_name}")

            # Load the saved model - if it exists:
            try:
                filename = f"{path}ECHR_model/model_{temp_model_name}__dataset_{temp_data_name}.pkl"
                # NOTE: unpickling can execute arbitrary code; only load model
                # files from a trusted source.
                with open(filename, 'rb') as model_file:
                    loaded_model = pickle.load(model_file)
                all_params = loaded_model.get_params()
                params = {k: all_params[k] for k in req_params}
            except Exception as err:
                # Best effort: keep going, but say why nothing can be shown.
                print(f"  -> could not read parameters: {err!r}")
                params = np.nan

            print(params)
            print()

In [None]:
# Hyper-parameter search space of the model inspected in the next cell
# (currently MLPClassifier). The grids of the other models are kept below for
# reference; to inspect another model, replace the dict with its grid.
#
# LogisticRegression:
#   {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#    'penalty': [None, 'l1', 'l2'],
#    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
# GaussianNB:
#   {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]}
# SVC:
#   {'C': [0.001, 0.01, 0.1, 1, 10],
#    'gamma': [1, 0.1, 0.01, 0.001],
#    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
# DecisionTreeClassifier:
#   {'criterion': ['gini', 'entropy', 'log_loss'],
#    'max_depth': np.arange(10, 50, 2),
#    'splitter': ['best', 'random'],
#    'ccp_alpha': np.arange(0, 0.2, 0.01)}
# RandomForestClassifier:
#   {'n_estimators': np.arange(10, 200, 10),
#    'criterion': ['gini', 'entropy', 'log_loss'],
#    'max_depth': np.arange(10, 50, 2),
#    'ccp_alpha': np.arange(0, 0.2, 0.01)}
# LGBMClassifier:
#   {'boosting_type': ['gbdt', 'dart', 'goss'],
#    'num_leaves': [10, 50, 100, 200],
#    'max_depth': [5, 10, 15, 20, 50],
#    'learning_rate': [0.01, 0.05, 0.1, 0.2],
#    'n_estimators': [100, 200, 500, 1000],
#    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]}
# KNeighborsClassifier:
#   {'n_neighbors': np.arange(10, 50, 2),
#    'weights': ['uniform', 'distance'],
#    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#    'p': [1, 2, 3]}
# Perceptron:
#   {'penalty': [None, 'l1', 'l2', 'elasticnet'],
#    'alpha': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
#    'max_iter': np.arange(1000, 5000, 1000)}
params = dict(
    hidden_layer_sizes=[(50,50,50), (50,100,50), (100,)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

In [None]:
# Rebinds the model list to the single model whose saved parameters are
# printed; the other entries stay commented out so they can be switched on.
classical_models = [
    # LogisticRegression,
    # GaussianNB,
    # SVC,
    # DecisionTreeClassifier,
    # RandomForestClassifier,
    # LGBMClassifier,
    # KNeighborsClassifier,
    # Perceptron,
    MLPClassifier,
    ]
# Only the names of the searched hyper-parameters are passed on, not the grids.
run_eval_on_models2(classical_models, vec_x_paths, list(params.keys()))

Assesing model MLPClassifier on dataset bow_uni_lda
{'hidden_layer_sizes': (50, 50, 50), 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.001, 'learning_rate': 'invscaling'}

Assesing model MLPClassifier on dataset bow_bi_lda
{'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'alpha': 1e-05, 'learning_rate': 'adaptive'}

Assesing model MLPClassifier on dataset bow_uni_tsvd
{'hidden_layer_sizes': (50, 100, 50), 'activation': 'identity', 'solver': 'sgd', 'alpha': 0.001, 'learning_rate': 'adaptive'}

Assesing model MLPClassifier on dataset bow_bi_tsvd
{'hidden_layer_sizes': (50, 100, 50), 'activation': 'identity', 'solver': 'sgd', 'alpha': 0.001, 'learning_rate': 'adaptive'}

Assesing model MLPClassifier on dataset tfidf_uni_lda
{'hidden_layer_sizes': (50, 50, 50), 'activation': 'tanh', 'solver': 'lbfgs', 'alpha': 1e-05, 'learning_rate': 'invscaling'}

Assesing model MLPClassifier on dataset tfidf_bi_lda
{'hidden_layer_sizes': (50, 100, 50), 'activation': 'relu', 'so

## Size Evaluation

In [None]:
# Datasets whose size is reported. The paths are built from `path`, so they
# follow the project root chosen in the first cell: with
# path = 'drive/MyDrive/Thesis/' they are exactly the strings that used to be
# hard-coded here, and with the local fallback './' they point at the local copy.
size_eval_dataset_names = [
    # Currently disabled:
    #'tfidf_bi',
    #'tfidf_uni',
    #'bow_bi',
    #'bow_uni',
    'bow_uni_lda',
    'bow_bi_lda',
    'tfidf_uni_lda',
    'tfidf_bi_lda',
    'tfidf_uni_tsvd',
    'tfidf_bi_tsvd',
    'bow_uni_tsvd',
    'bow_bi_tsvd',
    'w2v',
    'd2v',
    'glove',
    ]
# One [train path, test path] pair per dataset name.
vec_x_paths = [
    [f"{path}ECHR_Dataset_vec/{name}_train_x.parquet.gzip",
     f"{path}ECHR_Dataset_vec/{name}_test_x.parquet.gzip"]
    for name in size_eval_dataset_names
    ]

In [None]:
def run_size_eval(datasets_paths):
    """Print the column count and in-memory size of every dataset.

    The train and test feature files of a dataset are read and stacked
    row-wise, so the reported size covers both splits together.

    Args:
        datasets_paths: List with one list of two parquet paths per dataset.
            After alphabetical ordering the first path is read as the test
            split and the second as the train split. The lists are not
            modified.

    Returns:
        None. For every dataset the name, the number of columns and the
        memory usage in MB (bytes / 1,000,000) are printed.
    """
    pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'

    for datasets in datasets_paths:
        # "..._test_x..." sorts before "..._train_x...". A sorted copy is used
        # so the caller's list is not reordered.
        ordered_paths = sorted(datasets)
        temp_data_name = re.search(pattern, ordered_paths[0]).group(0)
        print(f"Assesing dataset {temp_data_name}")

        # Read both splits:
        x_train = pd.read_parquet(ordered_paths[1])
        x_test = pd.read_parquet(ordered_paths[0])

        x_df = pd.concat([x_train, x_test], axis=0)
        print(f"{x_df.shape[1]} columns")
        print(f"{x_df.memory_usage(deep=True).sum()/1000000} MB")
        print()

In [None]:
# Report the size of every selected dataset.
# NOTE(review): the saved output also lists tfidf_uni and bow_uni, which are
# commented out in vec_x_paths - presumably it stems from an earlier run.
run_size_eval(vec_x_paths)

Assesing dataset tfidf_uni
35475 columns
2827.862912 MB

Assesing dataset bow_uni
35475 columns
707.025512 MB

Assesing dataset bow_uni_lda
35 columns
2.869632 MB

Assesing dataset bow_bi_lda
35 columns
2.869632 MB

Assesing dataset tfidf_uni_lda
5 columns
0.278992 MB

Assesing dataset tfidf_bi_lda
5 columns
0.278992 MB

Assesing dataset tfidf_uni_tsvd
1940 columns
77.400352 MB

Assesing dataset tfidf_bi_tsvd
3000 columns
119.647712 MB

Assesing dataset bow_uni_tsvd
700 columns
27.978912 MB

Assesing dataset bow_bi_tsvd
505 columns
20.206992 MB

Assesing dataset w2v
100 columns
4.065312 MB

Assesing dataset d2v
100 columns
4.065312 MB

Assesing dataset glove
300 columns
12.036512 MB

