# Packages:

In [None]:
try:
  from google.colab import drive
  !nvidia-smi
  drive.mount('/content/drive')
  path = 'drive/MyDrive/Thesis WU/'
except:
  path = './'

Tue Jul 11 13:23:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Fetch NLTK's 'punkt' resource through the NLTK downloader.
# NOTE(review): no cell shown in this notebook calls an NLTK tokenizer — confirm this is still needed.
from nltk import download
download('punkt')

In [None]:
# Packages for loading data:
import pickle

# Packages for effective data storage / math utils:
import pandas as pd
import numpy as np

# Packages for plotting:
import seaborn as sns
import matplotlib.pyplot as plt

# Packages for text representation:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import gensim.downloader as api

# Misc.:
import time
import multiprocessing

# Notebook-wide configuration constants.
seed = 101  # shared random-seed value
cores = multiprocessing.cpu_count()  # CPU count; used to size the worker count of the gensim models

# Text Representation (For classical ML)

In [None]:
# Load the cleaned ECHR train/test splits from pickle files under `path`.
# The *_x objects are iterated as text documents by the cells below;
# presumably the *_y objects hold the matching targets — confirm against the cleaning notebook.
# NOTE: read_pickle unpickles arbitrary objects; only load files from a trusted source.
df_train_x = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_x.pkl")
df_train_y = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_y.pkl")
df_test_x  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_x.pkl")
df_test_y  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_y.pkl")

* Bag-of-n-grams:
  * ngram (1, 1) (- PCA) (+ Truncated SVD) (+ LDA)
  * ngram (1, 2) (- PCA) (+ Truncated SVD) (+ LDA)
* TF-IDF:
  * ngram (1, 1) (- PCA) (+ Truncated SVD) (+ LDA)
  * ngram (1, 2) (- PCA) (+ Truncated SVD) (+ LDA)

In [None]:
def text_rep_pipe(
    corpus_train,
    corpus_test,
    ngram_size,
    vectorizer,
    dim_red_method = None,
    tsvd_components = 3000,
    random_state = None,
):
    """Vectorize a train/test corpus and optionally reduce its dimensionality.

    The vectorizer is fitted on ``corpus_train`` only and then applied to
    ``corpus_test``; the same holds for the dimensionality-reduction step.

    Parameters
    ----------
    corpus_train, corpus_test : iterable of str
        Documents to vectorize.
    ngram_size : int
        Upper n-gram length; n-grams in the range ``(1, ngram_size)`` are used.
    vectorizer : {'BoW', 'TFIDF'}
        'BoW' uses ``CountVectorizer`` (int16 counts), 'TFIDF' uses
        ``TfidfVectorizer`` (float32 weights). Both use ``min_df=3`` and
        ``max_df=0.95``.
    dim_red_method : {None, 'tSVD', 'LDA'}, default None
        Optional reduction: truncated SVD, latent Dirichlet allocation, or
        none (the full document-term matrix is returned densely).
    tsvd_components : int, default 3000
        Number of components kept by the truncated SVD.
    random_state : int or None, default None
        Forwarded to the truncated SVD / LDA estimators for reproducibility.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test representations. Without reduction the columns are the
        vocabulary terms; with reduction they are integer component indices.

    Raises
    ------
    ValueError
        If ``vectorizer`` or ``dim_red_method`` is not one of the supported
        values.
    """
    # Validate up front so a typo fails before the expensive vectorization.
    if vectorizer not in ('BoW', 'TFIDF'):
        raise ValueError("Wrong vectorizer input")
    if dim_red_method not in (None, 'tSVD', 'LDA'):
        raise ValueError("Wrong dim_red_method input")

    if vectorizer == 'BoW':
        vec = CountVectorizer(
            ngram_range=(1, ngram_size),
            min_df = 3,
            max_df = 0.95,
            dtype =  np.int16,
        )
    else:
        vec = TfidfVectorizer(
            ngram_range=(1, ngram_size),
            min_df = 3,
            max_df = 0.95,
            dtype = np.float32
        )

    # Keep both matrices sparse; they are only densified when no reduction is
    # requested, because both reducers accept sparse input.
    bow_matrix_train = vec.fit_transform(corpus_train)
    bow_matrix_test = vec.transform(corpus_test)

    print("Vec Done")

    if dim_red_method == 'tSVD':
        tsvd_algo = TruncatedSVD(
            algorithm = 'randomized',
            n_components = tsvd_components,
            random_state = random_state,
        )
        # Cast explicitly to float so the int16 count matrix is handled the
        # same way as the float32 TF-IDF matrix.
        tsvd_train = tsvd_algo.fit_transform(bow_matrix_train.astype(np.float32, copy=False))
        bow_df_train = pd.DataFrame(data=tsvd_train)

        tsvd_test = tsvd_algo.transform(bow_matrix_test.astype(np.float32, copy=False))
        bow_df_test = pd.DataFrame(data=tsvd_test)

    elif dim_red_method == 'LDA':
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4597325/
        n_topics = 35 if vectorizer == 'BoW' else 5
        lda_algo = LatentDirichletAllocation(
            learning_method = 'online',
            n_components = n_topics,
            random_state = random_state,
        )
        lda_train = lda_algo.fit_transform(bow_matrix_train)
        bow_df_train = pd.DataFrame(data=lda_train)

        lda_test = lda_algo.transform(bow_matrix_test)
        bow_df_test = pd.DataFrame(data=lda_test)

    else:
        feature_names = vec.get_feature_names_out()
        bow_df_train = pd.DataFrame(data=bow_matrix_train.toarray(), columns = feature_names)
        bow_df_test = pd.DataFrame(data=bow_matrix_test.toarray(), columns = feature_names)

    print("Dim Red Done")

    return(bow_df_train, bow_df_test)

In [None]:
# Getting n_components for tSVD (manual)
# Fit a large truncated SVD on the bigram TF-IDF matrix and read off how many
# components are needed to pass 80% cumulative explained variance.
vec = TfidfVectorizer(ngram_range=(1, 2), min_df = 3, max_df = 0.95, dtype = np.float32)
# The matrix stays sparse: TruncatedSVD takes sparse input, and the vectorizer
# already emits float32, so no dense copy or float cast is required.
bow_matrix_train = vec.fit_transform(df_train_x)

tsvd_algo = TruncatedSVD(algorithm = 'randomized', n_components = 2500, random_state = seed)
tsvd_train = tsvd_algo.fit_transform(bow_matrix_train)

cumsum_exp_var = np.cumsum(tsvd_algo.explained_variance_ratio_)

# 1-based number of components at which the cumulative ratio first exceeds 0.8;
# None when even all fitted components stay at or below the threshold.
n_th_comp = next((idx + 1 for idx, cum in enumerate(cumsum_exp_var) if cum > 0.8), None)
n_th_comp # 80%

In [None]:
# Getting n_components for LDA (manual)
# Fit one LDA model per candidate topic count on the unigram TF-IDF matrix and
# compare the scores returned by `LatentDirichletAllocation.score`.
vec = TfidfVectorizer(ngram_range=(1, 1), min_df = 3, max_df = 0.95, dtype = np.float32)
# The sparse matrix is passed to LDA directly; no dense copy is needed.
bow_matrix_train = vec.fit_transform(df_train_x)

x = [5, 10, 15, 20, 25]
log_lik = []
for n_topics in x:
    lda_algo = LatentDirichletAllocation(
        learning_method = 'online',
        n_components = n_topics,
        random_state = seed,
    )
    # Only the fitted model is needed for scoring; the topic matrix is not used.
    lda_algo.fit(bow_matrix_train)
    log_lik.append(lda_algo.score(bow_matrix_train))

print(log_lik, np.argmax(log_lik))
# The argmax above is a position in `x`; this is the topic count it refers to.
x[int(np.argmax(log_lik))]

In [None]:
# Each line builds one (train, test) representation; lines are toggled by hand
# depending on which representation is being produced. The bigram TF-IDF tSVD
# pair is enabled because the following cells relabel and save
# tfidf_bi_train_x_tsvd / tfidf_bi_test_x_tsvd.
#bow_uni_train_x,      bow_uni_test_x          = text_rep_pipe(df_train_x, df_test_x, 1, 'BoW', None)
#bow_uni_train_x_tsvd, bow_uni_test_x_tsvd     = text_rep_pipe(df_train_x, df_test_x, 1, 'BoW', 'tSVD')
#bow_uni_train_x_lda,  bow_uni_test_x_lda      = text_rep_pipe(df_train_x, df_test_x, 1, 'BoW', 'LDA')

#bow_bi_train_x,      bow_bi_test_x            = text_rep_pipe(df_train_x, df_test_x, 2, 'BoW', None)
#bow_bi_train_x_tsvd, bow_bi_test_x_tsvd       = text_rep_pipe(df_train_x, df_test_x, 2, 'BoW', 'tSVD')
#bow_bi_train_x_lda,  bow_bi_test_x_lda        = text_rep_pipe(df_train_x, df_test_x, 2, 'BoW', 'LDA')


#tfidf_uni_train_x,      tfidf_uni_test_x      = text_rep_pipe(df_train_x, df_test_x, 1, 'TFIDF', None)
#tfidf_uni_train_x_tsvd, tfidf_uni_test_x_tsvd = text_rep_pipe(df_train_x, df_test_x, 1, 'TFIDF', 'tSVD')
#tfidf_uni_train_x_lda,  tfidf_uni_test_x_lda  = text_rep_pipe(df_train_x, df_test_x, 1, 'TFIDF', 'LDA')

tfidf_bi_train_x,      tfidf_bi_test_x        = text_rep_pipe(df_train_x, df_test_x, 2, 'TFIDF', None)
tfidf_bi_train_x_tsvd, tfidf_bi_test_x_tsvd   = text_rep_pipe(df_train_x, df_test_x, 2, 'TFIDF', 'tSVD')
#tfidf_bi_train_x_lda,  tfidf_bi_test_x_lda    = text_rep_pipe(df_train_x, df_test_x, 2, 'TFIDF', 'LDA')

Vec Done
Dim Red Done


In [None]:
# Relabel the component columns with their string form, in place, on both frames.
for _frame in (tfidf_bi_train_x_tsvd, tfidf_bi_test_x_tsvd):
    _frame.columns = _frame.columns.map(str)

In [None]:
# Persist the classical representations as gzip-compressed parquet files under
# path + "ECHR_Dataset_vec/". One line per frame; only the uncommented lines are
# written, so the active lines must match the frames built in the cells above.
#bow_uni_train_x.to_parquet(path + "ECHR_Dataset_vec/bow_uni_train_x.parquet.gzip", compression='gzip', index = False)
#bow_uni_test_x.to_parquet(path + "ECHR_Dataset_vec/bow_uni_test_x.parquet.gzip", compression='gzip', index = False)
#bow_uni_train_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/bow_uni_train_x_tsvd.parquet.gzip", compression='gzip', index = False)
#bow_uni_test_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/bow_uni_test_x_tsvd.parquet.gzip", compression='gzip', index = False)
#bow_uni_train_x_lda.to_parquet(path + "ECHR_Dataset_vec/bow_uni_train_x_lda.parquet.gzip", compression='gzip', index = False)
#bow_uni_test_x_lda.to_parquet(path + "ECHR_Dataset_vec/bow_uni_test_x_lda.parquet.gzip", compression='gzip', index = False)

#bow_bi_train_x.to_parquet(path + "ECHR_Dataset_vec/bow_bi_train_x.parquet.gzip", compression='gzip', index = False)
#bow_bi_test_x.to_parquet(path + "ECHR_Dataset_vec/bow_bi_test_x.parquet.gzip", compression='gzip', index = False)
#bow_bi_train_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/bow_bi_train_x_tsvd.parquet.gzip", compression='gzip', index = False)
#bow_bi_test_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/bow_bi_test_x_tsvd.parquet.gzip", compression='gzip', index = False)
#bow_bi_train_x_lda.to_parquet(path + "ECHR_Dataset_vec/bow_bi_train_x_lda.parquet.gzip", compression='gzip', index = False)
#bow_bi_test_x_lda.to_parquet(path + "ECHR_Dataset_vec/bow_bi_test_x_lda.parquet.gzip", compression='gzip', index = False)


#tfidf_uni_train_x.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_train_x.parquet.gzip", compression='gzip', index = False)
#tfidf_uni_test_x.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_test_x.parquet.gzip", compression='gzip', index = False)
#tfidf_uni_train_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_train_x_tsvd.parquet.gzip", compression='gzip', index = False)
#tfidf_uni_test_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_test_x_tsvd.parquet.gzip", compression='gzip', index = False)
#tfidf_uni_train_x_lda.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_train_x_lda.parquet.gzip", compression='gzip', index = False)
#tfidf_uni_test_x_lda.to_parquet(path + "ECHR_Dataset_vec/tfidf_uni_test_x_lda.parquet.gzip", compression='gzip', index = False)

#tfidf_bi_train_x.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_train_x.parquet.gzip", compression='gzip', index = False)
#tfidf_bi_test_x.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_test_x.parquet.gzip", compression='gzip', index = False)
tfidf_bi_train_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_train_x_tsvd.parquet.gzip", compression='gzip', index = False)
tfidf_bi_test_x_tsvd.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_test_x_tsvd.parquet.gzip", compression='gzip', index = False)
#tfidf_bi_train_x_lda.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_train_x_lda.parquet.gzip", compression='gzip', index = False)
#tfidf_bi_test_x_lda.to_parquet(path + "ECHR_Dataset_vec/tfidf_bi_test_x_lda.parquet.gzip", compression='gzip', index = False)

* ...2Vec algos
  * Word2Vec
  * Doc2Vec

In [None]:
# Train Word2Vec on the training documents, tokenized by splitting on single spaces.
# The corpus is tokenized once and reused for vocabulary building and training.
w2v_sentences = df_train_x.apply(lambda x: x.split(" ")).tolist()

w2v_model = Word2Vec(
    min_count=3,
    # Leave one core free, but never request zero worker threads.
    workers=max(1, cores - 1)
)
w2v_model.build_vocab(
    w2v_sentences
)
w2v_model.train(
    w2v_sentences,
    total_examples = w2v_model.corpus_count,
    epochs = 30,
    report_delay=1
)

In [None]:
# Train Doc2Vec on the training documents. Each document is tokenized by
# splitting on single spaces and tagged with its positional index.
corpus_iterable = [
    TaggedDocument(doc, [i])
    for i, doc in enumerate(df_train_x.apply(lambda x: x.split(" ")))
]

d2v_model = Doc2Vec(
    min_count=3,
    # Leave one core free, but never request zero worker threads.
    workers=max(1, cores - 1)
)
d2v_model.build_vocab(
    corpus_iterable = corpus_iterable,
)
d2v_model.train(
    corpus_iterable = corpus_iterable,
    # Use this model's own corpus count rather than the Word2Vec model's.
    total_examples = d2v_model.corpus_count,
    epochs = 30,
    report_delay=1
)

In [None]:
def w2v_vectorize(sentence):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``sentence``.

    The text is split on single spaces and tokens missing from
    ``w2v_model.wv`` are skipped. When no token is known, a zero vector of
    length ``w2v_model.vector_size`` is returned so every document maps to a
    vector of the same dimensionality as the trained model.
    """
    known_vecs = [
        w2v_model.wv[token]
        for token in sentence.split(" ")
        if token in w2v_model.wv
    ]
    if not known_vecs:
        # Size the fallback from the model instead of a hard-coded 100.
        return np.zeros(w2v_model.vector_size)
    return np.mean(known_vecs, axis=0)

def d2v_vectorize(document):
    """Infer a Doc2Vec vector for ``document`` using ``d2v_model``.

    The text is split on single spaces, the same tokenization applied to the
    training corpus, so inference sees tokens built the same way as training.
    The inferred vector is returned as a NumPy array.
    """
    tokens = document.split(" ")
    return np.array(d2v_model.infer_vector(tokens))

In [None]:
# Embed every document with each model and stack the vectors row-wise,
# giving one DataFrame per model and split.
w2v_train_x = pd.DataFrame(np.array(list(map(w2v_vectorize, df_train_x))))
w2v_test_x  = pd.DataFrame(np.array(list(map(w2v_vectorize, df_test_x))))
d2v_train_x = pd.DataFrame(np.array(list(map(d2v_vectorize, df_train_x))))
d2v_test_x  = pd.DataFrame(np.array(list(map(d2v_vectorize, df_test_x))))

In [None]:
# Relabel the embedding columns with their string form, in place, on all four frames.
for _frame in (w2v_train_x, w2v_test_x, d2v_train_x, d2v_test_x):
    _frame.columns = _frame.columns.map(str)

In [None]:
# Write the Word2Vec / Doc2Vec representations as gzip-compressed parquet files.
for _frame, _target in (
    (w2v_train_x, path + "ECHR_Dataset_vec/w2v_train_x.parquet.gzip"),
    (w2v_test_x,  path + "ECHR_Dataset_vec/w2v_test_x.parquet.gzip"),
    (d2v_train_x, path + "ECHR_Dataset_vec/d2v_train_x.parquet.gzip"),
    (d2v_test_x,  path + "ECHR_Dataset_vec/d2v_test_x.parquet.gzip"),
):
    _frame.to_parquet(_target, compression='gzip', index = False)

* GloVe - Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download)

In [None]:
# Load the pretrained "glove-wiki-gigaword-300" vectors through gensim's downloader API.
word_vectors = api.load("glove-wiki-gigaword-300")



In [None]:
def GloVe_vectorize(document):
    """Return the mean GloVe vector of the known tokens in ``document``.

    The text is lower-cased and split on whitespace; tokens that are not in
    ``word_vectors`` are ignored. When no token is known, a zero vector of
    length ``word_vectors.vector_size`` is returned.
    """
    hits = []
    for token in document.lower().split():
        if token in word_vectors:
            hits.append(word_vectors[token])
    if not hits:
        return np.zeros(word_vectors.vector_size)
    return np.mean(hits, axis=0)

# Embed every document with GloVe and stack the vectors row-wise per split.
glove_train_x = pd.DataFrame(np.array(list(map(GloVe_vectorize, df_train_x))))
glove_test_x  = pd.DataFrame(np.array(list(map(GloVe_vectorize, df_test_x))))

In [None]:
# Relabel the embedding columns with their string form, in place, on both frames.
for _frame in (glove_train_x, glove_test_x):
    _frame.columns = _frame.columns.map(str)

In [None]:
# Write the GloVe representations as gzip-compressed parquet files.
for _frame, _target in (
    (glove_train_x, path + "ECHR_Dataset_vec/glove_train_x.parquet.gzip"),
    (glove_test_x,  path + "ECHR_Dataset_vec/glove_test_x.parquet.gzip"),
):
    _frame.to_parquet(_target, compression='gzip', index = False)