# Packages:

In [1]:
try:
  from google.colab import drive
  !nvidia-smi
  drive.mount('/content/drive')
  path = 'drive/MyDrive/Thesis/'
except:
  path = './'

/bin/bash: nvidia-smi: command not found
Mounted at /content/drive


In [2]:
# Packages for loading data:
from os import walk
import os
import pprint
import itertools
import json
import re
import pickle
import sys
import warnings

# Packages for effective data storage / math utils:
import pandas as pd
import numpy as np

# Packages for plotting:
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for data cleaning:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Packages for test train data prep:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Packages for text representation:
from sklearn.feature_extraction.text import CountVectorizer

# Misc.:
import time
import multiprocessing

# Fixed random seed; passed as `random_state` to the under-sampler further down.
seed = 101
# Number of CPUs reported by the operating system.
cores = multiprocessing.cpu_count()

# Text Loading:

In [3]:
def file_path_getter(
    filepath : str,
    foldername : str
):
    """List the files that sit directly inside ``filepath/foldername``.

    Args:
        filepath: Path from the notebook to the dataset root directory.
        foldername: Name of the sub-folder (inside ``filepath``) to scan.

    Returns:
        A list of ``"<filepath>/<foldername>/<file>"`` strings, one per file
        found at the top level of the folder (sub-directories are not
        descended into), sorted alphabetically. An empty list is returned
        when the folder does not exist.
    """
    folder = filepath + "/" + foldername

    # `walk` yields (dirpath, dirnames, filenames) tuples; only the first one
    # (the folder itself) is needed. The default covers a missing folder, for
    # which `walk` yields nothing at all.
    filenames = next(walk(folder), (None, None, []))[2]

    # Directory listing order is OS-dependent; sort so the result is the same
    # on every run and every machine.
    return sorted(folder + "/" + file for file in filenames)

# Collect the JSON paths from the train / test / dev folders of the dataset.
filenames_train = file_path_getter("./ECHR_Dataset", "EN_train")
filenames_test = file_path_getter("./ECHR_Dataset", "EN_test")
filenames_dev = file_path_getter("./ECHR_Dataset", "EN_dev")

# De-duplicate, then sort. A bare set of strings iterates in an order that can
# change between interpreter sessions (string hash randomisation), which would
# change the row order of any table built from it and defeat seeded,
# reproducible sampling later on.
filenames_all = sorted(set(itertools.chain(filenames_train, filenames_test, filenames_dev)))

In [None]:
# Keys removed from every case record before it goes into the table
# (defined once instead of being rebuilt for every file).
rem_list = ["VIOLATED_ARTICLES", "VIOLATED_PARAGRAPHS", "VIOLATED_BULLETPOINTS",
            "NON_VIOLATED_ARTICLES", "NON_VIOLATED_PARAGRAPHS", "NON_VIOLATED_BULLETPOINTS",]

cases_all = []
for filename in filenames_all:
    # Skip macOS ".DS_Store" metadata files that may sit next to the JSONs.
    if "DS_Store" in filename:
        continue

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale.
    with open(filename, encoding="utf-8") as f:
        data_temp = json.load(f)

    # TEXT holds several string pieces; join them into one space-separated string.
    data_temp['TEXT'] = " ".join(data_temp['TEXT'])

    for key in rem_list:
        # pop(..., None) tolerates a record that lacks one of these keys
        # instead of aborting the whole load with a KeyError.
        data_temp.pop(key, None)

    cases_all.append(data_temp)

# One row per case; add the whitespace-delimited word count of the raw text.
cases_df_raw = pd.DataFrame(cases_all)
cases_df_raw['Num_words_text'] = cases_df_raw['TEXT'].apply(lambda x:len(str(x).split()))

In [None]:
# Turn the free-text CONCLUSION field into a binary target:
#   1   -> the conclusion starts with "violation" or contains ";violation"
#   0   -> it matches one of the "no violation"-type patterns below
#   NaN -> nothing matched; such rows are dropped at the end of this cell
violation_pattern = "(^violation)|(;violation)"
no_violation_patterns = (
    "^inadmissible",
    "^no violation",
    "^remainder inadmissible",
    "^inapplicable",
    "^lack of jurisdiction",
    "(?:^preliminary objection)(.*)(?:allow)",
    "(?:^preliminary objection)(.*)(?:dismiss)",
    "(?:^preliminary objection)(.*)(?:merit)",
    "(?:^preliminary objection)(.*)(?:reject)",
    "revision rejected",
    "pecuniary",
)

new_CONCLUSION = []
for conclusion in cases_df_raw.CONCLUSION:
    conclusion = conclusion.lower()
    # The violation check comes first, so it wins over any other match.
    if re.search(violation_pattern, conclusion):
        label = 1
    elif any(re.search(pattern, conclusion) for pattern in no_violation_patterns):
        label = 0  # no violation
    else:
        label = np.nan
    new_CONCLUSION.append(label)

cases_df_raw['new_CONCLUSION'] = np.array(new_CONCLUSION)
# how="any" removes every row that has a missing value in ANY column,
# which includes the rows left unlabelled (NaN) above.
cases_df_raw = cases_df_raw.dropna(axis = 0, how = "any")

# Random under-sampling without replacement; 'all' asks the sampler to
# resample every class, and `seed` makes the draw repeatable.
RUS = RandomUnderSampler(
    sampling_strategy='all',
    random_state=seed,
    replacement=False,
)

X_rus, Y_rus = RUS.fit_resample(cases_df_raw.loc[:, cases_df_raw.columns != 'new_CONCLUSION'], cases_df_raw.new_CONCLUSION)

# pd.concat(axis=1) aligns rows by index label. Reset the index on BOTH
# pieces so features and labels are paired by position; resetting only one
# side would mis-pair rows (and create NaN rows) whenever the sampler hands
# back the labels with their original row labels.
cases_df_raw = pd.concat(
    [X_rus.reset_index(drop=True), Y_rus.reset_index(drop=True)],
    axis=1,
)

# One indicator column per judge: split the ';'-separated JUDGES string,
# give each name its own row (explode keeps the case's index label), build
# dummies and add them back up per case.
judges_long = cases_df_raw.JUDGES.str.split(';').explode()
judges_dummy_df = pd.get_dummies(judges_long).groupby(level=0).sum()

# Append the indicator columns, then discard the original JUDGES column.
cases_with_judges = pd.concat([cases_df_raw.reset_index(drop=True), judges_dummy_df], axis=1)
keep_column = cases_with_judges.columns != 'JUDGES'
cases_df_raw = cases_with_judges.loc[:, keep_column]

# Remove columns that contain a single distinct value across all rows.
constant_columns = [
    column
    for column in cases_df_raw.columns
    if len(cases_df_raw[column].unique()) == 1
]
cases_df_raw = cases_df_raw.drop(columns=constant_columns)

In [None]:
# Create the output folder if needed; writing the file does not create
# missing parent directories and would otherwise fail on a fresh checkout.
os.makedirs("./ECHR_Dataset_clean", exist_ok=True)
cases_df_raw.to_parquet("./ECHR_Dataset_clean/data_raw.parquet.gzip", compression='gzip', index = False)

# Text Cleaning:

In [None]:
# Reload the table saved earlier and clean an independent copy of it, so the
# raw text stays available for the raw-vs-clean comparisons below.
cases_df_raw = pd.read_parquet(path="./ECHR_Dataset_clean/data_raw.parquet.gzip")
cases_df_clean = cases_df_raw.copy(deep=True)

In [6]:
def remove_Stopwords(text):
    """Lower-case ``text``, tokenize it and drop NLTK's English stop words.

    Returns the remaining tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token not in english_stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

def lemmatize_text(text):
    """Lemmatize every token of ``text`` with NLTK's ``WordNetLemmatizer``.

    The text is split into sentences, each sentence into tokens, and the
    lemmas of all tokens are returned as one space-separated string.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    ]
    return " ".join(lemmas)

def clean_text(text):
    """Delete ASCII punctuation, collapse whitespace runs and lower-case ``text``.

    Punctuation characters (``string.punctuation``) are removed outright, not
    replaced by a space, so the pieces around them are joined together.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    tokens = text.translate(punctuation_remover).split()
    return " ".join(tokens).lower()

In [None]:
# Cleaning pipeline, applied per document in this order:
# punctuation/case normalisation -> stop-word removal -> lemmatisation.
cleaned_text = (
    cases_df_clean['TEXT']
    .apply(clean_text)
    .apply(remove_Stopwords)
    .apply(lemmatize_text)
)
cases_df_clean['TEXT'] = cleaned_text

# Recompute the whitespace-delimited word count on the cleaned text.
cases_df_clean['Num_words_text'] = cases_df_clean['TEXT'].apply(lambda doc: len(str(doc).split()))

In [None]:
# 80/20 train/test split. `random_state=seed` makes the split identical on
# every run; without it each execution shuffles differently and the saved
# train/test files are not reproducible.
df_train, df_test = train_test_split(cases_df_clean, test_size=0.2, random_state=seed)

# Model input is the cleaned text, target is the binary conclusion label.
df_train_x = df_train.TEXT
df_train_y = df_train.new_CONCLUSION
df_test_x = df_test.TEXT
df_test_y = df_test.new_CONCLUSION

In [None]:
# Persist the four split pieces as pickle files, one file per piece.
pickle_targets = {
    "./ECHR_Dataset_clean/df_train_x.pkl": df_train_x,
    "./ECHR_Dataset_clean/df_train_y.pkl": df_train_y,
    "./ECHR_Dataset_clean/df_test_x.pkl": df_test_x,
    "./ECHR_Dataset_clean/df_test_y.pkl": df_test_y,
}
for target_path, split_piece in pickle_targets.items():
    split_piece.to_pickle(target_path)

In [None]:
# Side-by-side histograms of words per document, before and after cleaning,
# on shared axes so the two panels are directly comparable.
fig, axs = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(12, 7))

# TODO (original author's note): color bars by outcome
histogram_panels = (
    (cases_df_raw, 'Raw - Word Count Distribution', axs[0]),
    (cases_df_clean, 'Clean - Word Count Distribution', axs[1]),
)
for frame, panel_title, panel_ax in histogram_panels:
    frame['Num_words_text'].plot(kind="hist", bins=100, title=panel_title, ax=panel_ax)
plt.show()

## Distribution of Unigrams, Bigrams, Trigrams in clean vs raw:

In [None]:
def get_top_n_x_gram(corpus, x = 1, n=None):
    """Return the ``n`` most frequent ``x``-grams found in ``corpus``.

    Args:
        corpus: Collection of documents (strings) given to ``CountVectorizer``.
        x: N-gram length; only n-grams of exactly this many tokens are counted.
        n: Number of entries to return; ``None`` returns all of them.

    Returns:
        A list of ``(ngram, count)`` tuples ordered by descending total count.
    """
    vectorizer = CountVectorizer(ngram_range=(x, x))
    vectorizer.fit(corpus)
    doc_term_counts = vectorizer.transform(corpus)

    # Column-wise totals: one summed count per vocabulary entry.
    totals = doc_term_counts.sum(axis=0)

    ranked = [
        (ngram, totals[0, column])
        for ngram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 7))

# (corpus, panel title, target axis) for the raw and the cleaned text.
unigram_panels = (
    (cases_df_raw['TEXT'], 'Unigrams - Raw - Top 20 words', axs[0]),
    (cases_df_clean['TEXT'], 'Unigrams - Clean - Top 20 words', axs[1]),
)
for corpus, panel_title, panel_ax in unigram_panels:
    common_words = get_top_n_x_gram(corpus, 1, 20)
    df1 = pd.DataFrame(common_words, columns = ['TEXT', 'count'])
    ranked_counts = df1.groupby('TEXT').sum()['count'].sort_values(ascending=False)
    ranked_counts.plot(kind='bar', title=panel_title, ax=panel_ax)
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 7))

# (corpus, panel title, target axis) for the raw and the cleaned text.
bigram_panels = (
    (cases_df_raw['TEXT'], 'Bigrams - Raw - Top 20 words', axs[0]),
    (cases_df_clean['TEXT'], 'Bigrams - Clean - Top 20 words', axs[1]),
)
for corpus, panel_title, panel_ax in bigram_panels:
    common_words = get_top_n_x_gram(corpus, 2, 20)
    df1 = pd.DataFrame(common_words, columns = ['TEXT', 'count'])
    ranked_counts = df1.groupby('TEXT').sum()['count'].sort_values(ascending=False)
    ranked_counts.plot(kind='bar', title=panel_title, ax=panel_ax)
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 7))

# (corpus, panel title, target axis) for the raw and the cleaned text.
trigram_panels = (
    (cases_df_raw['TEXT'], 'Trigrams - Raw - Top 20 words', axs[0]),
    (cases_df_clean['TEXT'], 'Trigrams - Clean - Top 20 words', axs[1]),
)
for corpus, panel_title, panel_ax in trigram_panels:
    common_words = get_top_n_x_gram(corpus, 3, 20)
    df1 = pd.DataFrame(common_words, columns = ['TEXT', 'count'])
    ranked_counts = df1.groupby('TEXT').sum()['count'].sort_values(ascending=False)
    ranked_counts.plot(kind='bar', title=panel_title, ax=panel_ax)
plt.show()