In [1]:
#before you commence this process, be sure to have mallet installed and built using Apache Ant
#http://mallet.cs.umass.edu/download.php
#https://programminghistorian.org/en/lessons/topic-modeling-and-mallet
#https://ant.apache.org/manual/install.html
#import the necessary functions 
import warnings
import datetime
import matplotlib
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from os import listdir
from os.path import isfile, join
import os
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.test.utils import common_corpus
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [33]:
# Define paths and variables.
# Labels available for `name` / `case` below; only ALL_CASES and ALL_DOCS are used in this section.
KUNARAC = "Kunarac"
BRIMA = "Brima"
BEMBA = "Bemba"
ALL_CASES = "All Cases"
VICTIMS = "victims"
JUDGEMENTS = "judgements"
WITNESSES = "Witnesses"
ALL_DOCS = "alldocs"
# NOTE(review): DEF_ITERATIONS / DEF_TOPICS are not referenced by the cells shown here;
# create_lda_model declares its own defaults (num_topics=10, iterations=5000).
DEF_ITERATIONS = 1000
DEF_TOPICS = 10
# MALLET installation: MALLET_HOME is exported to the environment, PATH_TO_MALLET is handed to LdaMallet.
os.environ['MALLET_HOME'] = "C:\\mallet"
PATH_TO_MALLET = "C:\\mallet\\bin\\mallet"
# Root folder for the topic text files written by save_topics_to_text (note the trailing separator).
PATH_TO_TXT_MODELS = "C:\\results\\"
# Folder with the raw documents passed to create_data.
DATA_PATH = "C:\\data\\"
# Text file read line by line when building stop_words.
STOP_WORDS_PATH = "C:\\stopwords\\updated_stop_words.txt"
# `name` prefixes the saved model name in create_lda_model;
# `case` is the sub-folder of PATH_TO_TXT_MODELS used by save_topics_to_text.
name = ALL_CASES
case = ALL_DOCS
# Target folder passed to save_processed_files.
processed_folders_path = "C:\\processedfolder"

In [3]:
# Words that lemmatize_files routes to special_lemmatizer instead of the WordNet lemmatizer.
# NOTE(review): lemmatize_files compares the *lowercased* word against this list, so the
# capitalised "Serbs" entry can never match as written.
SPECIAL_WORDS = ["was", "us", "as", "less", "has", "media", "Serbs", "muslims", "raped", "man", "commander", "worked",
                 "men", "does", "gave", "happened", "commander", "heard", "judes", "fo"]

# Stop words = every line of the stop-word file plus stray accented characters and
# corpus-specific tokens. The file entries are concatenated into ONE flat list: nesting
# them as a single list element would make `word in stop_words` never match any of them.
# The file is read inside a `with` block so the handle is closed deterministically.
with open(STOP_WORDS_PATH, "r") as stop_words_file:
    stop_words = [token.strip("\n") for token in stop_words_file]
stop_words += ["è", "ñ", "á", "š", "â", "ý", "ž", "û", "ê", "ë", "õ", "ï", "ó", "þ", "ä", "ü", "î", "é", "ô", "ö", "ii", "mr", "eng", "alex-tamba-brima", "kunarac", "tf1", "pp", "icc", "witness", "wt", "steiner", "pppp", "presiding", "otp", "pursuant", "page", "judge", "honour", "scsl", "trial", "musa", "freetown", "redacted"]

In [4]:
def get_files(dir_path):
    """
    Read every regular file directly inside a folder.
    Sub-directories are skipped; each file name is printed as it is read.
    :param dir_path: the path to the folder holding the files
    :return: a list with one entry per file, each entry being the list of that
             file's lines (line endings kept, undecodable bytes ignored)
    """
    # keep only regular files (no sub-directories)
    file_names = [entry for entry in listdir(dir_path) if isfile(join(dir_path, entry))]
    all_files = []
    for file_name in file_names:
        print(file_name)
        # join() builds the path with the platform separator, and the with-block
        # closes each handle instead of leaking one per document
        with open(join(dir_path, file_name), "r", errors='ignore') as n_file:
            all_files.append(n_file.readlines())
    return all_files


In [5]:
def discard_enumeration(file_list):
    """
    Lowercase every line, strip a leading line number and drop procedural lines.
    A line is dropped entirely when it
      * is the "blank page inserted ..." pagination notice,
      * contains "open session" (case-insensitive), or
      * contains both "ENG" and "CT" in upper case (presumably the transcript
        page header -- TODO confirm against the raw data).
    For every other line, the first space-separated token is removed when it
    consists only of digits. Lines keep their trailing newline; files that are
    not enumerated pass through unchanged apart from lowercasing.
    :param file_list: the documents, each a list of raw lines (None entries are skipped)
    :return: the list of documents without line numbers and procedural lines
    """
    blank_page_notice = "blank page inserted to ensure pagination corresponds between the french and"
    new_files = []
    for file in file_list:
        if file is None:
            continue
        lines = []
        for raw_line in file:
            line = raw_line.lower()
            # compare against the lowercased, stripped text; the raw line carries a newline
            if line.strip() == blank_page_notice:
                continue
            if "open session" in line:
                continue
            # case-sensitive on purpose: must be tested on the raw line, not the lowercased one
            if "ENG" in raw_line and "CT" in raw_line:
                continue
            words = line.split(" ")
            if words[0].isdigit():
                words = words[1:]
            lines.append(" ".join(words))
        new_files.append(lines)
    return new_files


In [6]:
def tokenize_files(file_list):
    """
    Turn each document (a list of lines) into a flat list of lowercase tokens.
    The lines of a document are concatenated as-is and split with the
    regular expression r'\\w+', so punctuation and whitespace are discarded.
    :param file_list: the documents, each a list of line strings
    :return: one list of lowercase tokens per document
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return [
        [token.lower() for token in word_tokenizer.tokenize("".join(document))]
        for document in file_list
    ]


In [7]:
def basic_NER(file_list):
    """
    A very basic and manual NER for the specific files we worked on.
    The function scans each document token by token, looks at a small window of
    neighbouring tokens, and replaces known name variants / multi-word names by
    a single pre-defined token.
    :param file_list: the documents, each a list of tokens
    :return: the list of documents with the specific names replaced.

    NOTE(review): several branches (double names, "alex", "ibrahim", "sam"/"bockarie",
    "brima") drop the current token when the expected neighbour is not in the
    window; that behaviour is kept as it was -- confirm it is intended.
    """
    names = {"radomir": "kovac", "klanfa" : "kovac",
             "zaga" : "kunarac", "dragoljub":"kunarac", "zoran" : "vukovic",
              "gullit": "alex-tamba-brima",
             "saj": "musa"}
    double_names = {"rojoj": "pass", "sierra": "leone", "buk": "bijela",
                    "cerova": "ravan", "lepa": "brena", "kp": "dom",
                    "junior": "lion"}
    triple_names = {"santigie": ["borbor", "kanu"]}
    window = 3
    new_files = []
    for file in file_list:
        new_file = []
        i = 0
        while i < len(file):
            word = file[i].lower()
            # Clamp the window start at 0: a negative start index would wrap around
            # to the end of the list and yield an empty window for the first tokens.
            word_win = [w.lower() for w in file[max(0, i - window):i + window]]
            if word == "leone":
                new_file.append("sierra-leone")
            elif word in names:
                new_file.append(names[word])
            elif word in double_names:
                # use the lowercased window, like every other branch
                if double_names[word] in word_win or "leonean" in word_win:
                    new_file.append(word + "-" + double_names[word])
                    i += 1  # skip the following token (assumed to be the second part)
            elif word in triple_names:
                new_file.append(word + "-" + triple_names[word][0] + "-" + triple_names[word][1])
                if triple_names[word][0] in word_win:
                    if triple_names[word][1] in word_win:
                        i += 2
                    else:
                        i += 1
            elif word == "alex":
                if 'brima' in word_win:
                    new_file.append("alex-tamba-brima")
                    i += 2
            elif word == "tamba":
                new_file.append("alex-tamba-brima")
                i += 1
            elif word == "ibrahim":
                if 'bazzy' in word_win or 'kamara' in word_win:
                    new_file.append("brima-kamara")
                    i += 2
            elif word == "sam" or word == "bockarie":
                if "bockarie" in word_win:
                    new_file.append("mosquito")
                    i += 1
            elif word == 'bazzy':
                new_file.append("brima-kamara")
                i += 1
            elif word == "brima":
                # both tokens must be present; `'et' and 'al' in ...` only tested 'al'
                if 'et' in word_win and 'al' in word_win:
                    i += 2
            else:
                new_file.append(word)
            i += 1
        new_files.append(new_file)
    return new_files

In [8]:
def lemmatize_files(file_list):
    """
    Lemmatize every token of every document.
    Tokens whose lowercase form is listed in SPECIAL_WORDS go through
    special_lemmatizer; all other tokens go through the WordNet lemmatizer.
    :param file_list: the documents, each a list of tokens
    :return: the documents with every token replaced by its lemma
    """
    wordnet = WordNetLemmatizer()

    def _lemma(token):
        # hand-written overrides win over WordNet
        if token.lower() in SPECIAL_WORDS:
            return special_lemmatizer(token)
        return wordnet.lemmatize(token)

    return [[_lemma(token) for token in document] for document in file_list]


In [9]:
def special_lemmatizer(word):
    """
    lemmatizer for the words that are wrongly lemmatized in the WORDNETLEMMATIZER.
    The word is lowercased first, so the lookup is case-insensitive and every
    returned lemma is lowercase.
    :param word: the word to lemmatize
    :return: the right form of the word after lemmatizing; words without an
             override are returned lowercased and otherwise unchanged
    """
    overrides = {
        "raped": "rape",
        "redaction": "redacted",
        "men": "man",
        "was": "is",
        "commander": "command",
        "worked": "work",
        "muslims": "muslim",
        # the key must be lowercase: the input is lowercased before the lookup
        "serbs": "serb",
        "gave": "give",
        "does": "do",
        "doent": "do",
        "happened": "happen",
        "vukovi": "vukovic",
        # SPECIAL_WORDS lists "heard"; the "herad" spelling is kept for compatibility
        "heard": "hear",
        "herad": "hear",
        "fo": "foca",
        "dragoljub": "",
    }
    word = word.lower()
    return overrides.get(word, word)

In [10]:
def clean(files_list, stop_words):
    """
    Filter unwanted tokens out of every document and lowercase the rest.
    A token is discarded when its lowercase form is in stop_words, when it
    contains the character "ÿ", or when it consists only of digits.
    :param files_list: the documents to clean, each a list of tokens
    :param stop_words: the collection of words to discard
    :return: the cleaned documents, all tokens lowercased
    """
    def _keep(token):
        if token.lower() in stop_words:
            return False
        if "ÿ" in token:
            return False
        return not token.isdigit()

    return [[token.lower() for token in document if _keep(token)] for document in files_list]

In [11]:
def count_freq(files_list):
    """
    Count how often every word occurs across all documents.
    The result helps to decide on a threshold for deleting too-frequent words
    from the data.
    :param files_list: the documents, each a list of tokens
    :return: a list of (count, word) tuples sorted in descending order
             (by count, ties broken by word, also descending)
    """
    frequencies = {}
    for file in files_list:
        for word in file:
            frequencies[word] = frequencies.get(word, 0) + 1
    # hand the table back to the caller instead of computing and discarding it
    return sorted(((value, key) for (key, value) in frequencies.items()), reverse=True)

In [12]:
def check_words(file_list):
    """
    Count the title-cased words of raw (untokenized) documents.
    Every line is split on single spaces; each piece is stripped of surrounding
    whitespace, the characters ! # ? . , - and backslash are removed, and the
    result is title-cased before counting. The table and its length are printed.
    :param file_list: the documents, each a list of line strings
    :return: a list of (count, word) tuples sorted in descending order
    """
    counts = {}
    for file in file_list:
        for line in file:
            for word in line.split(" "):
                # '-' and '\' are escaped so they are literals; unescaped, ",-\" forms a
                # character RANGE that also swallows digits and upper-case letters
                word = re.sub(r'[!#?.,\-\\]', '', word.strip()).title()
                counts[word] = counts.get(word, 0) + 1
    ranked = sorted(((value, key) for (key, value) in counts.items()), reverse=True)
    print(ranked)
    print(len(ranked))
    return ranked

In [13]:
def create_data(path):
    """
    Run the whole preprocessing pipeline on the files of one folder:
    read, strip line numbers, tokenize, merge names, lemmatize, and finally
    remove the module-level stop_words.
    :param path: the folder holding the raw documents
    :return: processed files.
    """
    documents = get_files(path)
    # each stage takes the list of documents and returns the transformed list
    for stage in (discard_enumeration, tokenize_files, basic_NER, lemmatize_files):
        documents = stage(documents)
    return clean(documents, stop_words)



In [14]:
def create_lda_model(file_path, case="default", num_topics=10, iterations=5000, save=True):
    """
    Train a MALLET LDA model on pre-processed documents.
    :param file_path: despite the name, the pre-processed corpus itself: an
                      iterable of token lists (it is fed to corpora.Dictionary
                      and to doc2bow, not opened as a path)
    :param case: accepted but not used inside this function
    :param num_topics: number of topics for the model
    :param iterations: number of iterations the model will run
    :param save: whether to save the model and write its topics to a text file;
                 the saved name starts with the module-level `name` and carries
                 the topic count and the current hour/minute
    :return: the ready lda model
    """
    # document-term representation
    id2word = corpora.Dictionary(file_path)
    bow_corpus = [id2word.doc2bow(document) for document in file_path]
    model = LdaMallet(PATH_TO_MALLET, corpus=bow_corpus, num_topics=num_topics,
                      id2word=id2word, iterations=iterations)
    # time stamp used in the saved name
    timestamp = datetime.datetime.now()
    if not save:
        return model
    serial_name = name + "_" + "{}_topics_{}h{}m".format(num_topics, timestamp.hour, timestamp.minute)
    print("saving as " + serial_name)
    print(" ")
    model.save(serial_name)
    save_topics_to_text(model, serial_name)
    return model

# Set the module-level `name` to the prefix you want for the output files
# (e.g. with name = "Bemba" the saved model and topic file names start with "Bemba").

In [15]:
def save_topics_to_text(lda_model, name):
    """
    Write the topics of a model to a txt file named after the model.
    The file is created in the sub-folder `case` (module-level) of
    PATH_TO_TXT_MODELS. For every topic the file holds a "Topic <id>:" header
    followed by its 20 top words, comma separated, with a line break after the
    tenth word and a full stop after the last one.
    :param lda_model: the model
    :param name: the name of the saved txt file (without extension)
    :return: None
    """
    topics = lda_model.print_topics(num_words=20)
    # the with-block guarantees the file is flushed and closed, even on errors
    with open(PATH_TO_TXT_MODELS + case + "\\" + name + ".txt", 'w+') as out_file:
        for topic in topics:
            out_file.write("Topic " + str(topic[0]) + ": \n\n")
            # each term looks like  weight*"word" ; terms are joined with "+"
            words = topic[1].split("+")
            for i in range(len(words)):
                word = words[i].split("*")
                word = word[1].strip()
                word = word.replace("\"", "")
                out_file.write(word)
                if i < len(words)-1:
                    out_file.write(", ")
                else:
                    out_file.write(".")
                if i == 9:
                    out_file.write("\n")
            out_file.write("\n\n")


In [16]:
# Run the full preprocessing pipeline on every file in DATA_PATH
# (get_files prints each file name as it is read).
ALL_CASE_DATA = create_data(DATA_PATH)

01.05.12_CAR‐V20‐PPPP‐0001_FPVD2.txt
01.06.11_CAR-OTP-PPPP-0209_MPWX5.txt
01.07.05_TF1-072_MPWD1.txt
01.07.05_TF1-072_MPWJ1.txt
01.07.05_TF1-072_MPWX1.txt
01.07.11_CAR-OTP-PPPP-0169_MPWD1.txt
01.08.06_DAB-023_MDWD2.txt
01.11.11_CAR-OTP-PPPP-0047_MPWX5.txt
01.11.13-CAR-D04-PPPP-0054_MDWX2.txt
01.11.13-CAR-D04-PPPP-0054_MVWD1.txt
01.12.10_CAR-OTP-PPPP-0022_FPVD1.txt
02.05.12_CAR-V20-PPPP-0001_FPVD3.txt
02.05.12_CAR-V20-PPPP-0001_FPVX4.txt
02.08.06_DBK-063_MDWD1.txt
02.08.06_DBK-063_MDWX1.txt
02.09.11_CAR-OTP-PPPP-0178_MPWD2.txt
02.09.11_CAR-OTP-PPPP-0178_MPWX3.txt
02.09.13-CAR-D04-PPPP-0030_FDWD2.txt
02.09.13-CAR-D04-PPPP-0030_FDWX1.txt
02.10.06_DAB-033_MDWD2.txt
02.10.06_DAB-033_MDWJ1.txt
02.10.06_DAB-033_MDWX2.txt
02.10.06_DAB-059_MDWX2.txt
02.10.06_DAB-137_MDWD1.txt
02.11.11_CAR-OTP-PPPP-0047_MPWX6.txt
03.02.11_CAR-OTP-PPPP-0082_FPVX1.txt
03.03.13-CAR-D04-PPPP-0019_MDWX3.txt
03.04.00_procedural discussion on witnesses.docx
03.04.00_W75_FPVD2.txt
03.04.00_W75_FPVX1.txt
03.05.00_W185_FP

12.05.11_CAR-OTP-PPPP-0063_MPWD1.txt
12.06.06_Brima_MDAD5.txt
12.06.13-CAR-D04-PPPP-0002_MDWD1.txt
12.06.13-CAR-D04-PPPP-0002_MDWX1.txt
12.07.00_Radinovic_MDED2.txt
12.07.00_Radinovic_MDEX2.txt
12.07.05_TF1-033_MPWJ1.txt
12.07.05_TF1-033_MPWX2.txt
12.07.05_TF1-055_MPWD1.txt
12.09.00_Dunjic_MDED2.txt
12.09.00_Dunjic_MDEX1.txt
12.09.00_Raskovic-Ivic_MDED1.txt
12.09.00_Raskovic-Ivic_MDED2.txt
12.09.00_Raskovic-Ivic_MDEX1.txt
12.09.06_DAB-101_MDWD1.txt
12.09.06_DAB-101_MDWX1.txt
12.09.06_DAB-123_MDWD2.txt
12.09.06_DAB-123_MDWX1.txt
12.09.06_DAB-128_MDWD1.txt
12.09.06_DAB-128_MDWX1.txt
12.09.06_DAB-130_FDWD1.txt
12.09.06_DAB-130_FDWX1.txt
12.09.12-CAR-D04-PPPP-0060_MDED2.txt
12.09.12-CAR-D04-PPPP-0060_MDEX1.txt
12.09.13-CAR-D04-PPPP-0015_MDWD2.txt
12.09.13-CAR-D04-PPPP-0015_MDWX1.txt
12.10.05_Iron_MPWD1.txt
12.10.06_DBK-005_MDWD2.txt
12.10.06_DBK-005_MDWX2.txt
12.10.06_DBK-126_FDWD2.txt
12.10.06_DBK-126_FDWX1.txt
12.10.06_DSK-113_MDWD1.txt
12.10.06_DSK-113_MDWX1.txt
12.11.13-CAR-D04-PPPP-00

19.03.12_CAR-OTP-PPPP-0036_MPWX1.txt
19.03.13-CAR-D04-PPPP-0045_MDWX3.txt
19.04.00_W78_MPWD1.txt
19.04.00_WAS_FPVD1.txt
19.04.00_WAS_FPVD2.txt
19.04.00_WAS_FPVX1.txt
19.04.05_TF1-053_MPWX1.txt
19.04.05_TF1-054_MPWD1.txt
19.04.05_TF1-054_MPWX1.txt
19.05.05_TF1-334_MPWD4.txt
19.05.11_CAR-OTP-PPPP-0063_MPWD5.txt
19.06.13-CAR-D04-PPPP-0003_MDWX1.txt
19.06.13-CAR-D04-PPPP-0004_MDWD2.txt
19.06.13-CAR-D04-PPPP-0004_MDWD3.txt
19.06.13-CAR-D04-PPPP-0004_MDWX1.txt
19.07.00_Mastillo_MDWD2.txt
19.07.00_Mastillo_MDWD3.txt
19.07.00_Mastillo_MDWX1.txt
19.07.05_TF1-045_MPWD1.txt
19.07.06_DAB-077_MDWD1.txt
19.07.06_DAB-077_MDWX1.txt
19.07.06_DBD-086_MDWD1.txt
19.07.06_DBD-086_MDWX1.txt
19.09.00_WDN_MDWD1.txt
19.09.00_WDN_MDWX1.txt
19.09.00_WDO_MDWD1.txt
19.09.00_WDO_MDWD2.txt
19.09.00_WDO_MDWX1.txt
19.09.00_WDO_MDWX2.txt
19.09.00_WDP_MDWD1.txt
19.09.00_WDV_FDWD2.txt
19.09.00_WDV_FDWX2.txt
19.09.05_TF1-167_MPWX1.txt
19.09.06_DAB-138_MDWD1.txt
19.09.06_DAB-138_MDWX1.txt
19.09.06_DAB-140_MDWD1.txt
19.09.0

26.07.00_Djurovic_MDWD1.txt
26.07.00_Djurovic_MDWD2.txt
26.07.00_Djurovic_MDWX1.txt
26.07.00_Pavlovic_MDWD1.txt
26.07.00_Pavlovic_MDWD2.txt
26.07.00_Pavlovic_MDWX1.txt
26.07.00_Przulj_MDWD1.txt
26.07.00_Przulj_MDWX1.txt
26.07.05_TF1-158_MPWD1.txt
26.07.05_TF1-158_MPWX1.txt
26.07.05_TF1-267_FPVD1.txt
26.08.11_CAR-OTP-PPPP-0173_MPWX5.txt
26.08.13-CAR-D04-PPPP-0025_MDWD1.txt
26.08.13-CAR-D04-PPPP-0025_MDWX1.txt
26.09.05_TF1-156_MPWD1.txt
26.09.05_TF1-156_MPWX1.txt
26.09.05_TF1-157_MPWX2.txt
26.09.05_TF1-184_MPWD1.txt
26.09.06_DAB-059_MDWD1.txt
26.09.06_DAB-059_MDWX1.txt
26.09.06_DBA-111_MDWD1.txt
26.09.06_DBA-111_MDWD2.txt
26.09.06_DBA-111_MDWX1.txt
26.09.11_CAR-OTP-PPPP-0032_MPWX3.txt
26.10.06_DBK-131_MDWD2.txt
26.10.06_DBK-131_MDWX3.txt
26.10.12-CAR-D04-PPPP-0051_MDWX3.txt
26.10.12-CAR-D04-PPPP-0051_MVWD1.txt
26.11.12-CAR-D04-PPPP-0016_MDWD1.txt
26.11.12-CAR-D04-PPPP-0016_MDWX1.txt
27.02.13-CAR-D04-PPPP-0019_MDWD3.txt
27.02.13-CAR-D04-PPPP-0019_MDWX1.txt
27.03.00_W52_MPWD2.txt
27.03.00_

In [17]:
# Train a 10-topic MALLET LDA model on the processed corpus; save=True also stores
# the model and writes its topics to a text file.
ALL_CASE_DATA_model = create_lda_model(ALL_CASE_DATA, case="default", num_topics=10, iterations=5000, save=True)

saving as All Cases_10_topics_11h39m
 


In [18]:
def analyze_file(lda_model, file):
    """
    Infer the topic distribution of one processed document.
    The document is converted to bag-of-words with the model's own vocabulary
    (lda_model.id2word), so the token ids agree with the ids the model was
    trained on. Only when the model exposes no usable vocabulary is a
    dictionary fitted on the document itself, as a fallback.
    The distribution is printed and returned.
    :param lda_model: the trained model used for the inference
    :param file: the document as a list of tokens
    :return: the (topic, weight) pairs sorted by weight, highest first
    """
    dct = getattr(lda_model, "id2word", None)
    if not hasattr(dct, "doc2bow"):
        # fallback: ids from a fresh dictionary do not match the model's ids
        dct = Dictionary([file])
    bow = dct.doc2bow(file)
    doc_lda = sorted(lda_model[bow], key=lambda x: x[1], reverse=True)
    print(doc_lda)
    return doc_lda

In [27]:
def save_processed_files(file_list, doc_name, processed_folders_path):
    """
    save the clean and lemmatized files into a single text file - relevant for the topic
    modeling, not for the topic dist.
    Every document becomes one line: each token followed by a single space,
    then a newline.
    :param file_list: the list of processed files (each a list of tokens)
    :param doc_name: name of the output file, without the ".txt" extension
    :param processed_folders_path: folder in which the file is created
    :return: None
    """
    # join() uses the platform separator; the with-block closes the file even on errors
    with open(join(processed_folders_path, doc_name + ".txt"), "w") as f:
        for file in file_list:
            f.write("".join(word + " " for word in file))
            f.write("\n")


In [29]:
# Write the processed corpus to "all_cases.txt" inside processed_folders_path (one document per line).
save_processed_files(ALL_CASE_DATA,"all_cases",processed_folders_path)

In [30]:
def get_distributions(model):
    """
    Write the topic distribution of every processed document to DIST_PATH.
    The documents come from LDA_modeler.upload_from_file, called once per entry
    of LDA_modeler.names. For each document two lines are written: its tokens
    joined by ", ", then its "<topic> - <weight>" pairs joined by ", ".
    Each document is also printed.
    NOTE(review): DIST_PATH and the LDA_modeler module are not defined in the
    cells shown here -- they must be in scope before this function is called.
    :param model: the trained model handed to LDA_modeler.analyze_file
    :return: the number of documents processed
    """
    folders = [
        LDA_modeler.upload_from_file(name, LDA_modeler.PROCESSED_FILES_FOLDER)
        for name in LDA_modeler.names
    ]
    num_files = 0
    # open the output once (not once per name) and let the with-block close it
    with open(DIST_PATH, "w") as distribution_f:
        for folder in folders:
            for file in folder:
                num_files += 1
                print(file)
                dist = LDA_modeler.analyze_file(model, file)
                distribution_f.write(", ".join(file))
                distribution_f.write("\n")
                str_dist = [str(entry[0]) + " - " + str(entry[1]) for entry in dist]
                distribution_f.write(", ".join(str_dist))
                distribution_f.write("\n")
    return num_files

In [35]:
# Sanity check: show the configured stop-word file path.
print(STOP_WORDS_PATH)

C:\stopwords\updated_stop_words.txt


In [36]:
import LDA_modeler


NameError: name 'STOP_WORDS_PATH' is not defined