In [None]:
def document_search(document, predicate):
    """Return the word list of the first sentence accepted by ``predicate``.

    :param document: iterable of sentence strings
    :param predicate: callable receiving the whitespace-split words of a sentence
    :return: the split words of the first matching sentence, or ``None`` when
        no sentence matches
    """
    tokenized = (sentence.split() for sentence in document)
    # Lazy scan: stops at the first hit, exactly like an early return in a loop.
    return next((words for words in tokenized if predicate(words)), None)

In [None]:
import pandas as pd
import scipy.sparse as sp
from wordcloud import WordCloud
import operator
from nltk import bigrams
from collections import defaultdict, Counter

In [None]:
import csv
import glob
import chardet
import re
import os
from tqdm import tqdm
# Set the csv parser's maximum field size to 1 << 21 (2,097,152) for the rest of the session.
csv.field_size_limit(1<<21)

In [None]:


def allowed_characters(string: str) -> str:
    """Replace every character that is not a Hangul syllable or '.' with a space.

    :param string: raw text
    :return: text of the same length in which each disallowed character
        has become a single space
    """
    disallowed = re.compile(r'[^가-힣.]')
    return disallowed.sub(' ', string)

def remove_single_character(string: str) -> str:
    """Delete every word that consists of exactly one word character.

    Surrounding whitespace is left untouched, so removed words leave
    their neighbouring spaces behind.
    """
    lone_word_char = re.compile(r'\b\w\b')
    return lone_word_char.sub('', string)

def _count_words_in_csv(filename, target_column):
    """Rebuild the word-frequency counter from an already written csv file."""
    counter = Counter()
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) > target_column:
                counter.update(row[target_column].split())
    return counter


def prune(txt_file='result.txt', filename='result.csv', splitted_expected_length=4, target_column=3,
          truncate_threshold=10, max_lines=-1, start_line=0):
    """
    Convert a tab-separated txt file to csv, then write a copy with rare words removed.

    Stage 1 (skipped when ``filename`` already exists) cleans every field with
    ``allowed_characters`` and ``remove_single_character`` and writes the csv.
    Stage 2 (skipped when ``<filename stem>_pruned.csv`` already exists) drops every
    word of the target column whose frequency is not above ``truncate_threshold``
    and discards rows whose target column becomes empty.

    :param txt_file: txt file to prune
    :param filename: csv file to save
    :param splitted_expected_length: expected number of tab-separated fields; other lines are skipped
    :param target_column: index of the column whose words are counted and pruned
    :param truncate_threshold: a word is kept only if its count is greater than this value
    :param max_lines: last (1-based) input line to read; values <= 0 mean no limit
    :param start_line: first (1-based) input line to process
    """
    word_counter = Counter()
    counter_is_populated = False

    if not os.path.exists(filename):
        print("processing txt file to csv file")
        skipped_line_idxs = set()
        # First pass only counts lines so the progress bar has a total.
        with open(txt_file, 'r', encoding='utf-8') as txtfile:
            total_lines = sum(1 for _ in txtfile)
        with open(filename, 'w', encoding='utf-8', newline='') as csv_output, \
                open(txt_file, 'r', encoding='utf-8') as txtfile:
            csv_writer = csv.writer(csv_output)
            for cur_line, line in enumerate(tqdm(txtfile, total=total_lines), start=1):
                # Checked before any `continue`, so a malformed line sitting exactly
                # on the limit can no longer bypass it.
                if max_lines > 0 and cur_line > max_lines:
                    break
                if cur_line < start_line:
                    continue
                splits = line.split('\t')
                if len(splits) != splitted_expected_length:
                    skipped_line_idxs.add(cur_line)
                    continue
                # remove \n
                splits[-1] = splits[-1].replace('\n', '')
                # remove special characters, then single-character words
                splits = [remove_single_character(allowed_characters(s)) for s in splits]
                # apply splits to word counter
                word_counter.update(splits[target_column].split())
                # write to csv
                csv_writer.writerow(splits)
        with open('skipped_line_idxs.txt', 'w', encoding='utf-8') as f:
            # sorted so the report is deterministic (set iteration order is not)
            f.write('\n'.join(str(x) for x in sorted(skipped_line_idxs)))
        counter_is_populated = True

    # prune and save to csv
    pruned_filename = os.path.splitext(filename)[0] + '_pruned.csv'
    if not os.path.exists(pruned_filename):
        print("pruning csv file")
        if not counter_is_populated:
            # The csv came from an earlier run, so stage 1 never filled the counter.
            # Without recounting, every word would look like frequency 0 and be dropped.
            word_counter = _count_words_in_csv(filename, target_column)
        with open(filename, 'r', encoding='utf-8', newline='') as f, \
                open(pruned_filename, 'w', encoding='utf-8', newline='') as f2:
            writer = csv.writer(f2)
            for row in csv.reader(f):
                if len(row) < splitted_expected_length:
                    continue
                # apply threshold from word counter
                line = ' '.join(word for word in row[target_column].split(' ')
                                if word_counter[word] > truncate_threshold)
                row[target_column] = line
                if len(line) > 0:
                    writer.writerow(row)

txt_file = 'source.txt'
# Convert the txt file to a .csv file and save it, then write the pruned copy.
# With splitted_expected_length=1 a line is kept only when it contains no tab,
# so the whole line is treated as the single text column (target_column=0).


# process source.txt -> source.csv (and source_pruned.csv)
prune(txt_file=txt_file, splitted_expected_length=1, target_column=0, filename = 'source.csv')


In [None]:

# Paths of the two csv files written by prune(): the cleaned csv and its pruned copy.
processed = 'source.csv'
pruned = 'source_pruned.csv'

In [None]:
# Compression rate: size in bytes of the pruned csv divided by the size of the cleaned csv.
rate = os.path.getsize(pruned) / os.path.getsize(processed)
print(rate)

In [None]:
# Sanity-check the pruned csv: read at most the first 3 rows and verify column 0 exists.
with open(pruned, 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    lines = 3  # countdown of rows still to inspect
    for rows in reader:
        try:
            rows[0]
        except IndexError:
            # Row has no columns. Note: this prints the remaining countdown, not the row number.
            print(lines)
        lines-=1
        if lines == 0:
            break


In [None]:
import treform as ptm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from ExampleManager import PathManager

In [None]:
import os
#os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-16.0.2'
#os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = r'C:\Program Files\Git\bin\git.exe'
#import git

In [None]:
# Smoke test for the imported packages: build the stopword-file path via PathManager
# and display its string form as the cell output.
str(PathManager('../stopwords/stopwordsKor.txt'))

In [None]:
class StopwordFilterBeta:
    """
    Stopword filter: drops a token if its length is 1, if its lowercase form is a
    stopword, or if its lowercase form starts with one of the blocked prefixes.
    """
    # NOTE(review): presumably read by the treform pipeline to describe this stage's
    # input/output types - confirm against the library.
    IN_TYPE = [list, str]
    OUT_TYPE = [list, str]

    def __init__(self, stopwords = None, file = None):
        """
        :param stopwords: optional iterable of stopwords
        :param file: optional path to a UTF-8 file with one stopword per line
        """
        # Copy into a fresh list: avoids a shared mutable default and never
        # touches the caller's collection.
        collected = list(stopwords) if stopwords is not None else []
        if file:
            # Context manager so the file handle is closed deterministically.
            with open(file, encoding='utf-8') as handle:
                collected.extend(line.strip() for line in handle)
        self.stopwords = set(collected)
        self.stopwordsPrefix = ('http', 'https', 'ftp', 'git', 'thatt')

    def __call__(self, *args, **kwargs):
        """Return the tokens of ``args[0]`` that survive the filter, in order."""
        # str.startswith needs a tuple; convert in case the attribute was reassigned.
        prefixes = tuple(self.stopwordsPrefix)
        kept = []
        for token in args[0]:
            if len(token) <= 1:
                continue
            lowered = token.lower()
            if lowered in self.stopwords or lowered.startswith(prefixes):
                continue
            kept.append(token)
        return kept

In [None]:
import nltk
# Fetch the NLTK 'punkt' and 'wordnet' resources (these calls may access the network).
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
import pickle
# Build the list of cleaned noun sentences once and cache it in documents.pkl.
if not os.path.exists("documents.pkl"):
    print("processing corpus to word histogram")
    corpus = ptm.CorpusFromCSVFile(pruned, 0)
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Komoran(),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            StopwordFilterBeta(file=str(PathManager('../stopwords/stopwordsKor.txt')))
                            )
    result = pipeline.processCorpus(tqdm(corpus))
    documents = []
    for doc in result:
        for sent in doc:
            sentence = ' '.join(sent)
            # keep only Hangul syllables, underscores and spaces
            sentence = re.sub('[^가-힣_ ]+', '', sentence)
            # Collapse the runs of spaces left behind when a whole token is removed;
            # later cells split on a single ' ' and would otherwise get '' tokens.
            sentence = ' '.join(sentence.split())
            if len(sentence) > 0:
                documents.append(sentence)
    print(len(documents))
    with open('documents.pkl', 'wb') as f:
        pickle.dump(documents, f)



In [None]:
# Load the cached sentence list. pickle.load can execute arbitrary code, so only
# load a documents.pkl that was produced by this notebook.
with open('documents.pkl', 'rb') as f:
    documents = pickle.load(f)

In [None]:
# One-element list holding a tqdm-wrapped generator of space-split sentences.
# The generator is one-shot, so document_arr can be iterated only once.
# NOTE(review): neither document_arr nor co is referenced again in the visible cells.
document_arr = [tqdm((d.split(' ') for d in documents), total = len(documents)),]
from tqdm import tqdm
co = ptm.CooccurrenceManager()

In [None]:


from tqdm import tqdm
# Clear tqdm's private registry of progress-bar instances (Jupyter workaround);
# presumably this drops bars created in earlier cells that were never consumed.
tqdm._instances.clear()
from collections import defaultdict, Counter
from nltk.util import bigrams
import operator
def computeCooccurence(iterable:list[list[list[str]]], target:str=''):
    """Count within-sentence word co-occurrences.

    :param iterable: collection of corpora; each corpus yields sentences and each
        sentence is an indexable sequence of words
    :param target: when non-empty, only pairs that contain this word are reported
        in ``terms_max`` and ``uniqueList``
    :return: ``(com, terms_max, uniqueList)`` where
        ``com[w1][w2]`` is the pair count with ``w1 < w2`` (string order),
        ``terms_max`` is a list of ``((w1, w2), count)`` sorted by count, descending,
        built from the 5 strongest partners of every ``w1``, and
        ``uniqueList`` lists words in first-seen order (every word when no target
        is given, otherwise only the words of the reported pairs)
    """
    com:defaultdict[str, defaultdict[str, int]] = defaultdict(lambda: defaultdict(int))
    is_target_specific:bool = len(target) > 0

    # A dict serves as an insertion-ordered set: O(1) membership instead of the
    # O(n) scan a list needs, which made vocabulary collection quadratic.
    unique_words:dict[str, None] = {}

    for _array in iterable:
        for line in _array:
            if not is_target_specific:
                for word in line:
                    unique_words.setdefault(word, None)

            # Build co-occurrence matrix: every unordered pair of positions counts once.
            for i in range(len(line) - 1):
                for j in range(i + 1, len(line)):
                    w1, w2 = sorted([line[i], line[j]])
                    if w1 != w2:
                        com[w1][w2] += 1

    com_max = []
    # For each term, look for the most common co-occurrent terms.
    # The top-5 cut happens before the target filter, so a target pair outside
    # a term's top 5 is not reported.
    for t1 in com:
        t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
        for t2, t2_count in t1_max_terms:
            # If target is provided, only keep co-occurrences with target, else all of them
            if not is_target_specific or target == t1 or target == t2:
                unique_words.setdefault(t1, None)
                unique_words.setdefault(t2, None)
                com_max.append(((t1, t2), t2_count))
    # Get the most frequent co-occurrences
    terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)

    return com, terms_max, list(unique_words)

# Run the co-occurrence count over all sentences (each split on single spaces), with a
# progress bar; result_cooc is the (com, terms_max, uniqueList) tuple.
result_cooc = computeCooccurence([tqdm((d.split(' ') for d in documents), total = len(documents)),])

In [None]:
# First element of the tuple returned by computeCooccurence: the nested pair-count dict.
cooc_result = result_cooc[0]
import sys
# sys.getsizeof is shallow: it reports only the outer dict object, not the nested
# dicts and strings it references.
sys.getsizeof(cooc_result)

In [None]:
# Number of distinct first words (outer keys) in the co-occurrence dict.
len(cooc_result.keys())

In [None]:
class WordDict(dict):
    """Dictionary that hands out consecutive integer indexes on first lookup.

    ``d[word]`` returns the stored index of ``word``; an unseen word is assigned
    the next free index, stored, and returned (put-if-absent). ``reverse`` maps
    each index back to its word.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Reflect any initial contents in the reverse map and start numbering
        # after the largest integer already in use, so new words cannot collide.
        self.reverse = {index: key for key, index in self.items() if isinstance(index, int)}
        self._index = max(self.reverse, default=-1) + 1
    def _setreverse(self, index, key):
        """Record ``index -> key`` in the reverse map."""
        self.reverse[index] = key
    def __missing__(self, key):
        # dict.__getitem__ calls this hook for absent keys, so it is the single
        # place where indexes are allocated.
        index = self._index
        super().__setitem__(key, index)
        self._setreverse(index, key)
        self._index += 1
        return index
    def getKey(self, index):
        """Return the word that was assigned ``index``; raises KeyError if unknown."""
        return self.reverse[index]

In [None]:
# convert to sparse matrix, then to csr matrix
# {key1 : {key2 : count}} structure -> (key1, key2, count) structure
# we will filter keys
from functools import lru_cache

def convert_2d_json(structure, predicate= lambda x : True):
    """Flatten ``{key1: {key2: count}}`` into ``(index1, index2, count)`` triples.

    :param structure: nested mapping of pair counts
    :param predicate: keys for which it returns a falsy value are left out
    :return: ``(triples, wordIndexes)`` where ``wordIndexes`` is the ``WordDict``
        that assigned an integer index to every accepted key; only positive
        counts produce a triple
    """
    wordIndexes = WordDict()
    triples = []
    tqdm._instances.clear()
    for outer_key in tqdm(structure.keys()):
        if predicate(outer_key):
            # The outer key receives an index even if none of its partners qualifies.
            row_index = wordIndexes[outer_key]
            for inner_key, count in structure[outer_key].items():
                if predicate(inner_key) and count > 0:
                    col_index = wordIndexes[inner_key]
                    triples.append((row_index, col_index, count))
    return triples, wordIndexes
# Exclude all-numeric strings such as '000' and strings that start with an ASCII letter.
@lru_cache(maxsize=70051)
def filterfunc(x):
    """Return True when ``x`` is neither numeric nor led by an ASCII letter.

    Results are memoised, since the same words are tested repeatedly.
    """
    if x.isnumeric():
        return False
    # re.match anchors at the start only, so one leading ASCII letter rejects the word.
    return re.match('[a-zA-Z]+', x) is None

# Flatten the co-occurrence dict into index triples, keeping only keys accepted by filterfunc.
sparse, wordIndexes = convert_2d_json(cooc_result, filterfunc)

In [None]:
import numpy as np
# find max index
def reverse_dict(d):
    """Return a new dict mapping each value of ``d`` to its key.

    When several keys share a value, the key that comes last in iteration order wins.
    """
    return dict(zip(d.values(), d.keys()))

# Integer index -> word lookup, obtained by inverting the word -> index mapping.
indexToWord = reverse_dict(wordIndexes)

In [None]:

# Now we will convert the (row_index, col_index, count) triples to a CSR matrix.
# wordIndexes is the 'dictionary' that converted each string to its integer index.
len(wordIndexes)  # not the last expression of the cell, so the value is not displayed
import scipy.sparse as sp
# Constructor form used below: sp.csr_matrix((data, (row, col)), shape=...)
# with data = counts, row = first index, col = second index.
arr = np.array(sparse)
# Largest index on either axis; the matrix is square with side maxidx + 1.
maxidx = max(arr[:,0].max(), arr[:,1].max())
mat = sp.csr_matrix((arr[:,2], (arr[:,0], arr[:,1])), shape=(maxidx+1, maxidx+1))
# check if it is symmetric

In [None]:
from matplotlib import pyplot as plt
# Plot a Zipf's-law chart using the project-local analyzeZipf helper.
from analyzeZipf import plot_zipf
# Font family set so Hangul labels can be rendered (font must be installed on the machine).
plt.rc('font', family='Malgun Gothic')
# NOTE(review): `result` is only assigned inside the `if not os.path.exists("documents.pkl")`
# branch of an earlier cell, so this raises NameError when the cache already exists.
plot_zipf(result, title="Zipf's law for all nouns", prune=60)

In [None]:
# Quick check that str.isalpha() treats Hangul syllables as alphabetic (evaluates to True).
'가나다'.isalpha()

In [None]:
# select the n highest-count rows of the (row, col, count) triple array
# (rows whose count reaches maxCount are excluded first)
def getSubMatrix(array, n, maxCount = 1e5):
    """Return the ``n`` rows with the largest count below ``maxCount``.

    :param array: 2-D array whose column 2 holds the count
    :param n: number of rows to return; ``n <= 0`` yields an empty array
    :param maxCount: rows with a count >= this value are discarded first
    :return: at most ``n`` rows, ordered by ascending count
    """
    subarray = array[array[:,2] < maxCount]
    if n <= 0:
        # sorted_idx[-0:] would be the full slice and return every row.
        return subarray[:0]
    # sort by count, then keep the last n (largest) rows
    sorted_idx = np.argsort(subarray[:,2])
    return subarray[sorted_idx[-n:], :]

# The 100 triples with the highest counts below the default maxCount.
top_100 = getSubMatrix(arr, 100)

In [None]:
# using indexToWord, parse it to string
def parseIndexToWord(array, indexDict):
    """Translate ``(index1, index2, count)`` rows into ``(word1, word2, count)`` tuples.

    :param array: iterable of rows with at least three items each
    :param indexDict: mapping from integer index to word
    :return: list of tuples in the same order as ``array``
    """
    return [(indexDict[row[0]], indexDict[row[1]], row[2]) for row in array]

# histogram of counts
def getHistogram(array):
    """Return ``np.histogram`` (10 equal-width bins) of the count column (column 2)."""
    counts = array[:, 2]
    return np.histogram(counts, bins=10)

In [None]:
# Distribution of the co-occurrence counts over 10 bins.
getHistogram(arr)

In [None]:
# The 10 highest-count pairs with a count below 10000, shown as words.
parseIndexToWord(getSubMatrix(arr, 10,10000), indexToWord)

In [None]:
# dump
import json
import os


In [None]:
# Compute the 250 most frequent terms and their totals, or restore them from the cache.
if not os.path.exists("word_hist.pkl"):
    cv = CountVectorizer(max_features=250)
    cv_fit = cv.fit_transform(documents)
    word_list = cv.get_feature_names_out()
    # 1 x N matrix: total count of every vocabulary term
    count_list = cv_fit.sum(axis=0)
else:
    # On a fresh kernel with an existing cache the vectorizer is skipped, but the
    # following cells still read word_list and count_list; rebuild both from the
    # cached histogram so those cells do not raise NameError.
    with open("word_hist.pkl", "rb") as f:
        _cached_hist = pickle.load(f)
    word_list = np.array(list(_cached_hist.keys()), dtype=object)
    # same 1 x N layout as the summed count matrix above
    count_list = np.array([list(_cached_hist.values())])

In [None]:
# Invariants of the cached corpus: every entry is a non-empty string.
assert all(isinstance(a, str) for a in documents)
assert all(len(a) > 0 for a in documents)

In [None]:
# Pair each term with its total count; tolist()[0] takes the single row of count_list.
word_hist = dict(zip(word_list, count_list.tolist()[0]))

In [None]:
# Number of terms kept in the histogram vocabulary.
len(word_list)

In [None]:
#dump word_hist
import pickle
import os
# Write the histogram to word_hist.pkl only if no cache file exists yet.
if not os.path.exists('word_hist.pkl'):
    with open('word_hist.pkl', 'wb') as f:
        pickle.dump(word_hist, f)

# Reload from disk; an existing cache therefore replaces the in-memory word_hist.
with open('word_hist.pkl', 'rb') as f:
    word_hist = pickle.load(f)

In [None]:
from tqdm import tqdm
def calculateCooccurrence(self:ptm.CooccurrenceManager, iterable):
    """Count co-occurring pairs among the distinct words of ``iterable``.

    :param self: manager providing ``getIdOrAdd(word)`` and ``getWord(id)``
        (presumably a word <-> id registry - confirm against treform)
    :param iterable: iterable of words; duplicates are removed first
    :return: ``(pairs, words)`` where ``pairs`` is a list of
        ``((left_word, right_word), freq)`` and ``words`` the de-duplicated word list
    """
    count = {}  # dict that stores the co-occurrence frequencies
    words = list(set(iterable))  # de-duplicate through a set, then back to a list
    wids = [self.getIdOrAdd(w) for w in tqdm(words)]
    for i, a in enumerate(tqdm(wids)):
        for b in wids[i + 1:]:
            if a == b:
                continue  # identical words are not counted
            # Keep (smaller, larger) so (A, B) and (B, A) share one key. Built as a
            # new tuple: swapping a and b in place would overwrite the outer loop
            # variable and corrupt the remaining inner iterations.
            pair = (a, b) if a < b else (b, a)
            count[pair] = count.get(pair, 0) + 1  # the actual counting

    pairs = []
    for tup in tqdm(count):
        freq = count[tup]
        # Look the ids up from the pair itself; count[0] / count[1] indexed the
        # dict with keys that do not exist.
        left_word = self.getWord(tup[0])
        right_word = self.getWord(tup[1])
        pairs.append(((left_word, right_word), freq))
    return pairs, words

In [None]:

# Export the cleaned sentences to documents_export.csv, one sentence per row
# and one word per column, unless the file already exists.
if not os.path.exists('documents_export.csv'):
    print("processing documents to csv file")
    # Compiled once, outside the loop: words containing an ASCII letter or digit are dropped.
    alphabet = re.compile('[a-zA-Z]')
    number = re.compile('[0-9]')
    with open('documents_export.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for doc in tqdm(documents):
            split_line = []
            for word in doc.split(' '):
                # drop words of length 0-1
                if len(word) <= 1:
                    continue
                if alphabet.search(word) or number.search(word):
                    continue
                split_line.append(word)
            # sentences left without any word are not written
            if split_line:
                writer.writerow(split_line)

########################################################################################################################

In [None]:
# First sentence whose word list contains both query words (exact word matches).
document_search(documents, lambda x: '인도' in x and '태평양' in x)

In [None]:
# First sentence whose word list contains both query words (exact word matches).
document_search(documents, lambda x: '농협' in x and '일본' in x)
