# Setup

In [32]:
# -*- coding: utf-8 -*-
import gensim
import os
import re
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib
import deepcut
import json
import pickle
import csv
import math

# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
from numpy.random import seed
seed(1)
# Seed TensorFlow's random generator as well.
# NOTE(review): a top-level `tensorflow.set_random_seed` looks like the TF 1.x API —
# confirm the TensorFlow version this notebook is pinned to.
from tensorflow import set_random_seed
set_random_seed(2)

# Load THSarabunNew.ttf (path relative to the working directory), add it to
# matplotlib's font list and make 'TH Sarabun New' the default font family.
# NOTE(review): presumably so Thai labels render in plots — confirm. Also,
# `createFontList` is deprecated in newer matplotlib releases — confirm the pinned version.
font_list = fm.createFontList(['THSarabunNew.ttf'])
fm.fontManager.ttflist.extend(font_list)
plt.rcParams['font.family'] = 'TH Sarabun New'
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


# Cleanup

In [22]:
from bs4 import BeautifulSoup
import csv
import os
import time

def cleanup(top_directory='./thai-wiki', output_path='updates.txt'):
    """Flatten the word-segmented XML corpus into one sentence per line.

    Walks ``top_directory`` recursively and parses every file whose name ends
    in ``.xml`` with BeautifulSoup's ``'xml'`` parser. For each ``<se>``
    element found under the document's ``<body>``, the text of its ``<w>``
    elements is collected; sentences with more than three words are written
    to ``output_path`` as one space-delimited row, shorter ones are dropped.

    Progress and elapsed seconds are printed after every 100 processed files.

    Args:
        top_directory: Root directory searched for ``.xml`` files.
        output_path: Text file that is created or overwritten with the rows.

    Raises:
        AttributeError: If a parsed document has no ``<body>`` element
            (``soup.body`` is then ``None``).
    """
    start_time = time.time()
    # newline='' is what the csv module requires of its file object; without it
    # the '\n' terminator requested below is translated again on Windows.
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        outfileWriter = csv.writer(f, delimiter=' ', lineterminator='\n')
        processed = 0
        for root, dirs, files in os.walk(top_directory):
            for fname in files:
                if not fname.endswith('.xml'):
                    continue
                # Close each source file right after reading it: the corpus may
                # contain many files and the handles were previously leaked.
                with open(os.path.join(root, fname), 'r', encoding='utf-8') as xml_file:
                    document = xml_file.read()
                soup = BeautifulSoup(document, 'xml')
                for sentence in soup.body.find_all("se"):
                    words = [word.get_text() for word in sentence.find_all("w")]
                    # Keep only sentences with more than three words.
                    if len(words) > 3:
                        outfileWriter.writerow(words)
                processed += 1
                if processed % 100 == 0:
                    print("Progress: ", processed)
                    print("Elapsed: ", time.time() - start_time)

# Main functions: Word2Vec

In [23]:
#Main functions
def remove_tags(text):
    """Return ``text`` with every ``TAG_RE`` match deleted and leading/trailing newlines trimmed.

    Only ``'\\n'`` characters are stripped from the ends; other whitespace is kept.
    """
    return TAG_RE.sub('', text).strip('\n')

def gen_cor_NECTEC(corpus_dirs=None):
    """Build a list of token lists from the pipe-segmented NECTEC text files.

    Every file in each directory is read as UTF-8, split on single spaces into
    chunks, and each chunk is split on ``'|'`` into tokens. Tokens are passed
    through ``remove_tags``; tokens that are empty after cleaning are dropped.
    One list is appended per chunk, so chunks without any token contribute an
    empty list.

    Args:
        corpus_dirs: Iterable of directory paths to read. Defaults to the four
            ``./NECTEC/*`` sub-corpora (novel, article, news, encyclopedia).

    Returns:
        list[list[str]]: One token list per space-separated chunk, in
        directory order and, within a directory, in sorted file-name order.
    """
    if corpus_dirs is None:
        corpus_dirs = ['./NECTEC/novel', './NECTEC/article', './NECTEC/news', './NECTEC/encyclopedia']

    rv = []
    for directory in corpus_dirs:
        # os.listdir order is filesystem-dependent; sort so the corpus order
        # (and anything trained on it) is repeatable between runs.
        for filename in sorted(os.listdir(directory)):
            # `with` closes each file as soon as it has been read.
            with open(os.path.join(directory, filename), encoding="utf8") as f:
                text = f.read()
            for sentence in text.split(' '):
                cleaned = (remove_tags(token) for token in sentence.split('|'))
                # Filter after cleaning: a token such as '\n' or '<tag>' is
                # non-empty before remove_tags but empty afterwards.
                rv.append([token for token in cleaned if token != ''])
    return rv

def gen_cor_wiki(path='updates.txt'):
    """Load the whitespace-tokenised corpus file as a list of token lists.

    Args:
        path: UTF-8 text file with one sentence per line and tokens separated
            by whitespace. Defaults to ``'updates.txt'``.

    Returns:
        list[list[str]]: One list of tokens per line of the file; a blank line
        yields an empty list.
    """
    # `with` guarantees the handle is closed; it was previously left open.
    with open(path, 'r', encoding='utf-8') as corpus_file:
        return [line.split() for line in corpus_file]

def train(documents,mincount):
    """Fit a gensim Word2Vec model on ``documents`` and return it.

    The model is constructed with the corpus, 150 dimensions (``size``), a
    window of 10, ``min_count=mincount`` and 10 workers; ``model.train`` is
    then called on the same corpus for 10 epochs.

    NOTE(review): gensim's constructor already trains when it is given the
    corpus, so the explicit ``train`` call looks like additional epochs on top
    of that — confirm this is intended.

    Args:
        documents: Sized collection of token lists (``len`` is taken of it).
        mincount: Value forwarded as Word2Vec's ``min_count``.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    hyperparameters = dict(size=150, window=10, min_count=mincount, workers=10)
    w2v = gensim.models.Word2Vec(documents, **hyperparameters)
    n_documents = len(documents)
    w2v.train(documents, total_examples=n_documents, epochs=10)
    return w2v

def freqdf(path='updates.txt'):
    """Count how many times each whitespace-separated token occurs in a file.

    Args:
        path: UTF-8 text file to scan. Defaults to ``'updates.txt'``.

    Returns:
        dict[str, int]: Token -> number of occurrences, keyed in order of
        first appearance.
    """
    rv = {}
    # Stream line by line instead of reading the whole corpus into memory, and
    # let `with` close the handle (it was previously left open).
    with open(path, 'r', encoding='utf-8') as corpus_file:
        for line in corpus_file:
            for word in line.split():
                rv[word] = rv.get(word, 0) + 1
    return rv

# Main functions: KMeans

In [24]:
from nltk.cluster import KMeansClusterer
import nltk 
import numpy as np
from sklearn.cluster import KMeans

def scikitkmean(model,n):
    """Cluster the model's word vectors with scikit-learn KMeans.

    KMeans is configured with ``n`` clusters, ``random_state=0``,
    ``n_init=20`` and ``max_iter=500`` and fitted on ``model.wv.vectors``.

    Args:
        model: Object exposing ``wv.vectors`` and ``wv.index2word``.
        n: Number of clusters.

    Returns:
        tuple: ``(df, err)`` where ``df`` is a DataFrame with columns
        ``word`` (from ``model.wv.index2word``) and ``cluster`` (the fitted
        ``labels_``), and ``err`` is the fitted ``inertia_``.
    """
    clusterer = KMeans(n_clusters=n, random_state=0, n_init=20, max_iter=500)
    fitted = clusterer.fit(model.wv.vectors)
    vocabulary = list(model.wv.index2word)
    assignments = pd.DataFrame({'word': vocabulary, 'cluster': fitted.labels_})
    return assignments, fitted.inertia_

def generatedic(df,maxi=500,mini=5,filt=True,cluster=100,path='./wikisave/thaiwiki.csv'):
    """Collect the words of each cluster into a dict and mirror it to a CSV file.

    For every cluster id in ``range(cluster)`` the words whose ``cluster``
    column equals that id are gathered. With ``filt`` enabled, only clusters
    whose size is strictly between ``mini`` and ``maxi`` are kept. Each kept
    cluster is written as one CSV row (``id, word1, word2, ...``) and stored
    in the returned dict.

    Args:
        df: DataFrame with ``word`` and ``cluster`` columns.
        maxi: Exclusive upper bound on cluster size when filtering.
        mini: Exclusive lower bound on cluster size when filtering.
        filt: Whether to apply the size filter.
        cluster: Number of cluster ids to scan, starting at 0.
        path: CSV file to create or overwrite. The parent directory must exist.

    Returns:
        dict[int, list[str]]: Cluster id -> list of its words, for kept clusters.
    """
    dic = {}
    # newline='' is required by the csv module; without it the writer's '\r\n'
    # terminator is translated again on Windows and blank rows appear.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        outfileWriter = csv.writer(f, delimiter=',')
        for i in range(cluster):
            words = list(df[df['cluster'] == i]['word'])
            # Skip clusters outside the (mini, maxi) size window when filtering.
            if filt and not (mini < len(words) < maxi):
                continue
            outfileWriter.writerow([str(i)] + words)
            dic[i] = words
    return dic

# Save to file

In [25]:
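# cleanup()  # Uncomment to rebuild updates.txt from the XML files under ./thai-wiki; the cells below read that file.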

In [26]:
# Load the tokenised corpus, fit Word2Vec on it (min_count of 75) and save the model.
wikicorpus = gen_cor_wiki()
model = train(documents=wikicorpus, mincount=75)
model.save("./wikisave/wiki.model")
print('Finished word2vec')

Finished word2vec


In [27]:
# Cluster the word vectors into 200 groups; `error` receives the KMeans inertia.
# The word -> cluster table is saved as CSV.
kmeandf, error = scikitkmean(model, n=200)
kmeandf.to_csv('./wikisave/wikidf.csv', encoding='utf-8')
print('Finished kmeans')

Finished kmeans


In [35]:
# Build the cluster -> words dictionary (clusters of more than 5 and fewer than
# 800 words, ids 0..199) and pickle it.
dictionary = generatedic(kmeandf,maxi=800,cluster=200)
# `with` closes the file even if pickling fails part-way.
with open('./wikisave/dictionary.pkl', 'wb') as output:
    pickle.dump(dictionary, output)
print('Finished dictionary')

Finished dictionary


In [36]:
# Count token frequencies in the corpus file and pickle the resulting dict.
freqdic = freqdf()
# `with` closes the file even if pickling fails part-way.
with open('./wikisave/freqdictionary.pkl', 'wb') as output:
    pickle.dump(freqdic, output)
print('Finished freqdictionary')

Finished freqdictionary
