In [1]:
import sentencepiece as spm
import pandas as pd
import numpy as np
import time

from sklearn.feature_extraction.text import *
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Module-level SentencePiece processor; sp_tokenize below reads this `sp` object.
# The model file is addressed relative to the notebook's working directory.
# NOTE(review): the file name suggests an uncased Mongolian model — confirm
# against the model's own documentation before relying on casing behaviour.
sp = spm.SentencePieceProcessor()
sp.Load('../mongolian_bert_sentencepiece/mn_uncased.model')

def sp_tokenize(w):
    """Tokenize the text `w` with the module-level SentencePiece processor `sp`.

    Returns whatever `sp.EncodeAsPieces` produces for `w` (the sub-word pieces).
    """
    pieces = sp.EncodeAsPieces(w)
    return pieces

In [2]:
import nltk

# The tokenizers below rely on NLTK's tokenizer data. On a fresh environment
# run nltk.download('punkt') once; it is left disabled here so that the cell
# performs no network access. If a LookupError is raised, download the
# resource named in its message.
sample_paragraph = "Сайн байна уу? Танд энэ өдрийн мэнд хүргье. Монгол текст ангилах гэж байна."
sample_phrase = "Монгол улсын их хурал"

# Sanity check: sentence splitting and word splitting on Mongolian text.
print(nltk.sent_tokenize(sample_paragraph))
print(nltk.word_tokenize(sample_phrase))

['Сайн байна уу?', 'Танд энэ өдрийн мэнд хүргье.', 'Монгол текст ангилах гэж байна.']
['Монгол', 'улсын', 'их', 'хурал']


In [5]:
# Locations relative to the notebook's working directory.
df_path = '../../main_dataset/'
trained_model_and_fitted_encoder_path = '../models/'
plots_path = '../plots'

# Turn the Google Drive share link into a direct-download link: the file id
# is the second-to-last path segment of the share URL.
url = 'https://drive.google.com/file/d/1OX-s4n6Go_RsTPLlO48IclpCPqcXkkfo/view?usp=sharing'
drive_file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + drive_file_id

# NOTE(review): for large files Drive may answer with an HTML confirmation
# page instead of the CSV — confirm the frame has the expected columns.
df = pd.read_csv(path)
df.head()

(80036, 2)

In [4]:
# Character inventory for encoding text: index 0 is the blank marker "B",
# index 1 is the space character, the remaining entries are lowercase
# Cyrillic letters.
vocab = "B абвгдеёжзийклмноөпрстуүфхцчшъыьэюя"  # B: blank
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = dict(enumerate(vocab))


def convert_text(text):
    """Encode `text` as a list of vocabulary indices.

    The text is lower-cased first and every character that is not in
    `vocab` is skipped. Because of the lower-casing the blank marker "B"
    can never be matched, so index 0 is never emitted and a space always
    maps to index 1.
    """
    encoded = []
    for char in text.lower():
        position = char2idx.get(char)
        if position is not None:
            encoded.append(position)
    return encoded


# Encode every article with convert_text.
new_text = [convert_text(text) for text in df['content']]

# convert_text never emits index 0, so sum(codes) == len(codes) holds only when
# the encoding is empty or consists solely of spaces (index 1), i.e. the text
# contains no letter from the vocabulary (for example, text in Latin script).
converted_text_sum = [
    position for position, codes in enumerate(new_text) if sum(codes) == len(codes)
]

# Remove those rows and renumber the index from zero.
rows_to_drop = df.index[converted_text_sum]
df = df.drop(rows_to_drop).reset_index(drop = True)

In [5]:
import string

# Stop-word list. The punctuation and empty-string entries can never match,
# because tokens are filtered with str.isalpha() before the stop-word check;
# the list is kept as-is for any other cell that reads it.
stopwordsmn = ['аа','аанхаа','алив','ба','байдаг','байжээ','байна','байсаар','байсан',
               'байхаа','бас','бишүү','бол','болжээ','болно','болоо','бэ','вэ','гэж','гэжээ',
               'гэлтгүй','гэсэн','гэтэл','за','л','мөн','нь','тэр','уу','харин','хэн','ч',
               'энэ','ээ','юм','үү','?','', '.', ',', '-','ийн','ын','тай','г','ийг','д','н',
               'ний','дээр','юу']

# Set view of the stop words: constant-time membership instead of scanning
# the list once per token.
stopword_lookup = frozenset(stopwordsmn)

# Translation table that deletes ASCII punctuation. It does not depend on the
# sentence, so it is built once here rather than inside the inner loop.
table = str.maketrans('', '', string.punctuation)

df_preprocessed           = []   # one [sentences, label] entry per article, stop words kept
df_preprocessed_stopwords = []   # same layout, stop words removed
word_dict   = {}                 # every kept token as a key (insertion-ordered); values are placeholders

# Iterate the two needed columns directly; df.iterrows() would build a
# Series for every row only to read these two fields.
for news, label in zip(df['content'], df['type_text']):
    content_sentences           = []
    content_sentences_stopwords = []

    for sentence in nltk.sent_tokenize(news):
        # Lower-case each token, strip ASCII punctuation, then keep only
        # tokens that are purely alphabetic.
        stripped = [token.lower().translate(table) for token in nltk.word_tokenize(sentence)]
        words    = [word for word in stripped if word.isalpha()]
        words_stopwords = [word for word in words if word not in stopword_lookup]

        content_sentences.append(words)
        content_sentences_stopwords.append(words_stopwords)

        # The vocabulary is collected from the unfiltered tokens, so stop
        # words are part of it as well.
        for word in words:
            word_dict[word] = 0

    df_preprocessed.append([content_sentences, label])
    df_preprocessed_stopwords.append([content_sentences_stopwords, label])

In [20]:
import pickle

# Persist the tokenized articles (stop words kept) for later notebooks.
with open('../dataset/1111_dataset.pickle', mode='wb') as out_file:
    pickle.dump(df_preprocessed, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("saved to 1111_dataset.pickle")

saved to 1111_dataset.pickle


In [21]:
# Persist the tokenized articles from df_preprocessed_stopwords (stop words removed).
with open('../dataset/1111_stopwords_removed.pickle', mode='wb') as out_file:
    pickle.dump(df_preprocessed_stopwords, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("saved to 1111_stopwords_removed.pickle")

saved to 1111_stopwords_removed.pickle


In [10]:
# Ids 0-3 are reserved for the special tokens; every word collected in
# word_dict then receives the next consecutive id, in word_dict's insertion order.
word_index = {"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3}
first_word_id = len(word_index)
word_index.update(
    (token, token_id) for token_id, token in enumerate(word_dict, start=first_word_id)
)
# Next free id, i.e. the value the running counter ends on.
cnt = first_word_id + len(word_dict)

#print(word_index)

In [11]:
# Inverse lookup: integer id -> token.
reversed_word_index = {token_id: token for token, token_id in word_index.items()}

In [24]:
# Persist both lookup tables: (object, destination file, confirmation message).
index_exports = (
    (word_index, '../dataset/word_index.pickle', "saved to word_index.pickle"),
    (reversed_word_index, '../dataset/reversed_word_index.pickle', "saved to reversed_word_index.pickle"),
)

for payload, destination, confirmation in index_exports:
    with open(destination, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(confirmation)

saved to word_index.pickle
saved to reversed_word_index.pickle
