# **Kazakh POS tagger**

In [1]:
import nltk
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from joblib import dump

In [2]:
# Parse the Kazakh Dependency Treebank in CoNLL-U format
def parse_conllu(file_path):
    sentences = []
    current_sentence = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == '':  # New sentence
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                # CoNLL-U format (columns: ID, FORM, LEMMA, UPOS, XPOS, etc.)
                parts = line.strip().split('\t')
                if len(parts) > 4:
                    word, pos_tag = parts[1], parts[3]  # FORM and UPOS
                    current_sentence.append((word, pos_tag))

    return sentences

In [3]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:3],
        'prefix-3': sentence[index][:4],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-3:],
        'suffix-3': sentence[index][-4:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

In [4]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [5]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []

    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])

    return X, y

In [7]:
kazakh_sentences = parse_conllu('kdt-NLANU-0.01.connlu.txt')

In [8]:
print(kazakh_sentences[0])

[('ҚТЖ', 'PROPN'), ('халықаралық', 'ADJ'), ('серіктестікті', 'NOUN'), ('кеңейтуде', 'VERB')]


In [9]:
cutoff = int(.75 * len(kazakh_sentences))
training_sentences = kazakh_sentences[:cutoff]
test_sentences = kazakh_sentences[cutoff:]

In [10]:
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

In [19]:
X_train

[{'word': 'ҚТЖ',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': True,
  'is_all_lower': False,
  'prefix-1': 'Қ',
  'prefix-2': 'ҚТЖ',
  'prefix-3': 'ҚТЖ',
  'suffix-1': 'Ж',
  'suffix-2': 'ҚТЖ',
  'suffix-3': 'ҚТЖ',
  'prev_word': '',
  'next_word': 'халықаралық',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': True},
 {'word': 'халықаралық',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 'х',
  'prefix-2': 'хал',
  'prefix-3': 'халы',
  'suffix-1': 'қ',
  'suffix-2': 'лық',
  'suffix-3': 'алық',
  'prev_word': 'ҚТЖ',
  'next_word': 'серіктестікті',
  'has_hyphen': False,
  'is_numeric': False,
  'capitals_inside': False},
 {'word': 'серіктестікті',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 'с',
  'prefix-2': 'сер',
  'prefix-3': 'сері',
  'suffix-1': 'і',
  'suf

In [20]:
y_train

['PROPN',
 'ADJ',
 'NOUN',
 'VERB',
 'NUM',
 'NUM',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'PRON',
 'ADJ',
 'ADJ',
 'NOUN',
 'PUNCT',
 'NOUN',
 'CONJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'VERB',
 'AUX',
 'AUX',
 'NOUN',
 'VERB',
 'AUX',
 'PUNCT',
 'PROPN',
 'NOUN',
 'NUM',
 'NOUN',
 'VERB',
 'AUX',
 'PUNCT',
 'PRON',
 'NOUN',
 'PRON',
 'NOUN',
 'PUNCT',
 'NOUN',
 'NOUN',
 'PUNCT',
 'PRON',
 'NOUN',
 'NOUN',
 'VERB',
 'PUNCT',
 'PROPN',
 'PUNCT',
 'NOUN',
 'PUNCT',
 'PROPN',
 'NOUN',
 'PUNCT',
 'PROPN',
 'VERB',
 'PUNCT',
 'NOUN',
 'ADJ',
 'PUNCT',
 'NOUN',
 'PUNCT',
 'PUNCT',
 'PROPN',
 'NOUN',
 'NOUN',
 'VERB',
 'PUNCT',
 'PROPN',
 'NOUN',
 'ADV',
 'PUNCT',
 'VERB',
 'NOUN',
 'NOUN',
 'PROPN',
 'PROPN',
 'NOUN',
 'PUNCT',
 'ADJ',
 'PROPN',
 'PUNCT',
 'NOUN',
 'VERB',
 'ADP',
 'NOUN',
 'VERB',
 'PUNCT',
 'PRON',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'VERB',
 'PUNCT',
 'PROPN',
 'NOUN',
 'NOUN',
 'PRON',
 'NOUN',
 'VERB',
 'NOUN',
 'NOUN',
 'PROPN',
 'PROPN',
 'PRON',
 '

In [11]:
len(X_train), len(y_train), len(X_test), len(y_test)

(702419, 702419, 232312, 232312)

In [12]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

In [13]:
clf.fit(X_train[:40000], y_train[:40000])  # Limit to 40K samples
print('Training completed')

Training completed


In [14]:
print("Accuracy:", clf.score(X_test[:20000], y_test[:20000]))

Accuracy: 0.89605


In [17]:
dump(clf, 'Kazakh_POS_Tagger.joblib')

['Kazakh_POS_Tagger.joblib']

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
def pos_tag(sentence):
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return list(zip(sentence, tags))

In [17]:
from nltk import word_tokenize
print(pos_tag(word_tokenize('Еңбек етсең ерінбей, тояды қарның тіленбей.')))

[('Еңбек', 'NOUN'), ('етсең', 'NOUN'), ('ерінбей', 'VERB'), (',', 'PUNCT'), ('тояды', 'NOUN'), ('қарның', 'NOUN'), ('тіленбей', 'AUX'), ('.', 'PUNCT')]


# NLTK approach

In [18]:
import nltk
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus.reader import TaggedCorpusReader

In [34]:
train_data = []
sent = []
for w, t in zip(X_train, y_train):
    if not w['is_last']:
        sent.append((w['word'], t))
    else:
        sent.append((w['word'], t))
        train_data.append(sent)
        sent = []
    #print((w['word'], t))

test_data = []
sent = []
for w, t in zip(X_test, y_test):
    if not w['is_last']:
        sent.append((w['word'], t))
    else:
        sent.append((w['word'], t))
        test_data.append(sent)
        sent = []
    

In [35]:
test_data

[[('Қос', 'ADJ'),
  ('қабат', 'NOUN'),
  ('тас', 'NOUN'),
  ('болат', 'NOUN'),
  ('үйдің', 'NOUN'),
  ('үстіңгі', 'ADJ'),
  ('бөлмесінде', 'NOUN'),
  ('электр', 'NOUN'),
  ('нұрына', 'NOUN'),
  ('шомылып', 'VERB'),
  (',', 'PUNCT'),
  ('қара', 'ADJ'),
  ('шолақ', 'ADJ'),
  ('бешпетін', 'NOUN'),
  ('жамылып', 'VERB'),
  (',', 'PUNCT'),
  ('Архимед', 'PROPN'),
  (',', 'PUNCT'),
  ('Торилел', 'PROPN'),
  (',', 'PUNCT'),
  ('Пифагор', 'PROPN'),
  (',', 'PUNCT'),
  ('Ньютон', 'PROPN'),
  (',', 'PUNCT'),
  ('Богданов', 'PROPN'),
  (',', 'PUNCT'),
  ('Каутский', 'PROPN'),
  (',', 'PUNCT'),
  ('Уйпперлермен', 'PROPN'),
  ('айналысып', 'VERB'),
  (',', 'PUNCT'),
  ('терең', 'ADJ'),
  ('ойға', 'NOUN'),
  ('бойлап', 'VERB'),
  ('отырған', 'AUX'),
  ('қошқыл', 'ADJ'),
  ('жігіт', 'NOUN'),
  ('кім', 'PRON'),
  ('?', 'PUNCT')],
 [('«', 'PUNCT'),
  ('Сен', 'PRON'),
  ('салар', 'VERB'),
  ('да', 'ADP'),
  (',', 'PUNCT'),
  ('мен', 'PRON'),
  ('салар', 'VERB'),
  ('»', 'PUNCT'),
  (',', 'PUNCT'),
  ('б

In [39]:
sentence = ["Қазақ", "халқының", "мәдени", "мұрасы"]

## Unigram Tagger

In [36]:
unigram_tagger = UnigramTagger(train_data)
print("Unigram Tagger Accuracy:", unigram_tagger.evaluate(test_data))

Unigram Tagger Accuracy: 0.9416345259822997


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Unigram Tagger Accuracy:", unigram_tagger.evaluate(test_data))


In [40]:
print(unigram_tagger.tag(sentence))

[('Қазақ', 'NOUN'), ('халқының', 'NOUN'), ('мәдени', 'ADJ'), ('мұрасы', 'NOUN')]


## Bigram Tagger (With Backoff)

In [37]:
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print("Bigram Tagger Accuracy:", bigram_tagger.evaluate(test_data))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Bigram Tagger Accuracy:", bigram_tagger.evaluate(test_data))


Bigram Tagger Accuracy: 0.95894744998106


In [41]:
print(bigram_tagger.tag(sentence))

[('Қазақ', 'NOUN'), ('халқының', 'NOUN'), ('мәдени', 'ADJ'), ('мұрасы', 'NOUN')]


## Trigram Tagger (Best for Context)

In [38]:
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print("Trigram Tagger Accuracy:", trigram_tagger.evaluate(test_data))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Trigram Tagger Accuracy:", trigram_tagger.evaluate(test_data))


Trigram Tagger Accuracy: 0.9634284927166914


In [42]:
print(trigram_tagger.tag(sentence))

[('Қазақ', 'NOUN'), ('халқының', 'NOUN'), ('мәдени', 'ADJ'), ('мұрасы', 'NOUN')]


## Results

|Model|Test accuracy|
|-----|-------------|
|DTC|0.89605|
|UnigramTagger|0.9416|
|BigramTagger|0.9589|
|TrigramTagger|0.9634|