In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier


In [21]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

# Read the file and convert it to a DataFrame
df = pd.read_csv('kdt-NLANU-0.01.connlu.txt\kdt-NLANU-0.01.connlu.txt', 
                 sep='\t', 
                 names=columns, 
                 skip_blank_lines=True)

# Drop rows where 'WORD' is NaN
df = df.dropna(subset=['WORD'])

df = df[df['POS'] != 'PUNCT']

# Characters to remove
chars_to_remove = r"[\#\$\%\&\(\)\+\,\-\.\–\’\:\@]"

# Removing the characters from the 'WORD' column
df['WORD'] = df['WORD'].str.replace(chars_to_remove, '', regex=True)


#df['ID'] = df['ID'].astype(int)

# Display the first few rows of the cleaned DataFrame
df.head()

  df = pd.read_csv('kdt-NLANU-0.01.connlu.txt\kdt-NLANU-0.01.connlu.txt',


Unnamed: 0,ID,WORD,LEMMA,POS,XPOS,MORPH,HEAD,DEPREL,DEPS,MISC
1,1,ҚТЖ,ҚТЖ,PROPN,PROPN,_,4,nsubj,_,_
2,2,халықаралық,халықаралық,ADJ,ADJ,_,3,amod,_,_
3,3,серіктестікті,серіктестік,NOUN,NOUN,Case=Acc,4,dobj,_,_
4,4,кеңейтуде,кеңей,VERB,VERB,Person=3|vbTense=Aor|vbVcCaus=True,0,root,_,_
6,1,160,160,NUM,NUM,_,2,compound,_,_


In [22]:
def get_values(df):
    X_lex = df['WORD'].str.strip()
    X_lex = X_lex.values

    y_lex = df['POS'].str.strip()
    y_lex = y_lex.values

    return X_lex, y_lex
df.shape[0] 

X_lex, Y_lex = get_values(df)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_lex, Y_lex, test_size=0.1, random_state=42)
    
#get max word length
max_word_len=max(max([len(w) for w in Y_lex]),max([len(w) for w in X_lex]))

#Char2vec model
vectorizer = TfidfVectorizer(lowercase=False, analyzer='char')
X = vectorizer.fit_transform(X_lex)
dic=vectorizer.get_feature_names_out()#letter dictionary
num_letters=len(dic)
mx=X.T.dot(X) #letter cooccurence matrix
mx=mx.toarray()
    

In [24]:
mx.shape

(147, 147)

In [25]:
dic

array([' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
       'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
       'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b',
       'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
       'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '³', 'ë',
       'Ё', 'І', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К',
       'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
       'Ш', 'Щ', 'Ы', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж',
       'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у',
       'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё',
       'і', 'Ғ', 'ғ', 'Қ', 'қ', 'ң', 'Ү', 'ү', 'Ұ', 'ұ', 'Һ', 'һ', 'Ә',
       'ә', 'Ө', 'ө', '№'], dtype=object)

In [26]:
#vec encoding of words
def alpha_vec2(w, mx, max_word_len, dic):
    vec = np.zeros((max_word_len, len(dic)))    
    for i in range(0, len(w)):
        vec[i] = mx[np.where(dic == w[i])[0][0]]
    vec = vec.astype('float16').flatten()
        
    vec=vec.astype('float16').flatten()
    vec[vec==np.inf]=0 
    vec[vec==-np.inf]=0        
    return vec



#ordinal encoding of words
def alpha_vec2ord(w, max_word_len):
    vec=np.zeros(max_word_len)    
    for i in range(0, len(w)):        
        vec[i]=ord(w[i])    
    return vec.astype('int')

In [27]:
#Vectorize
X_lex_vec_train=[alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
Y_lex_vec_train=[alpha_vec2ord(w, max_word_len) for w in y_train]

X_lex_vec_test=[alpha_vec2(w, mx, max_word_len, dic) for w in X_test]
Y_lex_vec_test=[alpha_vec2ord(w, max_word_len) for w in y_test]

  vec = vec.astype('float16').flatten()


In [33]:
#Build model
best_model=ExtraTreesClassifier(n_estimators=10,
                                n_jobs=-1,
                                criterion='entropy',
                                bootstrap=True,
                                verbose=1)

best_model.fit(X_lex_vec_train, 
               Y_lex_vec_train)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.


KeyboardInterrupt: 

In [None]:

#Test
predicts_test=best_model.predict(X_lex_vec_test)
predicts_train=best_model.predict(X_lex_vec_train)
test_acc=sum([sum(p==y)==max_word_len for p,y in zip(predicts_test, Y_lex_vec_test)])/len(predicts_test)
train_acc=sum([sum(p==y)==max_word_len for p,y in zip(predicts_train, Y_lex_vec_train)])/len(predicts_train)