## Packages

In [1]:
import pandas as pd

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier

from functions import *

import warnings
warnings.filterwarnings("ignore")

## Import & clean data

In [2]:
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'
path_tu = 'data/tr_kenet-ud-dev.conllu'

path_data = [
    path_kz, 
    path_en, 
    path_tu
    ]

languages = [
    'kazakh',
    'english',
    'turkish'
    ]

dic_ = {}
for l in languages : 
    dic_[l] = {
            'test_acc' : '',
            'test_f1' : '',
            'train_acc' : '',
            "train_f1" : '',
            "Y" : '',
            "predicts" : '', 
            "list_tags" : ''
            }

In [None]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

for path, lang in tqdm(zip(path_data, languages)) :

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    # Read the file and convert it to a DataFrame
    df = pd.read_csv(path,
                    sep='\t',
                    names=columns,
                    skip_blank_lines=True
                    )

    # run the hand-made function to clean data
    X_lex, Y_lex = clean_data(df)
    
    # Split into train & test sets
    X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                        Y_lex, 
                                                        test_size=0.1, 
                                                        random_state=42
                                                        )

    #get max word length
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    #Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, 
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter cooccurence matrix
    mx = mx.toarray()

    #Vectorize X only
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)
    
    test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                      Y_train, 
                      predicts_test, 
                      predicts_train
                      )
    print("Test Accuracy:", round(test_acc, 3))
    print("Test F1 Score:", round(test_f1, 3))
    print("Train Accuracy:", round(train_acc, 3))
    print("Train F1 Score:", round(train_f1, 3))


    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')
    
    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')


    df_tag_acc = per_tag_accuracy(Y_test, 
                              predicts_test, 
                              list_tags, 
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per Tag
    
    df_tag_dist = tag_prediction_nb(
        Y_test, 
        predicts_test, 
        list_tags, 
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')
    
    
    mistake_freq_df = mistake_frequency_by_word_type(Y_test, 
                                                    predicts_test, 
                                                    list_tags, 
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors


0it [00:00, ?it/s]

____________________________ KAZAKH CORPUS ____________________________
Size dataset :  (10000, 10)
Test Accuracy: 0.792
Test F1 Score: 0.788
Train Accuracy: 0.965
Train F1 Score: 0.965


Unnamed: 0,Tag,Accuracy
0,ADV,0.5
1,PROPN,0.815534
2,NOUN,0.881081
3,ADJ,0.680328
4,VERB,0.75
5,NUM,0.742857
6,PRON,0.816327
7,ADP,1.0
8,SCONJ,1.0
9,AUX,0.487179


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,ADV,15,15
1,PROPN,84,19
2,NOUN,326,44
3,ADJ,83,39
4,VERB,153,51
5,NUM,26,9
6,PRON,40,9
7,ADP,43,0
8,SCONJ,2,0
9,AUX,19,20


Unnamed: 0,From Tag,To Tag,Frequency
21,VERB,NOUN,38
15,ADJ,NOUN,29
10,NOUN,ADJ,19
32,AUX,VERB,17
11,NOUN,VERB,15
4,PROPN,NOUN,13
0,ADV,NOUN,10
26,NUM,PRON,7
28,PRON,NOUN,6
22,VERB,ADJ,6


1it [00:06,  6.01s/it]

____________________________ ENGLISH CORPUS ____________________________
Size dataset :  (10000, 10)
Test Accuracy: 0.821
Test F1 Score: 0.817
Train Accuracy: 0.946
Train F1 Score: 0.944


Unnamed: 0,Tag,Accuracy
0,ADP,0.946809
1,PROPN,0.772727
2,VERB,0.683099
3,ADJ,0.72619
4,NOUN,0.767568
5,SCONJ,0.277778
6,ADV,0.638298
7,NUM,1.0
8,CCONJ,1.0
9,DET,0.945652


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,ADP,89,5
1,PROPN,68,20
2,VERB,97,45
3,ADJ,61,23
4,NOUN,142,43
5,SCONJ,5,13
6,ADV,30,17
7,NUM,28,0
8,CCONJ,34,0
9,DET,87,5


Unnamed: 0,From Tag,To Tag,Frequency
11,VERB,NOUN,31
21,NOUN,VERB,14
17,ADJ,NOUN,13
22,NOUN,ADJ,12
20,NOUN,PROPN,11
6,PROPN,NOUN,10
13,VERB,AUX,6
34,ADV,ADJ,6
30,SCONJ,PRON,5
15,ADJ,PROPN,5


2it [00:11,  5.79s/it]

____________________________ TURKISH CORPUS ____________________________
Size dataset :  (10000, 10)
Test Accuracy: 0.774
Test F1 Score: 0.765
Train Accuracy: 0.953
Train F1 Score: 0.953


Unnamed: 0,Tag,Accuracy
0,NOUN,0.898851
1,PRON,0.911765
2,VERB,0.64375
3,ADV,0.546667
4,DET,0.964912
5,ADJ,0.550725
6,ADP,0.916667
7,NUM,0.764706
8,CCONJ,0.9
9,PROPN,0.25


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,NOUN,391,44
1,PRON,31,3
2,VERB,103,57
3,ADV,41,34
4,DET,55,2
5,ADJ,76,62
6,ADP,22,2
7,NUM,13,4
8,CCONJ,27,3
9,PROPN,5,15


Unnamed: 0,From Tag,To Tag,Frequency
10,VERB,NOUN,45
20,ADJ,NOUN,45
3,NOUN,ADJ,17
13,ADV,NOUN,16
33,PROPN,NOUN,14
1,NOUN,VERB,12
11,VERB,ADJ,11
21,ADJ,VERB,10
16,ADV,ADJ,8
2,NOUN,ADV,6


3it [00:14,  4.75s/it]
