## Packages

In [1]:
import pandas as pd

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier

from functions import *

import warnings
warnings.filterwarnings("ignore")

## Import & clean data

In [2]:
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'
path_tu = 'data/tr_kenet-ud-dev.conllu'

path_data = [
    path_kz, 
    path_en, 
    path_tu
    ]

languages = [
    'kazakh',
    'english',
    'turkish'
    ]

dic_ = {}
for l in languages : 
    dic_[l] = {
            'test_acc' : '',
            'test_f1' : '',
            'train_acc' : '',
            "train_f1" : '',
            "Y" : '',
            "predicts" : '', 
            "list_tags" : ''
            }

In [3]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

for path, lang in tqdm(zip(path_data, languages)) :

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    # Read the file and convert it to a DataFrame
    df = pd.read_csv(path,
                    sep='\t',
                    names=columns,
                    skip_blank_lines=True
                    )

    # run the hand-made function to clean data
    X_lex, Y_lex = clean_data(df)

    X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                        Y_lex, 
                                                        test_size=0.1, 
                                                        random_state=42
                                                        )

    #get max word length
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    #Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, 
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter cooccurence matrix
    mx = mx.toarray()

    #Vectorize X only
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)
    
    test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                      Y_train, 
                      predicts_test, 
                      predicts_train
                      )
    print("Test Accuracy:", round(test_acc, 3))
    print("Test F1 Score:", round(test_f1, 3))
    print("Train Accuracy:", round(train_acc, 3))
    print("Train F1 Score:", round(train_f1, 3))


    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')
    
    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')


    df_tag_acc = per_tag_accuracy(Y_test, 
                              predicts_test, 
                              list_tags, 
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per Tag
    
    df_tag_dist = tag_prediction_nb(
        Y_test, 
        predicts_test, 
        list_tags, 
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')
    
    
    mistake_freq_df = mistake_frequency_by_word_type(Y_test, 
                                                    predicts_test, 
                                                    list_tags, 
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors


0it [00:00, ?it/s]

____________________________ KAZAKH CORPUS ____________________________
Size dataset :  (1000, 10)
Test Accuracy: 0.59
Test F1 Score: 0.56
Train Accuracy: 0.982
Train F1 Score: 0.982


Unnamed: 0,Tag,Accuracy
0,NOUN,0.914286
1,ADP,0.8
2,ADV,0.2
3,ADJ,0.272727
4,VERB,0.52381
5,PROPN,0.272727
6,NUM,0.5
7,PRON,0.75
8,SCONJ,
9,AUX,0.0


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,NOUN,32,3
1,ADP,4,1
2,ADV,1,4
3,ADJ,3,8
4,VERB,11,10
5,PROPN,3,8
6,NUM,2,2
7,PRON,3,1
8,SCONJ,0,0
9,AUX,0,4


Unnamed: 0,From Tag,To Tag,Frequency
9,VERB,NOUN,9
6,ADJ,NOUN,6
11,PROPN,NOUN,4
12,PROPN,ADJ,3
17,AUX,VERB,3
0,NOUN,VERB,2
3,ADV,NOUN,2
14,NUM,NOUN,2
1,NOUN,PROPN,1
2,ADP,NOUN,1


1it [00:05,  5.06s/it]

____________________________ ENGLISH CORPUS ____________________________
Size dataset :  (1000, 10)
Test Accuracy: 0.63
Test F1 Score: 0.611
Train Accuracy: 0.97
Train F1 Score: 0.97


Unnamed: 0,Tag,Accuracy
0,ADV,0.6
1,AUX,0.777778
2,VERB,0.4
3,ADP,1.0
4,NOUN,0.533333
5,PRON,1.0
6,DET,0.916667
7,CCONJ,1.0
8,PROPN,0.285714
9,ADJ,0.2


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,ADV,3,2
1,AUX,7,2
2,VERB,4,6
3,ADP,11,0
4,NOUN,8,7
5,PRON,8,0
6,DET,11,1
7,CCONJ,3,0
8,PROPN,2,5
9,ADJ,3,12


Unnamed: 0,From Tag,To Tag,Frequency
14,ADJ,NOUN,8
12,PROPN,NOUN,5
13,ADJ,VERB,4
10,NOUN,ADJ,4
5,VERB,NOUN,3
4,VERB,AUX,2
15,SCONJ,ADP,2
0,ADV,VERB,1
7,NOUN,ADV,1
6,VERB,PRON,1


2it [00:06,  3.17s/it]

____________________________ TURKISH CORPUS ____________________________
Size dataset :  (1000, 10)
Test Accuracy: 0.68
Test F1 Score: 0.653
Train Accuracy: 0.964
Train F1 Score: 0.964


Unnamed: 0,Tag,Accuracy
0,NOUN,0.913043
1,PROPN,0.0
2,VERB,0.307692
3,ADJ,0.384615
4,ADP,1.0
5,AUX,0.0
6,DET,1.0
7,CCONJ,1.0
8,ADV,0.333333
9,PRON,0.5


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,NOUN,42,4
1,PROPN,0,1
2,VERB,4,9
3,ADJ,5,8
4,ADP,2,0
5,AUX,0,1
6,DET,5,0
7,CCONJ,2,0
8,ADV,2,4
9,PRON,3,3


Unnamed: 0,From Tag,To Tag,Frequency
4,VERB,NOUN,7
6,ADJ,NOUN,7
9,ADV,NOUN,3
11,PRON,NOUN,3
1,NOUN,VERB,2
12,NUM,NOUN,2
5,VERB,ADJ,2
0,NOUN,PROPN,1
2,NOUN,ADJ,1
3,PROPN,NOUN,1


3it [00:08,  2.86s/it]
