## Packages

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report


# Benchmark Model
import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk import word_tokenize
from nltk.corpus.reader import TaggedCorpusReader
from nltk.metrics import precision, recall, f_measure

nltk.download('punkt')

# Home-made functions
#from functions import *
from fn_feature import *
from fn_nltk import *
from fn_results import *
from nltk_func import *
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Import & clean data

In [3]:
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'

path_data = [
    path_kz, 
    path_en
    ]

languages = [
    'kazakh',
    'english'
    ]

In [11]:
def compute_metrics(train_data, test_data):

    # Train taggers with backoff
    unigram_tagger = UnigramTagger(train_data)
    bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)

    # Helper function to calculate metrics
    def calculate_metrics(tagger, data):
        y_true = [pos for sentence in data for _, pos in sentence]
        y_pred = [tag for sentence in data for _, tag in tagger.tag([word for word, _ in sentence])]
        return {
            'Precision': precision(set(y_true), set(y_pred)),
            'Recall': recall(set(y_true), set(y_pred)),
            'F1-Score': f_measure(set(y_true), set(y_pred)),
        }

    # Calculate metrics for train and test data for each tagger
    metrics = {
    'Metric': [
        'Train Precision', 'Test Precision', 'Train Recall', 'Test Recall', 'Train F1-Score', 'Test F1-Score'
    ],
    'Unigram': [
        calculate_metrics(unigram_tagger, train_data)['Precision'],
        calculate_metrics(unigram_tagger, test_data)['Precision'],
        calculate_metrics(unigram_tagger, train_data)['Recall'],
        calculate_metrics(unigram_tagger, test_data)['Recall'],
        calculate_metrics(unigram_tagger, train_data)['F1-Score'],
        calculate_metrics(unigram_tagger, test_data)['F1-Score']
    ],
    'Bigram': [
        calculate_metrics(bigram_tagger, train_data)['Precision'],
        calculate_metrics(bigram_tagger, test_data)['Precision'],
        calculate_metrics(bigram_tagger, train_data)['Recall'],
        calculate_metrics(bigram_tagger, test_data)['Recall'],
        calculate_metrics(bigram_tagger, train_data)['F1-Score'],
        calculate_metrics(bigram_tagger, test_data)['F1-Score']
    ],
    'Trigram': [
        calculate_metrics(trigram_tagger, train_data)['Precision'],
        calculate_metrics(trigram_tagger, test_data)['Precision'],
        calculate_metrics(trigram_tagger, train_data)['Recall'],
        calculate_metrics(trigram_tagger, test_data)['Recall'],
        calculate_metrics(trigram_tagger, train_data)['F1-Score'],
        calculate_metrics(trigram_tagger, test_data)['F1-Score']
    ]
    }

    # Create a DataFrame to store the results
    metrics_df = pd.DataFrame(metrics)
    return metrics_df

In [13]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

for path, lang in tqdm(zip(path_data, languages)) :

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    kazakh_sentences = parse_conllu(path)


    cutoff = int(.75 * len(kazakh_sentences))
    training_sentences = kazakh_sentences[:cutoff]
    test_sentences = kazakh_sentences[cutoff:]

    X_train, y_train = transform_to_dataset(training_sentences)
    X_test, y_test = transform_to_dataset(test_sentences)

    ################################################
    ################## NLTK MODEL ##################
    ################################################
    
    train_data = []
    sent = []
    for w, t in zip(X_train, y_train):
        if not w['is_last']:
            sent.append((w['word'], t))
        else:
            sent.append((w['word'], t))
            train_data.append(sent)
            sent = []

    test_data = []
    sent = []
    for w, t in zip(X_test, y_test):
        if not w['is_last']:
            sent.append((w['word'], t))
        else:
            sent.append((w['word'], t))
            test_data.append(sent)
            sent = []


    y_test = [pos for sentence in test_data for _, pos in sentence]

    unigram_tagger = UnigramTagger(train_data)
    print("Unigram Tagger Accuracy:", unigram_tagger.evaluate(test_data)) 

    bigram_tagger = BigramTagger(train_data)
    print("Bigram Tagger Accuracy:", bigram_tagger.evaluate(test_data)) 

    trigram_tagger = TrigramTagger(train_data)
    print("Tigram Tagger Accuracy:", trigram_tagger.evaluate(test_data)) 

    metrics_nltk = compute_metrics(train_data, test_data)

    display(metrics_nltk)




    #####################################################
    ################ multi lingual model ################
    #####################################################
    X_lex, Y_lex = get_values(kazakh_sentences)

    X_test, y_test = extract_words_and_tags(test_data)
    X_train, y_train = extract_words_and_tags(train_data)

    #get max word length
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    #Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, 
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter cooccurence matrix
    mx = mx.toarray()

    #Vectorize X only
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)

    
    #####################################################################################################
    ########################################## result analysis ##########################################
    #####################################################################################################
    test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test, 
                                                                                            Y_train, 
                                                                                            predicts_test, 
                                                                                            predicts_train
                                                                                            )
    


    data = {
        "Metric": ["Test Accuracy", "Test F1 Score", "Test Recall", 
                "Train Accuracy", "Train F1 Score", "Train Recall"],
        "Multi-language POS Tagger": [test_acc, test_f1, test_recall,
                        train_acc, train_f1, train_recall]
    }

    # Create the DataFrame
    df_results = pd.DataFrame(data)

    # Display the DataFrame
    display(df_results)

    '''
    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')
    
    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')

    df_tag_acc = per_tag_accuracy(Y_test, 
                              predicts_test, 
                              list_tags, 
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per Tag
    
    df_tag_dist = tag_prediction_nb(
        Y_test, 
        predicts_test, 
        list_tags, 
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')
    
    
    mistake_freq_df = mistake_frequency_by_word_type(Y_test, 
                                                    predicts_test, 
                                                    list_tags, 
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors'''


0it [00:00, ?it/s]

____________________________ KAZAKH CORPUS ____________________________
Unigram Tagger Accuracy: 0.9416345259822997
Bigram Tagger Accuracy: 0.809618099796825
Tigram Tagger Accuracy: 0.7593193636144495


Unnamed: 0,Metric,Unigram,Bigram,Trigram
0,Train Precision,1.0,1.0,1.0
1,Test Precision,0.941176,0.944444,0.944444
2,Train Recall,0.941176,1.0,1.0
3,Test Recall,0.941176,1.0,1.0
4,Train F1-Score,0.969697,1.0,1.0
5,Test F1-Score,0.941176,0.971429,0.971429


Unnamed: 0,Metric,Multi-language POS Tagger
0,Test Accuracy,0.951953
1,Test F1 Score,0.951309
2,Test Recall,0.951953
3,Train Accuracy,0.957085
4,Train F1 Score,0.956441
5,Train Recall,0.957085


1it [07:48, 468.96s/it]

____________________________ ENGLISH CORPUS ____________________________
Unigram Tagger Accuracy: 0.7336946003483646
Bigram Tagger Accuracy: 0.14631314108767177
Tigram Tagger Accuracy: 0.10354170698664603


Unnamed: 0,Metric,Unigram,Bigram,Trigram
0,Train Precision,1.0,1.0,1.0
1,Test Precision,0.947368,0.947368,0.947368
2,Train Recall,1.0,1.0,1.0
3,Test Recall,1.0,1.0,1.0
4,Train F1-Score,1.0,1.0,1.0
5,Test F1-Score,0.972973,0.972973,0.972973


Unnamed: 0,Metric,Multi-language POS Tagger
0,Test Accuracy,0.825431
1,Test F1 Score,0.826527
2,Test Recall,0.825431
3,Train Accuracy,0.936741
4,Train F1 Score,0.936978
5,Train Recall,0.936741


2it [08:13, 246.75s/it]
