## Packages

In [None]:
import csv
import warnings

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Benchmark Model
import nltk
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
nltk.download("averaged_perceptron_tagger")

# Home-made functions
#from functions import *
from fn_feature import *
from fn_nltk import *
from fn_results import *

warnings.filterwarnings("ignore")

## Import & clean data

In [None]:
# Corpus files, one per language.
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'
#path_tu = 'data/tr_kenet-ud-dev.conllu'

# Single language -> corpus path mapping. Both lists below are derived from it
# in insertion order, so a language and its file cannot get out of step.
corpus_by_language = {
    'kazakh': path_kz,
    'english': path_en,
#    'turkish': path_tu,
}

# Parallel lists consumed by the experiment loop.
path_data = list(corpus_by_language.values())
languages = list(corpus_by_language)

In [None]:
# Names assigned to the tab-separated fields of each corpus file, in file order.
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

# For every (corpus file, language) pair: load and clean the data, train the
# NLTK bigram baseline and the character-feature ExtraTrees tagger on the same
# split, then display metrics and save the diagnostic figures.
# zip() has no length, so tqdm is told the number of corpora explicitly;
# otherwise it cannot show a total or an ETA.
for path, lang in tqdm(zip(path_data, languages),
                       total=min(len(path_data), len(languages))):

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    # Read the file and convert it to a DataFrame.
    # The corpus is plain tab-separated text, not quoted CSV: with pandas'
    # default quoting a token that starts with a double quote opens a quoted
    # field and swallows the following tab, shifting that row's columns.
    # QUOTE_NONE makes every quote character literal.
    df = pd.read_csv(path,
                    sep='\t',
                    names=columns,
                    skip_blank_lines=True,
                    quoting=csv.QUOTE_NONE,
                    )

    # Run the hand-made function to clean data
    df = clean_data(df)

    # Sentence-level data for the NLTK tagger (project helper)
    tagged_sentences = data_to_nltk(df)

    # Corpus-wide word / tag values (project helper); used below for the
    # character vocabulary and the maximum word length
    X_lex, Y_lex = get_values(df)

    # Split data into training and testing sets (seeded for reproducibility)
    train_data, test_data = train_test_split(tagged_sentences,
                                            test_size=0.1,
                                            random_state=42
                                            )


    ################## NLTK MODEL ##################
    # Setup and train BigramTagger; anything it cannot tag falls back to "NN"
    DFTagger = DefaultTagger("NN")
    Tagger = BigramTagger(train_data,
                        backoff=DFTagger)

    # Extract true labels and predicted labels for both splits
    Y_test_nltk, predicts_test_nltk = extract_tags(test_data, Tagger)
    Y_train_nltk, predicts_train_nltk = extract_tags(train_data, Tagger)
    ################################################


    # Words and tags of each split for the character-feature model
    X_test, y_test = extract_words_and_tags(test_data)
    X_train, y_train = extract_words_and_tags(train_data)

    # Longest entry over both the word list and the tag list
    max_word_len = max(max(len(w) for w in Y_lex), max(len(w) for w in X_lex))

    # Char2vec model: character-level TF-IDF, case preserved
    vectorizer = TfidfVectorizer(lowercase=False,
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter co-occurrence matrix
    mx = mx.toarray()

    # Vectorize X only (alpha_vec2 is a project helper)
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    # NOTE(review): list_tags is derived from y_train only. If
    # list_all_POS_tags returns just the tags it is given, a tag that occurs
    # only in the test split would make transform(y_test) raise -- confirm.
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model. The seed matches the split above so that repeated
    # runs produce the same forest and therefore the same reported metrics.
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True,
                                    random_state=42
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # Predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)


    #####################################################################################################
    ########################################## result analysis ##########################################
    #####################################################################################################
    test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test,
                                                                                            Y_train,
                                                                                            predicts_test,
                                                                                            predicts_train
                                                                                            )
    test_acc_nltk, test_f1_nltk, test_recall_nltk, train_acc_nltk, train_f1_nltk, train_recall_nltk = calculate_results(Y_test_nltk,
                                                                                            Y_train_nltk,
                                                                                            predicts_test_nltk,
                                                                                            predicts_train_nltk
                                                                                            )

    # Side-by-side metric table: one row per metric, one column per model
    data = {
        "Metric": ["Test Accuracy", "Test F1 Score", "Test Recall",
                "Train Accuracy", "Train F1 Score", "Train Recall"],
        "Multi-language POS Tagger": [test_acc, test_f1, test_recall,
                        train_acc, train_f1, train_recall],
        "NLTK POS Tagger": [test_acc_nltk, test_f1_nltk, test_recall_nltk,
                    train_acc_nltk, train_f1_nltk, train_recall_nltk]
    }

    # Create the DataFrame
    df_results = pd.DataFrame(data)

    # Display the DataFrame
    display(df_results)


    # Confusion matrices for both splits, saved under the language name
    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')

    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')

    df_tag_acc = per_tag_accuracy(Y_test,
                              predicts_test,
                              list_tags,
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per tag

    df_tag_dist = tag_prediction_nb(
        Y_test,
        predicts_test,
        list_tags,
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorrect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')


    mistake_freq_df = mistake_frequency_by_word_type(Y_test,
                                                    predicts_test,
                                                    list_tags,
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors