In [1]:
import pandas as pd

import numpy as np

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report


import nltk
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger

nltk.download("averaged_perceptron_tagger")

from functions import *

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Olivier\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'
#path_tu = 'data/tr_kenet-ud-dev.conllu'

path_data = [
    path_kz, 
    path_en, 
#    path_tu
    ]

languages = [
    'kazakh',
    'english',
#    'turkish'
    ]

dic_ = {}
for l in languages : 
    dic_[l] = {
            'test_acc' : '',
            'test_f1' : '',
            'train_acc' : '',
            "train_f1" : '',
            "Y" : '',
            "predicts" : '', 
            "list_tags" : ''
            }

In [None]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

for path, lang in tqdm(zip(path_data, languages)) :

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    # Read the file and convert it to a DataFrame
    df = pd.read_csv(path,
                    sep='\t',
                    names=columns,
                    skip_blank_lines=True
                    )

    # run the hand-made function to clean data
    X_lex, Y_lex = clean_data(df)
    
    # Split into train & test sets
    X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                        Y_lex, 
                                                        test_size=0.1, 
                                                        random_state=42
                                                        )

    #get max word length
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    #Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, 
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter cooccurence matrix
    mx = mx.toarray()

    #Vectorize X only
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)

    test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                      Y_train, 
                      predicts_test, 
                      predicts_train
                      )
    print("Test Accuracy:", round(test_acc, 3))
    print("Test F1 Score:", round(test_f1, 3))
    print("Train Accuracy:", round(train_acc, 3))
    print("Train F1 Score:", round(train_f1, 3))

    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')
    
    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')

    df_tag_acc = per_tag_accuracy(Y_test, 
                              predicts_test, 
                              list_tags, 
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per Tag
    
    df_tag_dist = tag_prediction_nb(
        Y_test, 
        predicts_test, 
        list_tags, 
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')
    
    
    mistake_freq_df = mistake_frequency_by_word_type(Y_test, 
                                                    predicts_test, 
                                                    list_tags, 
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors

In [None]:
"""
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]


df = pd.read_csv(path_kz,
                sep='\t',
                names=columns,
                skip_blank_lines=True
                )

# run the hand-made function to clean data
X_lex, Y_lex = clean_data(df)


# Split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                    Y_lex, 
                                                    test_size=0.1, 
                                                    random_state=42
                                                    )

#get max word length
max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

#Char2vec model
vectorizer = TfidfVectorizer(lowercase=False, 
                            analyzer='char'
                            )


X = vectorizer.fit_transform(X_lex)
dic = vectorizer.get_feature_names_out() # letter dictionary
num_letters = len(dic)
mx = X.T.dot(X) # letter cooccurence matrix
mx = mx.toarray()

#Vectorize X only
X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

# Encode Y
list_tags = list_all_POS_tags(y = y_train)
encoder_tag = LabelEncoder().fit(list_tags)

Y_train = encoder_tag.transform(y_train)
Y_test = encoder_tag.transform(y_test)

# Build & train model
best_model = ExtraTreesClassifier(n_estimators=10,
                                n_jobs=-1,
                                criterion='entropy',
                                bootstrap=True
                                )

best_model.fit(X_lex_vec_train, Y_train)

# predict both train and test sets
predicts_test = best_model.predict(X_lex_vec_test)
predicts_train = best_model.predict(X_lex_vec_train)

test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                    Y_train, 
                    predicts_test, 
                    predicts_train
                    )"""

In [None]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

df = pd.read_csv(path_en,
                sep='\t',
                names=columns,
                skip_blank_lines=True
                )

df_=df.head(n=300000)

In [5]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

for path, lang in tqdm(zip(path_data, languages)) :

    print("____________________________" , lang.upper(), "CORPUS ____________________________")

    # Read the file and convert it to a DataFrame
    df = pd.read_csv(path,
                    sep='\t',
                    names=columns,
                    skip_blank_lines=True
                    )

    # run the hand-made function to clean data
    df = clean_data(df)

    # data for home made tagger
    X_lex, Y_lex = get_values(df)

    # data for nltk tagger
    

    
    # Split into train & test sets
    X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                        Y_lex, 
                                                        test_size=0.1, 
                                                        random_state=42
                                                        )

    #get max word length
    max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

    #Char2vec model
    vectorizer = TfidfVectorizer(lowercase=False, 
                                analyzer='char'
                                )

    X = vectorizer.fit_transform(X_lex)
    dic = vectorizer.get_feature_names_out() # letter dictionary
    num_letters = len(dic)
    mx = X.T.dot(X) # letter cooccurence matrix
    mx = mx.toarray()

    #Vectorize X only
    X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
    X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

    # Encode Y
    list_tags = list_all_POS_tags(y = y_train)
    encoder_tag = LabelEncoder().fit(list_tags)

    Y_train = encoder_tag.transform(y_train)
    Y_test = encoder_tag.transform(y_test)

    # Build & train model
    best_model = ExtraTreesClassifier(n_estimators=10,
                                    n_jobs=-1,
                                    criterion='entropy',
                                    bootstrap=True
                                    )

    best_model.fit(X_lex_vec_train, Y_train)

    # predict both train and test sets
    predicts_test = best_model.predict(X_lex_vec_test)
    predicts_train = best_model.predict(X_lex_vec_train)

    test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test, 
                                                                                            Y_train, 
                                                                                            predicts_test, 
                                                                                            predicts_train
                                                                                            )

    print("Test Accuracy:", round(test_acc, 3))
    print("Test F1 Score:", round(test_f1, 3))
    print("Test recall:", round(test_recall, 3))
    print("Train Accuracy:", round(train_acc, 3))
    print("Train F1 Score:", round(train_f1, 3))
    print("Train recall:", round(train_recall, 3))

    fig = plot_confusion_matrix(Y_test, predicts_test, list_tags, 'Test set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_test')
    
    fig = plot_confusion_matrix(Y_train, predicts_train, list_tags, 'Train set', lang)
    save_graph_to_folder(fig, lang, 'confusion_matrix_train')

    df_tag_acc = per_tag_accuracy(Y_test, 
                              predicts_test, 
                              list_tags, 
                              encoder_tag
                              )

    display(df_tag_acc) # display accuracy per Tag
    
    df_tag_dist = tag_prediction_nb(
        Y_test, 
        predicts_test, 
        list_tags, 
        encoder_tag
        )

    display(df_tag_dist) # display the number of correct and incorect predictions for each tag


    fig = plot_dist_predictions(df_tag_dist,
                                lang)
    save_graph_to_folder(fig, lang, 'dist_predictions')
    
    
    mistake_freq_df = mistake_frequency_by_word_type(Y_test, 
                                                    predicts_test, 
                                                    list_tags, 
                                                    encoder_tag
                                                    )

    display(mistake_freq_df.head(n=10)) # Print 10 most frequent errors

0it [00:00, ?it/s]

____________________________ KAZAKH CORPUS ____________________________
Size dataset :  (20000, 10)
Test Accuracy: 0.854
Test F1 Score: 0.851
Test recall: 0.854
Train Accuracy: 0.968
Train F1 Score: 0.968
Train recall: 0.968


Unnamed: 0,Tag,Accuracy
0,VERB,0.753799
1,NOUN,0.904918
2,PUNCT,1.0
3,ADV,0.559322
4,ADJ,0.66242
5,PROPN,0.865385
6,NUM,0.82
7,PRON,0.896552
8,ADP,0.979798
9,AUX,0.528571


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,VERB,248,81
1,NOUN,552,58
2,PUNCT,372,0
3,ADV,33,26
4,ADJ,104,53
5,PROPN,135,21
6,NUM,41,9
7,PRON,78,9
8,ADP,97,2
9,AUX,37,33


Unnamed: 0,From Tag,To Tag,Frequency
0,VERB,NOUN,53
21,ADJ,NOUN,32
6,NOUN,VERB,24
4,VERB,AUX,21
40,AUX,VERB,20
28,PROPN,NOUN,15
8,NOUN,ADJ,13
16,ADV,NOUN,11
42,AUX,ADJ,11
9,NOUN,PROPN,9


1it [00:09,  9.76s/it]

____________________________ ENGLISH CORPUS ____________________________
Size dataset :  (20000, 10)
Test Accuracy: 0.871
Test F1 Score: 0.87
Test recall: 0.871
Train Accuracy: 0.949
Train F1 Score: 0.948
Train recall: 0.949


Unnamed: 0,Tag,Accuracy
0,DET,0.962733
1,SCONJ,0.652174
2,CCONJ,0.980392
3,NOUN,0.871642
4,ADJ,0.717557
5,VERB,0.786008
6,PRON,0.917197
7,ADV,0.641304
8,PROPN,0.778443
9,AUX,0.926471


Unnamed: 0,Tag,Correct Predictions,Incorrect Predictions
0,DET,155,6
1,SCONJ,30,16
2,CCONJ,50,1
3,NOUN,292,43
4,ADJ,94,37
5,VERB,191,52
6,PRON,144,13
7,ADV,59,33
8,PROPN,130,37
9,AUX,126,10


Unnamed: 0,From Tag,To Tag,Frequency
16,VERB,NOUN,34
34,PROPN,NOUN,25
8,NOUN,VERB,21
5,SCONJ,ADP,14
12,ADJ,NOUN,14
28,ADV,ADJ,12
13,ADJ,VERB,11
10,NOUN,PROPN,11
19,VERB,AUX,10
31,ADV,ADP,10


2it [00:24, 12.01s/it]


In [None]:
"""
# Check if both y_true and y_pred contain unknown tags, and ensure consistency
print(f"Unique tags in y_true: {set(y_true)}")
print(f"Unique tags in y_pred: {set(y_pred)}")
"""

test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test, 
                                                                                        Y_train, 
                                                                                        predicts_test, 
                                                                                        predicts_train
                                                                                        )

print("Test Accuracy:", round(test_acc, 3))
print("Test F1 Score:", round(test_f1, 3))
print("Test recall:", round(test_recall, 3))
print("Train Accuracy:", round(train_acc, 3))
print("Train F1 Score:", round(train_f1, 3))
print("Train recall:", round(train_recall, 3))

In [None]:
def data_to_nltk(df):
    # Convert the data into the format that NLTK expects (list of tuples)
    tagged_sentences = []
    sentence = []

    for _, row in df.iterrows():
        if row['WORD'] == ".":  # End of a sentence (you may need to adjust this)
            sentence.append((row['WORD'], row['POS']))
            tagged_sentences.append(sentence)
            sentence = []
        else:
            sentence.append((row['WORD'], row['POS']))

    # Handle any remaining sentence
    if sentence:
        tagged_sentences.append(sentence)
    return tagged_sentences

def extract_words_and_tags(nested_list):
    # Flatten the nested list of tuples
    words = [word for sentence in nested_list for word, _ in sentence]
    tags = [tag for sentence in nested_list for _, tag in sentence]
    
    # Convert the lists to numpy arrays
    words_array = np.array(words, dtype=object)
    tags_array = np.array(tags, dtype=object)
    
    return words_array, tags_array

# Function to extract words and POS tags for classification report
def extract_tags(tagged_data, tagger):
    y_true = []
    y_pred = []
    for sentence in tagged_data:
        words, true_tags = zip(*sentence)  # separate words and tags
        predicted_tags = []
        
        # Predict tags, handling unknown tags
        for word in words:
            
            prediction = tagger.tag([word])[0][1] if tagger.tag([word]) else "UNK"
            predicted_tags.append(prediction)
        y_true.extend(true_tags)
        y_pred.extend(predicted_tags)
    
    return y_true, y_pred

'def nltk_model(\n        train_data, \n        test_data\n        ) : \n    DFTagger = DefaultTagger("NN")\n    # Train the Unigram Tagger\n    BigramTagger = BigramTagger(train_data, \n                                backoff=DFTagger)\n\n    # Extract true labels and predicted labels from the test data\n    Y_test, predicts_test = extract_tags(test_data)\n    Y_train, predicts_train = extract_tags(train_data)\n\n    return Y_test, predicts_test, Y_train, predicts_train'

In [None]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]

# Read the file and convert it to a DataFrame
df = pd.read_csv(path_en,
                sep='\t',
                names=columns,
                skip_blank_lines=True
                )

# run the hand-made function to clean data
df = clean_data(df)

tagged_sentences = data_to_nltk(df)

X_lex, Y_lex = get_values(df)

# Split data into training and testing sets
train_data, test_data = train_test_split(tagged_sentences, 
                                         test_size=0.1, 
                                         random_state=42
                                         )


################## NLTK MODEL ##################
# setup and train BigramTagger
DFTagger = DefaultTagger("NN")
Tagger = BigramTagger(train_data, 
                    backoff=DFTagger)

# Extract true labels and predicted labels from the test data
Y_test_nltk, predicts_test_nltk = extract_tags(test_data, Tagger)
Y_train_nltk, predicts_train_nltk = extract_tags(train_data, Tagger)
################################################



X_test, y_test = extract_words_and_tags(test_data)
X_train, y_train = extract_words_and_tags(train_data)

#get max word length
max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

#Char2vec model
vectorizer = TfidfVectorizer(lowercase=False, 
                            analyzer='char'
                            )

X = vectorizer.fit_transform(X_lex)
dic = vectorizer.get_feature_names_out() # letter dictionary
num_letters = len(dic)
mx = X.T.dot(X) # letter cooccurence matrix
mx = mx.toarray()

#Vectorize X only
X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

# Encode Y
list_tags = list_all_POS_tags(y = y_train)
encoder_tag = LabelEncoder().fit(list_tags)

Y_train = encoder_tag.transform(y_train)
Y_test = encoder_tag.transform(y_test)

# Build & train model
best_model = ExtraTreesClassifier(n_estimators=10,
                                n_jobs=-1,
                                criterion='entropy',
                                bootstrap=True
                                )

best_model.fit(X_lex_vec_train, Y_train)

# predict both train and test sets
predicts_test = best_model.predict(X_lex_vec_test)
predicts_train = best_model.predict(X_lex_vec_train)

Size dataset :  (20000, 10)


In [10]:
test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test, 
                                                                                        Y_train, 
                                                                                        predicts_test, 
                                                                                        predicts_train
                                                                                        )

print("Test Accuracy:", round(test_acc, 3))
print("Test F1 Score:", round(test_f1, 3))
print("Test recall:", round(test_recall, 3))
print("Train Accuracy:", round(train_acc, 3))
print("Train F1 Score:", round(train_f1, 3))
print("Train recall:", round(train_recall, 3))

Test Accuracy: 0.842
Test F1 Score: 0.837
Test recall: 0.842
Train Accuracy: 0.95
Train F1 Score: 0.948
Train recall: 0.95


In [11]:
test_acc, test_f1, test_recall, train_acc, train_f1, train_recall = calculate_results(Y_test_nltk, 
                                                                                        Y_train_nltk, 
                                                                                        predicts_test_nltk, 
                                                                                        predicts_train_nltk
                                                                                        )

print("Test Accuracy:", round(test_acc, 3))
print("Test F1 Score:", round(test_f1, 3))
print("Test recall:", round(test_recall, 3))
print("Train Accuracy:", round(train_acc, 3))
print("Train F1 Score:", round(train_f1, 3))
print("Train recall:", round(train_recall, 3))

Test Accuracy: 0.288
Test F1 Score: 0.368
Test recall: 0.288
Train Accuracy: 0.308
Train F1 Score: 0.391
Train recall: 0.308
