In [1]:
import pandas as pd

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import ExtraTreesClassifier

from functions import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
path_kz = 'data/kdt-NLANU-0.01.connlu.txt'
path_en = 'data/en_ewt-ud-dev.conllu'
#path_tu = 'data/tr_kenet-ud-dev.conllu'

path_data = [
    path_kz, 
    path_en, 
#    path_tu
    ]

languages = [
    'kazakh',
    'english',
#    'turkish'
    ]

dic_ = {}
for l in languages : 
    dic_[l] = {
            'test_acc' : '',
            'test_f1' : '',
            'train_acc' : '',
            "train_f1" : '',
            "Y" : '',
            "predicts" : '', 
            "list_tags" : ''
            }

In [20]:
columns = ["ID", "WORD", "LEMMA", "POS", "XPOS", "MORPH", "HEAD", "DEPREL", "DEPS", "MISC"]


df = pd.read_csv(path_kz,
                sep='\t',
                names=columns,
                skip_blank_lines=True
                )

# run the hand-made function to clean data
X_lex, Y_lex = clean_data(df)


# Split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X_lex, 
                                                    Y_lex, 
                                                    test_size=0.1, 
                                                    random_state=42
                                                    )

#get max word length
max_word_len = max(max([len(w) for w in Y_lex]), max([len(w) for w in X_lex]))

#Char2vec model
vectorizer = TfidfVectorizer(lowercase=False, 
                            analyzer='char'
                            )


X = vectorizer.fit_transform(X_lex)
dic = vectorizer.get_feature_names_out() # letter dictionary
num_letters = len(dic)
mx = X.T.dot(X) # letter cooccurence matrix
mx = mx.toarray()

#Vectorize X only
X_lex_vec_train = [alpha_vec2(w, mx, max_word_len, dic) for w in X_train]
X_lex_vec_test = [alpha_vec2(w, mx, max_word_len, dic) for w in X_test]

# Encode Y
list_tags = list_all_POS_tags(y = y_train)
encoder_tag = LabelEncoder().fit(list_tags)

Y_train = encoder_tag.transform(y_train)
Y_test = encoder_tag.transform(y_test)

# Build & train model
best_model = ExtraTreesClassifier(n_estimators=10,
                                n_jobs=-1,
                                criterion='entropy',
                                bootstrap=True
                                )

best_model.fit(X_lex_vec_train, Y_train)

# predict both train and test sets
predicts_test = best_model.predict(X_lex_vec_test)
predicts_train = best_model.predict(X_lex_vec_train)

test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                    Y_train, 
                    predicts_test, 
                    predicts_train
                    )

Size dataset :  (20000, 10)


In [23]:
df_ = df.head(n=1000)

In [78]:
import nltk
from nltk.tag import BigramTagger
from sklearn.metrics import classification_report
# Filter relevant columns (WORD and POS)
df_ = df_[['WORD', 'POS']]

# Remove rows where POS or WORD are NaN (or filter out irrelevant rows)
df_.dropna(subset=['WORD', 'POS'], inplace=True)


# Convert the data into the format that NLTK expects (list of tuples)
tagged_sentences = []
sentence = []

for _, row in df_.iterrows():
    if row['WORD'] == ".":  # End of a sentence (you may need to adjust this)
        sentence.append((row['WORD'], row['POS']))
        tagged_sentences.append(sentence)
        sentence = []
    else:
        sentence.append((row['WORD'], row['POS']))

# Handle any remaining sentence
if sentence:
    tagged_sentences.append(sentence)

# Split data into training and testing sets
train_data, test_data = train_test_split(tagged_sentences, test_size=0.1, random_state=42)

# Train the Unigram Tagger
BigramTagger = BigramTagger(train_data)

# Function to extract words and POS tags for classification report
def extract_tags(tagged_data):
    y_true = []
    y_pred = []
    for sentence in tagged_data:
        words, true_tags = zip(*sentence)  # separate words and tags
        predicted_tags = []
        
        # Predict tags, handling unknown tags
        for word in words:
            
            prediction = BigramTagger.tag([word])[0][1] if BigramTagger.tag([word]) else "UNK"
            predicted_tags.append(prediction)
        y_true.extend(true_tags)
        y_pred.extend(predicted_tags)
    
    return y_true, y_pred

# Extract true labels and predicted labels from the test data
Y_test, predicts_test = extract_tags(test_data)
Y_train, predicts_train = extract_tags(train_data)

"""
# Check if both y_true and y_pred contain unknown tags, and ensure consistency
print(f"Unique tags in y_true: {set(y_true)}")
print(f"Unique tags in y_pred: {set(y_pred)}")
"""

test_acc, test_f1, train_acc, train_f1 = calculate_results(Y_test, 
                    Y_train, 
                    predicts_test, 
                    predicts_train
                    )


ValueError: Classification metrics can't handle a mix of multiclass and unknown targets

In [82]:
print(f"Unique tags in y_true: {set(Y_test)}")
print(f"Unique tags in y_pred: {set(predicts_test)}")

Unique tags in y_true: {'PUNCT', 'PROPN', 'AUX', 'PRON', 'ADJ', 'NUM', 'CONJ', 'NOUN', 'VERB'}
Unique tags in y_pred: {None, 'PUNCT', 'PRON'}


In [81]:
train_f1 = f1_score(Y_test,
                        predicts_test,
                        average = "weighted"
                        )

ValueError: Classification metrics can't handle a mix of multiclass and unknown targets

In [76]:
train_data

[[('.', 'PUNCT')],
 [('Бәрі', 'PRON'),
  ('қарулы', 'ADJ'),
  (',', 'PUNCT'),
  ('көбінің', 'NOUN'),
  ('қолында', 'NOUN'),
  ('сойыл', 'NOUN')],
 [('“', 'PUNCT'),
  ('Аққу', 'NOUN'),
  ('”', 'PUNCT'),
  ('дегеніміз', 'VERB'),
  ('“', 'PUNCT'),
  ('қара', 'ADJ'),
  ('құзғын', 'NOUN'),
  ('”', 'PUNCT'),
  ('болып', 'VERB'),
  ('жүрмесін', 'AUX'),
  ('?', 'PUNCT'),
  ('!', 'PUNCT'),
  ('Құрал', 'NOUN'),
  ('өзгерді', 'VERB'),
  ('.', 'PUNCT')],
 [('Соңғы', 'ADJ'),
  ('бес', 'NUM'),
  ('жылда', 'NOUN'),
  ('сот', 'NOUN'),
  ('орындары', 'NOUN'),
  ('прокурорлардың', 'NOUN'),
  ('талабымен', 'NOUN'),
  ('мыңдаған', 'NUM'),
  ('интернет', 'NOUN'),
  ('сайттарының', 'NOUN'),
  ('материалдарын', 'NOUN'),
  ('заңсыз', 'ADJ'),
  ('деп', 'VERB'),
  ('танып', 'VERB'),
  (',', 'PUNCT'),
  ('оларды', 'PRON'),
  ('Қазақстан', 'PROPN'),
  ('аумағында', 'NOUN'),
  ('таратуға', 'VERB'),
  ('тыйым', 'NOUN'),
  ('салған', 'VERB'),
  ('.', 'PUNCT')],
 [('Сонымен', 'PRON'),
  ('қатар', 'ADP'),
  (',', 'PUN