In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from commons import (
    DATASET_CLEAN_LOCATION,
    DATASET_CLEAN_UNDERSAMPLING_LOCATION,
    MODEL_FOLDER,
    VECTORIZERS_FOLDER,
    Datasets,
    vectorize_and_split_dataset,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Train Model

Now, we load the datasets that were exported during the data cleaning phase.


In [2]:
# Load the two datasets exported during the data cleaning phase:
# the clean dataset and its undersampled variant.
df, df_undersampling = (
    pd.read_csv(csv_location)
    for csv_location in (DATASET_CLEAN_LOCATION, DATASET_CLEAN_UNDERSAMPLING_LOCATION)
)

I use a utility function from the `commons.py` file, created to avoid duplicating code across notebooks. This function vectorizes the input text using the provided vectorizer and then splits the dataset into training and test sets.

Specifically:

- The function takes a DataFrame containing text and language labels, and a vectorizer (e.g., `CountVectorizer`).
- It transforms the text data into feature vectors with the vectorizer.
- The labels are encoded numerically.
- The dataset is split into training and test subsets with stratification to maintain class distribution.
- The function returns a `Datasets` object that holds the training and test feature matrices, labels, and the original text samples for both sets.

The `Datasets` class is a simple container to keep all these components organized and accessible.



In [3]:
# Every (dataset, vectorization) pair gets its own vectorizer instance; each one is
# pickled at the end of the notebook so it can be reused at inference time.
# The order of the vectorize_and_split_dataset calls is kept fixed on purpose
# (bow, bow-undersampled, tfidf, tfidf-undersampled).

# Bag of Words (raw term counts)
vectorizer_bow = CountVectorizer()
df_bow = vectorize_and_split_dataset(df, vectorizer_bow)

vectorizer_bow_und = CountVectorizer()
df_bow_undersampling = vectorize_and_split_dataset(df_undersampling, vectorizer_bow_und)

# TF-IDF
vectorizer_tfidf = TfidfVectorizer()
df_tfidf = vectorize_and_split_dataset(df, vectorizer_tfidf)

vectorizer_tfidf_und = TfidfVectorizer()
df_tfidf_undersampling = vectorize_and_split_dataset(df_undersampling, vectorizer_tfidf_und)

I have chosen **Multinomial Naive Bayes (MNB)** and **Logistic Regression (LR)** as the classification algorithms, based on insights from the following research articles:

- [Language Identification Using Multinomial Naive Bayes Technique](https://www.researchgate.net/publication/377067809_Language_Identification_Using_Multinomial_Naive_Bayes_Technique)
- [Language Identification Using Combination of Machine Learning Algorithms and Vectorization Techniques](https://www.researchgate.net/publication/362096783_Language_Identification_Using_Combination_of_Machine_Learning_Algorithms_and_Vectorization_Techniques)

**Reason for choosing these models:**

- **Multinomial Naive Bayes:**  
  This model is widely used in text classification tasks due to its simplicity, efficiency, and strong performance, especially when features represent term frequencies. The first article highlights how MNB effectively captures the distribution of words in different languages, making it a natural fit for language identification.

- **Logistic Regression:**  
  Logistic Regression is a robust, interpretable linear model that often performs well on binary and multiclass classification problems. According to the second article, combining logistic regression with appropriate vectorization techniques can improve classification accuracy.


In [4]:
def naive_bayes(datasets: Datasets) -> MultinomialNB:
    """
    Fit a Multinomial Naive Bayes classifier and report how it does on the test split.

    The classifier is fitted on ``datasets.X`` / ``datasets.y`` and then used to
    predict ``datasets.X_t``. The test accuracy is printed, followed by one entry
    per misclassified test sample showing its original text, its true label and
    the label the model predicted.

    Args:
        datasets (Datasets): Container with the attributes read by this function:
            - X: training features
            - y: training labels
            - X_t: test features
            - y_t: test labels
            - text_t: original text of the test samples

    Returns:
        MultinomialNB: The fitted classifier.

    """
    # fit() returns the estimator itself, so creation and training can be chained.
    classifier = MultinomialNB().fit(datasets.X, datasets.y)
    predictions = classifier.predict(datasets.X_t)
    test_accuracy = accuracy_score(datasets.y_t, predictions)
    print("Naive Bayes:", test_accuracy)

    # Boolean mask selecting the test samples whose prediction differs from the truth.
    is_misclassified = datasets.y_t != predictions
    misclassified = pd.DataFrame({
        "Text": datasets.text_t[is_misclassified],
        "True Label": datasets.y_t[is_misclassified],
        "Predicted Label": predictions[is_misclassified],
    })

    separator = "-" * 50
    for text, true_label, predicted_label in zip(
        misclassified["Text"],
        misclassified["True Label"],
        misclassified["Predicted Label"],
    ):
        print(f"📝 Text: {text}")
        print(f"✅ True Label: {true_label}")
        print(f"❌ Predicted: {predicted_label}")
        print(separator)

    return classifier

In [5]:
def logistic_regression(datasets: Datasets) -> GridSearchCV:
    """
    Tune, fit and evaluate a logistic regression classifier.

    A grid search with 5-fold cross-validation, scored by accuracy, explores the
    regularization strength ``C`` of an L2-penalized logistic regression
    (``lbfgs`` solver, ``class_weight="balanced"``, ``max_iter=1000``). The search
    is fitted on ``datasets.X`` / ``datasets.y``; the best parameters and the best
    cross-validation accuracy are printed. The fitted search object is then used
    to predict ``datasets.X_t``, and the test accuracy is printed together with
    one entry per misclassified test sample (text, true label, predicted label).

    Args:
        datasets (Datasets): Container with the attributes read by this function:
            - X: training features
            - y: training labels
            - X_t: test features
            - y_t: test labels
            - text_t: original text of the test samples

    Returns:
        GridSearchCV: The fitted search object (it exposes the best estimator).

    """
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000, class_weight="balanced"),
        param_grid={
            # Only C is actually searched; penalty and solver are pinned to one value.
            "C": [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
            "penalty": ["l2"],
            "solver": ["lbfgs"],
        },
        cv=5,
        scoring="accuracy",
    )
    search.fit(datasets.X, datasets.y)

    print("Best Params:", search.best_params_)
    print("Best CV Accuracy:", search.best_score_)

    predictions = search.predict(datasets.X_t)
    test_accuracy = accuracy_score(datasets.y_t, predictions)
    print("Logistic Regression:", test_accuracy)

    # Boolean mask selecting the test samples whose prediction differs from the truth.
    is_misclassified = datasets.y_t != predictions
    misclassified = pd.DataFrame({
        "Text": datasets.text_t[is_misclassified],
        "True Label": datasets.y_t[is_misclassified],
        "Predicted Label": predictions[is_misclassified],
    })

    separator = "-" * 50
    for text, true_label, predicted_label in zip(
        misclassified["Text"],
        misclassified["True Label"],
        misclassified["Predicted Label"],
    ):
        print(f"📝 Text: {text}")
        print(f"✅ True Label: {true_label}")
        print(f"❌ Predicted: {predicted_label}")
        print(separator)

    return search

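We first try Multinomial Naive Bayes with Bag-of-Words features. The model trained on the undersampled dataset reaches a higher test accuracy than the one trained on the dataset without undersampling (note that each model is evaluated on the test split of its own dataset).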


In [6]:
# Naive Bayes on Bag-of-Words features of the clean dataset (no undersampling).
nb_bow = naive_bayes(datasets=df_bow)

Naive Bayes: 0.965166908563135
📝 Text: ഇഗലഷ വകകപഡയയൽ പലപപഴ ഭരപകഷ ആളകളട അഭപരയ സതയ എനന രപതതൽ അടചചൽപപകകൻ സദധയതയണടവറണട ഉദഹരണതതന കശമർ പരശന ഇതൽ പകസതൻ വശജരകകള കടതൽ വകകപഡയ ഉപയകതകകൾ ഇനതയയൽ നനന ഉളളവരയതനൽ ലഖനതതന ഇനതയ അനകല ചയവ വരൻ സദധയതയണട
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: академик ран е д свердлов кроме случая с шекманом также обращает внимание на инцидент с японскими учёными из центра биологии развития института физикохимических исследований riken опубликовавшими в году в журнале две статьи с изложением итогов опытов над зрелыми клетками мышей подвергавшихся различному роду стрессовых воздействий включая погружение в кислоту метод получивший известность под названием получение плюрипотентности вызванной стимулом англ
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: wikimedia commons общего централизованного хранилища мультимедийных файлов включаемых в страницы проектов фонда викимедиа
✅ True Label: 0

In [7]:
# Naive Bayes on Bag-of-Words features of the undersampled dataset.
nb_bow_und = naive_bayes(datasets=df_bow_undersampling)

Naive Bayes: 0.9890510948905109
📝 Text: progettando
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: à terme il parvint à battre le e meilleur joueur des étatsunis
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: con reparar algo
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------


I display the per-class word likelihoods, P(word | class), learned by the Naive Bayes model to highlight an important issue: some words (such as "in") are frequent in more than one language, which can confuse the classifier and affect its accuracy.

In [8]:
# Recover P(word | class) from the log-likelihoods stored by the fitted model.
# Row 0 is reported as "not it" and row 1 as "it"
# (assumes the model's classes are ordered [0, 1] — TODO confirm via nb_bow.classes_).
feature_names = vectorizer_bow.get_feature_names_out()
log_prob = nb_bow.feature_log_prob_
prob_not_it, prob_it = (np.exp(log_prob[class_row]) for class_row in (0, 1))

df_prob_nb = pd.DataFrame(
    {
        "word": feature_names,
        "P(word|not it)": prob_not_it,
        "P(word|it)": prob_it,
    }
)

# For each class, list the ten words with the highest likelihood.
for heading, likelihood_column in (
    ("Most Important words for not italian class:", "P(word|not it)"),
    ("Most Important words for  italian class:", "P(word|it)"),
):
    print(heading)
    print(df_prob_nb.sort_values(likelihood_column, ascending=False).head(10))

Most Important words for not italian class:
            word  P(word|not it)  P(word|it)
5927          de        0.015116    0.000017
24683        the        0.007027    0.000087
8030          en        0.006988    0.000017
20201        que        0.005604    0.000017
14232         la        0.005451    0.003442
17347         of        0.004808    0.000070
12457         in        0.004039    0.003984
7329          du        0.003625    0.000017
1092         and        0.003609    0.000052
27258  wikipedia        0.003380    0.002009
Most Important words for  italian class:
      word  P(word|not it)  P(word|it)
6528    di        0.000005    0.009417
3962   che        0.000005    0.004630
12457   in        0.004039    0.003984
25567   un        0.002584    0.003494
14232   la        0.005451    0.003442
12262   il        0.000496    0.003302
16925  non        0.000109    0.002656
18431  per        0.000065    0.002568
25568  una        0.000845    0.002166
14418   le        0.002088    

We also try Logistic Regression with TF-IDF features. Here the model trained without undersampling achieves the higher accuracy. However, this comparison is based solely on accuracy, so other metrics should be carefully analyzed to get a more complete understanding of model performance.

In [9]:
# Logistic Regression (grid-searched) on TF-IDF features of the clean dataset (no undersampling).
lr_tfidf = logistic_regression(datasets=df_tfidf)

Best Params: {'C': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV Accuracy: 0.9941923774954627
Logistic Regression: 0.9951620706337687
📝 Text: chiedo e poi dico alla fine
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: a dopo
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: malheureusement je dois dire non
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: non vous avez fait un travail incroyable
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: posso offrirti un bicchiere d acqua
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: dita incrociate
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: dai
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: è stato un piacere incontrarti dopo
✅ True Label: 1
❌ Pred

In [10]:
# Logistic Regression (grid-searched) on TF-IDF features of the undersampled dataset.
lr_tfidf_und = logistic_regression(datasets=df_tfidf_undersampling)

Best Params: {'C': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV Accuracy: 0.9734657115328222
Logistic Regression: 0.9890510948905109
📝 Text: y escuche la pronunciación una o dos veces
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------
📝 Text: progettando
✅ True Label: 1
❌ Predicted: 0
--------------------------------------------------
📝 Text: à terme il parvint à battre le e meilleur joueur des étatsunis
✅ True Label: 0
❌ Predicted: 1
--------------------------------------------------


After evaluating the performance of both Multinomial Naive Bayes (MNB) and Logistic Regression (LR) classifiers with various vectorization methods and sampling strategies, I observed mixed results. While Logistic Regression trained on the original, imbalanced dataset performed better in accuracy, the undersampled version with MNB and Bag-of-Words (BoW) showed promising results. Therefore, for the next phase of the analysis, I decided to keep all four combinations and evaluate them on other metrics. The four trained models are saved below so they can be reused in that phase.



In [11]:
PATH_MODEL_FOLDER = Path(MODEL_FOLDER)
# Create the output folder if it is missing, so a fresh checkout (or a
# Restart & Run All) does not fail with FileNotFoundError when opening the files.
PATH_MODEL_FOLDER.mkdir(parents=True, exist_ok=True)

# File name -> fitted model. The two logistic-regression entries are the fitted
# GridSearchCV objects returned by logistic_regression(), not bare estimators.
models_to_save = {
    "nb_bow.pkl": nb_bow,
    "nb_bow_und.pkl": nb_bow_und,
    "lr_tfidf.pkl": lr_tfidf,
    "lr_tfidf_und.pkl": lr_tfidf_und,
}

for model_filename, fitted_model in models_to_save.items():
    with (PATH_MODEL_FOLDER / model_filename).open("wb") as f:
        pickle.dump(fitted_model, f)

I save the vectorizers so that I can use them later during the inference phase.


In [12]:
PATH_VECTORIZERS_FOLDER = Path(VECTORIZERS_FOLDER)
# Create the output folder if it is missing, so a fresh checkout (or a
# Restart & Run All) does not fail with FileNotFoundError when opening the files.
PATH_VECTORIZERS_FOLDER.mkdir(parents=True, exist_ok=True)

# File name -> vectorizer instance that was passed to vectorize_and_split_dataset.
# Each model must be paired at inference time with the vectorizer of the same suffix.
vectorizers_to_save = {
    "vectorizer_bow.pkl": vectorizer_bow,
    "vectorizer_bow_und.pkl": vectorizer_bow_und,
    "vectorizer_tfidf.pkl": vectorizer_tfidf,
    "vectorizer_tfidf_und.pkl": vectorizer_tfidf_und,
}

for vectorizer_filename, vectorizer in vectorizers_to_save.items():
    with (PATH_VECTORIZERS_FOLDER / vectorizer_filename).open("wb") as f:
        pickle.dump(vectorizer, f)