In [None]:
import pandas as pd
# Dataset that the training cell below fits on (it reads the "Text" and
# "language" columns of this frame).
df = pd.read_parquet("../data/segregated_dataset.parquet")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Fit on the whole `df` frame: no hold-out split is taken from it.
# NOTE(review): `train_test_split` and `confusion_matrix` are imported above
# but never used in this cell.
# NOTE(review): depending on the pandas version, `.astype(str)` may turn
# missing values into literal strings (e.g. "nan") instead of dropping them —
# confirm neither column contains nulls.
X_train = df["Text"].astype(str)
y_train = df["language"].astype(str)

# Bag of character n-grams (lengths 1 through 4) as features.
vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(1, 4),
    min_df=1
)

# Learns the n-gram vocabulary from the training text and encodes it.
X_train_vec = vectorizer.fit_transform(X_train)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Evaluation uses a separate parquet file.
# NOTE(review): if cleaned_dataset shares rows with segregated_dataset the
# report below is optimistic — confirm the two files are disjoint.
cleaned_dataset = pd.read_parquet("../data/cleaned_dataset.parquet")

X_test = cleaned_dataset["Text"].astype(str)
y_test = cleaned_dataset["language"].astype(str)

# transform (not fit_transform): reuse the vocabulary learned from training.
X_test_vec = vectorizer.transform(X_test)

pred = model.predict(X_test_vec)
print(classification_report(y_test, pred))

# Persist both artifacts; the vectorizer is needed to encode text at
# inference time, so the model alone is not enough. Paths are relative to
# the current working directory.
joblib.dump(model, "lang_nb_model.pkl")
joblib.dump(vectorizer, "lang_vectorizer.pkl")

In [None]:
# Reload the artifacts written by the training cell; this rebinds the
# module-level `model` and `vectorizer` that the functions below read.
# joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load("lang_nb_model.pkl")
vectorizer = joblib.load("lang_vectorizer.pkl")

In [None]:
def smooth_predictions(results, strong_threshold=0.6, weak_threshold=0.05):
    """Resolve per-token language predictions using adjacent tokens as context.

    A token whose best candidate has probability >= ``strong_threshold`` keeps
    that language. Otherwise the token adopts the top language of its previous
    neighbour (tried first) or its next neighbour, provided that language also
    appears among the token's own candidates with probability >=
    ``weak_threshold``. If neither neighbour qualifies, the token keeps its own
    best candidate.

    Args:
        results: Sequence of ``(token, lang_probs)`` pairs. ``lang_probs`` is a
            list of ``(language, probability)`` tuples with the best candidate
            first (index 0 is treated as the top prediction); it may be empty.
        strong_threshold: Minimum top probability for a token to be accepted
            without consulting its neighbours.
        weak_threshold: Minimum probability a neighbour's language must have
            among the token's own candidates to override the token's top pick.

    Returns:
        A list of ``(token, language)`` tuples in input order. ``language`` is
        ``None`` for a token that has no candidates at all.
    """
    # First pass: the unsmoothed top language per token (None when the token
    # has no candidates), so neighbours can be consulted in the second pass.
    raw_preds = [
        lang_probs[0][0] if lang_probs else None
        for _, lang_probs in results
    ]

    # Second pass: keep confident predictions, smooth the rest.
    final_preds = []
    last_index = len(results) - 1

    for i, (_, lang_probs) in enumerate(results):
        if not lang_probs:
            # No candidate survived upstream filtering: there is nothing to
            # index into and no candidate a neighbour could confirm.
            final_preds.append(None)
            continue

        top_lang, top_prob = lang_probs[0]
        if top_prob >= strong_threshold:
            final_preds.append(top_lang)
            continue

        # Previous neighbour first, then next; neighbours without a
        # prediction are ignored.
        neighbor_langs = []
        if i > 0:
            neighbor_langs.append(raw_preds[i - 1])
        if i < last_index:
            neighbor_langs.append(raw_preds[i + 1])

        resolved = next(
            (
                nlang
                for nlang in neighbor_langs
                if nlang is not None
                and any(
                    lang == nlang and prob >= weak_threshold
                    for lang, prob in lang_probs
                )
            ),
            None,
        )

        final_preds.append(top_lang if resolved is None else resolved)

    return [(token, lang) for (token, _), lang in zip(results, final_preds)]


In [None]:
def detect_words(text, threshold=0.10):
    """Score every whitespace-separated token of ``text`` against each language.

    Uses the module-level ``vectorizer`` and ``model`` defined earlier in the
    notebook.

    Args:
        text: Input string; it is split on whitespace with ``str.split()``.
        threshold: Languages whose predicted probability for a token is below
            this value are dropped from that token's candidate list.

    Returns:
        A list of ``(word, candidates)`` pairs in input order, where
        ``candidates`` is a list of ``(language, probability)`` tuples sorted by
        probability, highest first. ``candidates`` is empty when no language
        reaches ``threshold``. Returns ``[]`` when ``text`` contains no tokens.
    """
    words = text.split()
    if not words:
        # Empty / whitespace-only input: scikit-learn estimators reject a
        # batch with zero samples, so answer directly instead of raising.
        return []

    probs = model.predict_proba(vectorizer.transform(words))
    classes = model.classes_

    results = []
    for word, word_row in zip(words, probs):
        candidates = [
            (lang, float(p))
            for lang, p in zip(classes, word_row)
            if p >= threshold
        ]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        results.append((word, candidates))

    return results

In [None]:
def detect_with_smoothing(text, threshold=0.10, strong_threshold=0.6, weak_threshold=0.05):
    """Detect a language per token of ``text`` and smooth using neighbours.

    Runs ``detect_words`` and feeds its output to ``smooth_predictions``. The
    keyword defaults mirror the defaults of those two functions, so calling
    this with only ``text`` behaves as before.

    Args:
        text: Input string to tag token by token.
        threshold: Forwarded to ``detect_words``; candidates below it are
            dropped before smoothing. Because of that, a ``weak_threshold``
            lower than ``threshold`` has no additional effect.
        strong_threshold: Forwarded to ``smooth_predictions``.
        weak_threshold: Forwarded to ``smooth_predictions``.

    Returns:
        Whatever ``smooth_predictions`` returns for the detected tokens
        (``(token, language)`` pairs).
    """
    results = detect_words(text, threshold=threshold)
    return smooth_predictions(
        results,
        strong_threshold=strong_threshold,
        weak_threshold=weak_threshold,
    )

In [None]:
# Demo: tag each whitespace-separated token of a sample input that mixes
# Latin-script words with a Japanese token.
text = "i eat にちは sapi"
smoothed = detect_with_smoothing(text)

# One line per token; the token is padded to width 10 for alignment.
for token, lang in smoothed:
    print(f"{token:10} → {lang}")