In [18]:
import pandas as pd
# Load the dataset the classifier is fitted on; the "Text" and "language"
# columns of this frame are the ones read by the training cell.
df = pd.read_parquet("segregated_dataset.parquet")

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# --- Training data -----------------------------------------------------------
# The whole of `df` is used for fitting; both columns are coerced to str first.
X_train = df["Text"].astype(str)
y_train = df["language"].astype(str)

# Bag of character n-grams (lengths 1 through 4). min_df=1 applies no
# document-frequency pruning.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 4), min_df=1)
X_train_vec = vectorizer.fit_transform(X_train)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB().fit(X_train_vec, y_train)

# --- Evaluation --------------------------------------------------------------
# NOTE(review): whether cleaned_dataset.parquet shares rows with
# segregated_dataset.parquet cannot be told from this notebook. If it does,
# the report below is optimistic. TODO confirm.
cleaned_dataset = pd.read_parquet("cleaned_dataset.parquet")
X_test = cleaned_dataset["Text"].astype(str)
y_test = cleaned_dataset["language"].astype(str)

# Reuse the fitted vocabulary: transform only, never re-fit on evaluation text.
X_test_vec = vectorizer.transform(X_test)
pred = model.predict(X_test_vec)
print(classification_report(y_test, pred))

# --- Persistence -------------------------------------------------------------
# Both artifacts are needed at inference time: the model and the vectorizer
# that defines its feature space.
joblib.dump(model, "lang_nb_model.pkl")
joblib.dump(vectorizer, "lang_vectorizer.pkl")

              precision    recall  f1-score   support

      Arabic       0.97      1.00      0.98       998
     Chinese       1.00      1.00      1.00       994
       Dutch       0.98      0.98      0.98       999
     English       0.88      1.00      0.93      1000
    Estonian       1.00      0.96      0.98       999
      French       0.97      0.99      0.98      1000
       Hindi       1.00      1.00      1.00       984
  Indonesian       1.00      0.98      0.99      1000
    Japanese       1.00      1.00      1.00       992
      Korean       1.00      1.00      1.00       990
       Latin       0.99      0.93      0.96      1000
     Persian       1.00      1.00      1.00       998
      Pushto       1.00      0.99      0.99       963
    Romanian       1.00      0.99      0.99      1000
     Russian       1.00      1.00      1.00       996
     Spanish       1.00      0.98      0.99      1000
     Swedish       1.00      1.00      1.00      1000
       Tamil       1.00    

['lang_vectorizer.pkl']

In [21]:
# Rebind `model` and `vectorizer` to the copies persisted on disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts from a trusted source.
model = joblib.load("lang_nb_model.pkl")
vectorizer = joblib.load("lang_vectorizer.pkl")

In [None]:
def smooth_predictions(results, strong_threshold=0.6, weak_threshold=0.05):
    """Resolve low-confidence per-token language guesses using adjacent tokens.

    Parameters
    ----------
    results : sequence of (token, lang_probs)
        ``lang_probs`` is a list of ``(language, probability)`` pairs with the
        most probable candidate first. It may be empty when no candidate was
        kept for the token.
    strong_threshold : float, default 0.6
        A token whose top candidate has at least this probability keeps that
        candidate and is never overridden by its neighbours.
    weak_threshold : float, default 0.05
        Minimum probability a neighbour's language must have among the token's
        own candidates for the token to adopt it.

    Returns
    -------
    list of (token, language)
        One pair per input token, in input order. ``language`` is ``None``
        for tokens that had no candidates at all.
    """
    # First pass: the unsmoothed top guess for every token (None if there is none).
    raw_preds = [
        lang_probs[0][0] if lang_probs else None
        for _, lang_probs in results
    ]

    # Second pass: keep confident guesses, otherwise try to borrow a neighbour's.
    final_preds = []
    last_index = len(results) - 1

    for i, (_, lang_probs) in enumerate(results):
        # No candidates means nothing to compare against a neighbour, so
        # report None instead of indexing into an empty list.
        if not lang_probs:
            final_preds.append(None)
            continue

        top_lang, top_prob = lang_probs[0]
        if top_prob >= strong_threshold:
            final_preds.append(top_lang)
            continue

        # The previous neighbour is consulted before the next one.
        neighbor_langs = []
        if i > 0:
            neighbor_langs.append(raw_preds[i - 1])
        if i < last_index:
            neighbor_langs.append(raw_preds[i + 1])

        resolved = None
        for nlang in neighbor_langs:
            if nlang is None:
                continue
            for lang, prob in lang_probs:
                if lang == nlang and prob >= weak_threshold:
                    resolved = lang
                    break
            if resolved is not None:
                break

        # No neighbour agreed with a plausible candidate: keep the token's own top guess.
        final_preds.append(resolved if resolved is not None else top_lang)

    return [(token, pred) for (token, _), pred in zip(results, final_preds)]


In [27]:
def detect_words(text, threshold=0.10):
    """Classify each whitespace-separated token of ``text`` by language.

    Uses the notebook-level ``vectorizer`` and ``model``.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace with ``str.split()``.
    threshold : float, default 0.10
        Languages whose predicted probability for a token is below this value
        are dropped from that token's candidate list.

    Returns
    -------
    list of (token, candidates)
        ``candidates`` is a list of ``(language, probability)`` pairs sorted by
        descending probability. It is empty when no language reaches
        ``threshold``. An empty or whitespace-only ``text`` yields ``[]``.
    """
    words = text.split()

    # scikit-learn estimators reject zero-sample input, so return early
    # instead of calling predict_proba on an empty matrix.
    if not words:
        return []

    X = vectorizer.transform(words)
    probs = model.predict_proba(X)
    classes = model.classes_

    results = []
    for word, row in zip(words, probs):
        word_probs = [
            (lang, float(p))
            for lang, p in zip(classes, row)
            if p >= threshold
        ]
        word_probs.sort(key=lambda pair: pair[1], reverse=True)
        results.append((word, word_probs))

    return results

In [23]:
def detect_with_smoothing(text):
    """Run ``detect_words`` on ``text`` and return the result of passing its
    output through ``smooth_predictions`` with that function's default thresholds.
    """
    return smooth_predictions(detect_words(text))

In [None]:
# Demo: print the smoothed language label for each token of `text`.
# Put the sentence to analyse in `text`.
text = ""

# Use the smoothing pipeline so each item is a (token, language) pair, not the
# raw candidate list that detect_words returns. Blank input has no tokens to
# classify, so the model call is skipped.
smoothed = detect_with_smoothing(text) if text.strip() else []

for token, lang in smoothed:
    print(f"{token:10} → {lang}")

In [28]:
import time

def timed_detect_words(text, model, runs=100):
    """Return the average wall-clock time of ``detect_words(text)`` in milliseconds.

    Parameters
    ----------
    text : str
        Text passed unchanged to ``detect_words`` on every run.
    model : object
        Not used by this function; ``detect_words`` reads the notebook-level
        ``model``. The parameter is kept so existing calls keep working.
    runs : int, default 100
        Number of timed calls to average over. Must be at least 1.

    Returns
    -------
    float
        Mean duration of one call, in milliseconds.

    Raises
    ------
    ValueError
        If ``runs`` is less than 1. Otherwise zero would divide by zero and a
        negative count would return a meaningless negative average.
    """
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs!r}")

    start = time.perf_counter()
    for _ in range(runs):
        detect_words(text)
    elapsed_s = time.perf_counter() - start

    return elapsed_s * 1000 / runs

In [29]:
# Benchmark detect_words on a short sentence that mixes scripts, averaging over 200 calls.
text = "ini is a mixed sentence 日本語 test"
avg_time = timed_detect_words(text, model, runs=200)

print(f"Average inference time: {avg_time:.3f} ms")

Average inference time: 10.151 ms
