In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import numpy as np

In [11]:
# Load the training corpus; the training cell reads its "Text" and "language" columns.
df = pd.read_parquet("segregated_dataset.parquet")

In [12]:
# Inputs are the raw text, targets are the language labels; both are coerced to str.
X_train, y_train = df["Text"].astype(str), df["language"].astype(str)

# Bag of character n-grams (lengths 1 through 4) with raw counts as features.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 4), min_df=1)
X_train_vec = vectorizer.fit_transform(X_train)

# Train model (fit returns the fitted estimator itself)
model = MultinomialNB().fit(X_train_vec, y_train)

# Optional evaluation
# NOTE(review): training and evaluation read two different parquet files; confirm
# they do not share rows, otherwise the report below is optimistic.
cleaned_dataset = pd.read_parquet("cleaned_dataset.parquet")
X_test, y_test = (
    cleaned_dataset["Text"].astype(str),
    cleaned_dataset["language"].astype(str),
)

# Reuse the vocabulary learned during training; never refit on evaluation data.
X_test_vec = vectorizer.transform(X_test)
pred = model.predict(X_test_vec)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

      Arabic       0.97      1.00      0.98       998
     Chinese       1.00      1.00      1.00       994
       Dutch       0.98      0.98      0.98       999
     English       0.88      1.00      0.93      1000
    Estonian       1.00      0.96      0.98       999
      French       0.97      0.99      0.98      1000
       Hindi       1.00      1.00      1.00       984
  Indonesian       1.00      0.98      0.99      1000
    Japanese       1.00      1.00      1.00       992
      Korean       1.00      1.00      1.00       990
       Latin       0.99      0.93      0.96      1000
     Persian       1.00      1.00      1.00       998
      Pushto       1.00      0.99      0.99       963
    Romanian       1.00      0.99      0.99      1000
     Russian       1.00      1.00      1.00       996
     Spanish       1.00      0.98      0.99      1000
     Swedish       1.00      1.00      1.00      1000
       Tamil       1.00    

In [13]:
# For every evaluation sample, keep the probability assigned to its top class.
probs_test = model.predict_proba(X_test_vec)
max_probs = np.max(probs_test, axis=1)

# Summary statistics of the top-class probability across the evaluation set.
print("Mean max probability:", np.mean(max_probs))
print("Min max probability:", np.min(max_probs))
print("25th percentile:", np.percentile(max_probs, 25))

Mean max probability: 0.9996482719015121
Min max probability: 0.5180897675181394
25th percentile: 1.0


In [14]:
# Persist the fitted artifacts (relative paths) so they can be reloaded for inference.
import joblib  # NOTE(review): already imported in the first cell; this repeat is redundant
joblib.dump(model, "lang_nb_model.pkl")
joblib.dump(vectorizer, "lang_vectorizer.pkl")
# The label array is stored on its own as well; it is the same object as model.classes_.
joblib.dump(model.classes_, "lang_classes.pkl")

['lang_classes.pkl']

In [15]:
# Reload the persisted artifacts, rebinding the in-memory names to the saved copies.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load("lang_nb_model.pkl")
vectorizer = joblib.load("lang_vectorizer.pkl")
classes = joblib.load("lang_classes.pkl")  # label array zipped with probabilities in detect_words

def smooth_predictions(results, strong_threshold=0.6, weak_threshold=0.05):
    """Smooth per-token language guesses using each token's immediate neighbours.

    A token keeps its own top language when that guess is "Unknown" or when its
    probability is at least ``strong_threshold``. Otherwise the token adopts the
    top language of its previous neighbour (checked first) or next neighbour,
    provided that language appears among the token's own candidates with
    probability at least ``weak_threshold``. If neither neighbour qualifies,
    the token's own top language is kept.

    Args:
        results: Iterable of ``(token, lang_probs)`` pairs, where ``lang_probs``
            is a list of ``(language, probability)`` pairs ordered with the best
            candidate first. ``lang_probs`` may be empty.
        strong_threshold: Minimum top probability at which a token's own guess
            is accepted without consulting neighbours.
        weak_threshold: Minimum probability a neighbour's language must have
            among the token's own candidates to be adopted.

    Returns:
        List of ``(token, language)`` pairs in input order. Tokens with no
        candidates at all are labelled ``"Unknown"``.
    """
    # Materialise once: the input is traversed several times and indexed by
    # position, so this also lets callers pass a generator.
    results = list(results)

    # Top guess per token; None marks tokens with no candidates so they are
    # never offered to their neighbours as a smoothing target.
    raw_preds = [
        lang_probs[0][0] if lang_probs else None
        for _, lang_probs in results
    ]

    last_index = len(results) - 1
    final_preds = []

    for i, (_, lang_probs) in enumerate(results):
        # A token without candidates has nothing to smooth towards. Previously
        # this case raised IndexError on the lookup below.
        if not lang_probs:
            final_preds.append("Unknown")
            continue

        top_lang = lang_probs[0][0]
        top_prob = lang_probs[0][1]

        # Preserve Unknown, and accept confident guesses as they are.
        if top_lang == "Unknown" or top_prob >= strong_threshold:
            final_preds.append(top_lang)
            continue

        # Previous neighbour first, then next; skip out-of-range positions and
        # neighbours that had no candidates.
        neighbor_langs = [
            raw_preds[j]
            for j in (i - 1, i + 1)
            if 0 <= j <= last_index and raw_preds[j] is not None
        ]

        # First neighbour language that this token itself considers plausible.
        resolved = next(
            (
                lang
                for nlang in neighbor_langs
                for lang, prob in lang_probs
                if lang == nlang and prob >= weak_threshold
            ),
            None,
        )

        final_preds.append(resolved if resolved else top_lang)

    return [(token, label) for (token, _), label in zip(results, final_preds)]

In [16]:
def detect_words(text, threshold=0.10, unknown_threshold=0.25):
    """Classify every whitespace-separated token of ``text`` by language.

    Relies on the notebook-level ``vectorizer``, ``model`` and ``classes``.

    Args:
        text: Input string; it is split on whitespace into tokens.
        threshold: Candidates with a probability below this value are dropped.
        unknown_threshold: If the best surviving candidate is below this value
            (or no candidate survives), the token is reported as "Unknown".

    Returns:
        List of ``(token, candidates)`` pairs in input order, where
        ``candidates`` is a list of ``(language, probability)`` pairs sorted by
        descending probability, or ``[("Unknown", 1.0)]`` for undecided tokens.
        An input with no tokens yields an empty list.
    """
    # str.split() with no arguments already trims and discards empty pieces.
    tokens = text.split()
    if len(tokens) == 0:
        return []

    # One probability row per token, columns aligned with `classes`.
    proba_rows = model.predict_proba(vectorizer.transform(tokens))

    detections = []
    for token, row in zip(tokens, proba_rows):
        survivors = [
            (lang, float(p))
            for lang, p in zip(classes, row)
            if p >= threshold
        ]
        ranked = sorted(survivors, key=lambda pair: pair[1], reverse=True)

        if ranked and ranked[0][1] >= unknown_threshold:
            detections.append((token, ranked))
        else:
            detections.append((token, [("Unknown", 1.0)]))

    return detections

In [17]:
def detect_with_smoothing(text):
    """Run per-word detection on ``text`` and smooth the result with default thresholds.

    Returns the list of ``(token, language)`` pairs produced by
    ``smooth_predictions`` from the output of ``detect_words``.
    """
    return smooth_predictions(detect_words(text))

In [18]:
# Smoke test: run the per-word detector (without smoothing) on a short sample
# and print each token with its ranked (language, probability) candidates.
text = "saya naive reoreorie"
results = detect_words(text)

for token, probs in results:
    print(token, "→", probs)

saya → [(np.str_('Turkish'), 0.9227891641142754)]
naive → [(np.str_('French'), 0.797291885774436)]
reoreorie → [(np.str_('Romanian'), 0.9985132915646493)]
