# MASSIVE Dataset Language Classification
**This notebook demonstrates language classification tasks using the MASSIVE multilingual dataset.**
 
DATASET_LINK :  https://huggingface.co/datasets/qanastek/MASSIVE

Tasks Covered:
  - Load and inspect the dataset
  - Preprocess data from Roman-script locales
  - Train Naive Bayes for language classification
  - Train LDA/QDA for continent classification

In [2]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Step 1: Setup directory and languages
massive_dataset = load_dataset("qanastek/MASSIVE", "en-US", split='test')
print(massive_dataset)
print(massive_dataset[0])

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'qanastek/MASSIVE' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
Using the latest cached version of the dataset since qanastek/MASSIVE couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-US' at C:\Users\singh\.cache\huggingface\datasets\qanastek___massive\en-US\1.0.0\31cdffab94ac97bfe5a394b1e96344c96f0ad847e1d796c7562d8c8b449e22e6 (last modified on Sun Jun 15 19:19:13 2025).


Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 2974
})
{'id': '0', 'locale': 'en-US', 'partition': 'test', 'scenario': 9, 'intent': 55, 'utt': 'wake me up at five am this week', 'annot_utt': 'wake me up at [time : five am] [date : this week]', 'tokens': ['wake', 'me', 'up', 'at', 'five', 'am', 'this', 'week'], 'ner_tags': [0, 0, 0, 0, 60, 16, 7, 37], 'worker_id': '1', 'slot_method': {'slot': [], 'method': []}, 'judgments': {'worker_id': [], 'intent_score': [], 'slots_score': [], 'grammar_score': [], 'spelling_score': [], 'language_identification': []}}


In [2]:
# Step 1: Setup directory and languages
massive_dataset = load_dataset("qanastek/MASSIVE", "en-US", split='test', trust_remote_code=True)
print(massive_dataset)
print(massive_dataset[0])

Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 2974
})
{'id': '0', 'locale': 'en-US', 'partition': 'test', 'scenario': 9, 'intent': 55, 'utt': 'wake me up at five am this week', 'annot_utt': 'wake me up at [time : five am] [date : this week]', 'tokens': ['wake', 'me', 'up', 'at', 'five', 'am', 'this', 'week'], 'ner_tags': [0, 0, 0, 0, 60, 16, 7, 37], 'worker_id': '1', 'slot_method': {'slot': [], 'method': []}, 'judgments': {'worker_id': [], 'intent_score': [], 'slots_score': [], 'grammar_score': [], 'spelling_score': [], 'language_identification': []}}


In [3]:
languages = [
    'af-ZA', 'da-DK', 'de-DE', 'en-US', 'es-ES', 'fr-FR', 'fi-FI', 'hu-HU', 'is-IS', 'it-IT',
    'jv-ID', 'lv-LV', 'ms-MY', 'nb-NO', 'nl-NL', 'pl-PL', 'pt-PT', 'ro-RO', 'ru-RU', 'sl-SL',
    'sv-SE', 'sq-AL', 'sw-KE', 'tl-PH', 'tr-TR', 'vi-VN', 'cy-GB'
]

In [4]:
continent_lookup = {
    'ZA': 'Africa', 'KE': 'Africa', 'AL': 'Europe', 'GB': 'Europe', 'DK': 'Europe', 'DE': 'Europe',
    'ES': 'Europe', 'FR': 'Europe', 'FI': 'Europe', 'HU': 'Europe', 'IS': 'Europe', 'IT': 'Europe',
    'ID': 'Asia', 'LV': 'Europe', 'MY': 'Asia', 'NO': 'Europe', 'NL': 'Europe', 'PL': 'Europe',
    'PT': 'Europe', 'RO': 'Europe', 'RU': 'Europe', 'SL': 'Europe', 'SE': 'Europe', 'PH': 'Asia',
    'TR': 'Asia', 'VN': 'Asia', 'US': 'North America'
}

In [5]:
# Step 2: Load all splits

def load_massive_split(langs, split):
    all_data = []
    for lang in langs:
        ds = load_dataset("qanastek/MASSIVE", lang, split=split, trust_remote_code=True)
        df = pd.DataFrame(ds)
        df = df[['locale', 'utt']].copy()
        df['split'] = split
        all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

train_df = load_massive_split(languages, 'train')
val_df = load_massive_split(languages, 'validation')
test_df = load_massive_split(languages, 'test')

In [6]:
# Step 3: Train language classifier
X_train = train_df['utt']
y_train = train_df['locale']

X_val = val_df['utt']
y_val = val_df['locale']

X_test = test_df['utt']
y_test = test_df['locale']

In [7]:
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipeline.fit(X_train, y_train)

# Step 4: Evaluate language model
val_preds = pipeline.predict(X_val)
test_preds = pipeline.predict(X_test)

print("\n--- Naive Bayes Language Classification ---")
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Validation Report:")
print(classification_report(y_val, val_preds))
print("Test Report:")
print(classification_report(y_test, test_preds))


--- Naive Bayes Language Classification ---
Validation Accuracy: 0.9842050609389518
Test Accuracy: 0.98399711076241
Validation Report:
              precision    recall  f1-score   support

       af-ZA       0.91      0.98      0.94      2033
       cy-GB       1.00      0.99      0.99      2033
       da-DK       0.94      0.96      0.95      2033
       de-DE       1.00      0.98      0.99      2033
       en-US       0.95      0.99      0.97      2033
       es-ES       0.99      0.98      0.98      2033
       fi-FI       1.00      0.98      0.99      2033
       fr-FR       0.99      0.99      0.99      2033
       hu-HU       1.00      0.98      0.99      2033
       is-IS       1.00      0.99      0.99      2033
       it-IT       0.99      0.99      0.99      2033
       jv-ID       0.99      0.98      0.98      2033
       lv-LV       1.00      0.99      0.99      2033
       ms-MY       0.98      0.99      0.99      2033
       nb-NO       0.96      0.94      0.95      2033

In [8]:
# Step 5: Add continent labels
def extract_country(locale):
    return locale.split('-')[1]

def map_continent(locale):
    country = extract_country(locale)
    return continent_lookup.get(country, 'Unknown')

train_df['continent'] = train_df['locale'].apply(map_continent)
val_df['continent'] = val_df['locale'].apply(map_continent)
test_df['continent'] = test_df['locale'].apply(map_continent)

In [9]:
# Step 6: Train LDA/QDA for continent classification
X_train = train_df['utt']
y_train = train_df['continent']
X_val = val_df['utt']
y_val = val_df['continent']
X_test = test_df['utt']
y_test = test_df['continent']

# Vectorize and reduce
vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_val_vec = vec.transform(X_val)
X_test_vec = vec.transform(X_test)

svd = TruncatedSVD(n_components=100)
X_train_red = svd.fit_transform(X_train_vec)
X_val_red = svd.transform(X_val_vec)
X_test_red = svd.transform(X_test_vec)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train_red, y_train)
val_lda = lda.predict(X_val_red)
test_lda = lda.predict(X_test_red)

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_red, y_train)
val_qda = qda.predict(X_val_red)
test_qda = qda.predict(X_test_red)

print("\n--- LDA Continent Classification ---")
print("Validation Accuracy:", accuracy_score(y_val, val_lda))
print("Test Accuracy:", accuracy_score(y_test, test_lda))
print(classification_report(y_val, val_lda))

print("\n--- QDA Continent Classification ---")
print("Validation Accuracy:", accuracy_score(y_val, val_qda))
print("Test Accuracy:", accuracy_score(y_test, test_qda))
print(classification_report(y_test, test_qda))


--- LDA Continent Classification ---
Validation Accuracy: 0.8975606201380919
Test Accuracy: 0.8945428279658273
               precision    recall  f1-score   support

       Africa       0.90      0.71      0.79      4066
         Asia       0.99      0.67      0.80     10165
       Europe       0.88      0.99      0.93     38627
North America       0.87      0.73      0.80      2033

     accuracy                           0.90     54891
    macro avg       0.91      0.77      0.83     54891
 weighted avg       0.90      0.90      0.89     54891


--- QDA Continent Classification ---
Validation Accuracy: 0.7955402525004099
Test Accuracy: 0.7911405016314229
               precision    recall  f1-score   support

       Africa       0.62      0.94      0.75      5948
         Asia       0.80      0.93      0.86     14870
       Europe       0.99      0.73      0.84     56506
North America       0.24      0.99      0.38      2974

     accuracy                           0.79     80298
 

In [11]:
# Step 7: Save the models and vectorizer
import joblib
joblib.dump(pipeline, 'language_classifier.pkl')
joblib.dump(vec, 'tfidf_vectorizer.pkl')
joblib.dump(svd, 'svd_reducer.pkl')
joblib.dump(lda, 'lda_model.pkl')
joblib.dump(qda, 'qda_model.pkl')
# Step 8: Save the dataframes
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)
# Save the continent mappings
joblib.dump(continent_lookup, 'continent_lookup.pkl')
# Save the language mappings
language_lookup = {lang: lang.split('-')[0] for lang in languages}
joblib.dump(language_lookup, 'language_lookup.pkl')
# Save the language and continent mappings
language_continent_lookup = {lang: map_continent(lang) for lang in languages}
joblib.dump(language_continent_lookup, 'language_continent_lookup.pkl')
# Step 9: Print completion message
print("Models and data saved successfully. You can now use the saved models for predictions.")
# Step 10: Load and test saved models
def load_and_test_models():
    # Load the models and vectorizer
    language_classifier = joblib.load('language_classifier.pkl')
    tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
    svd_reducer = joblib.load('svd_reducer.pkl')
    lda_model = joblib.load('lda_model.pkl')
    qda_model = joblib.load('qda_model.pkl')

    # Load the dataframes
    train_data = pd.read_csv('train_df.csv')
    val_data = pd.read_csv('val_data.csv')
    test_data = pd.read_csv('test_data.csv')

    # Test the language classifier
    sample_texts = ["Hello, how are you?", "Bonjour, comment ça va?", "Hola, ¿cómo estás?"]
    sample_preds = language_classifier.predict(sample_texts)
    print("Sample Predictions for Language Classifier:", sample_preds)
    # Test the continent classifier
    sample_texts_vec = tfidf_vectorizer.transform(sample_texts)
    sample_texts_red = svd_reducer.transform(sample_texts_vec)
    lda_preds = lda_model.predict(sample_texts_red)
    qda_preds = qda_model.predict(sample_texts_red)
    print("Sample Predictions for LDA Continent Classifier:", lda_preds)
    print("Sample Predictions for QDA Continent Classifier:", qda_preds)
    return sample_preds, lda_preds, qda_preds
# Load and test the saved models
# load_and_test_models()
# Uncomment the line below to test the saved models
# load_and_test_models()
# Step 11: Finalize the script
if __name__ == "__main__":
    print("Script executed successfully. All models and data are saved.")
   

Models and data saved successfully. You can now use the saved models for predictions.
Script executed successfully. All models and data are saved.


In [None]:
 # Uncomment the line below to test the saved models
    # load_and_test_models()
# End of the script 
