In [49]:
import os
import numpy as np
import pandas as pd
from scipy.linalg import inv
import matplotlib.pyplot as plt
from datasets import load_dataset, DatasetDict
from collections import Counter
from wordcloud import WordCloud
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load every split of the MASSIVE corpus from the Hugging Face Hub.
# trust_remote_code=True is passed because this dataset ships its own loading script.
MASSIVE_DATASET_ID = "qanastek/MASSIVE"
data = load_dataset(MASSIVE_DATASET_ID, trust_remote_code=True)
print(data)

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 587214
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 103683
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 151674
    })
})


In [28]:
# Columns kept for the rest of the notebook.
des_col = ['locale', 'partition', 'utt', 'tokens']

# Apply the same column projection to each of the three splits.
data1 = DatasetDict({
    split_name: data[split_name].select_columns(des_col)
    for split_name in ('train', 'validation', 'test')
})

print(data1)

DatasetDict({
    train: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 587214
    })
    validation: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 103683
    })
    test: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 151674
    })
})


## Task 1

In [51]:
# The 27 locale codes whose utterances are exported in Task 1.
locales = [
    'af-ZA', 'da-DK', 'de-DE', 'en-US', 'es-ES', 'fr-FR', 'fi-FI',
    'hu-HU', 'is-IS', 'it-IT', 'jv-ID', 'lv-LV', 'ms-MY', 'nb-NO',
    'nl-NL', 'pl-PL', 'pt-PT', 'ro-RO', 'ru-RU', 'sl-SL', 'sv-SE',
    'sq-AL', 'sw-KE', 'tl-PH', 'tr-TR', 'vi-VN', 'cy-GB',
]

# Directory to store output files (created up front; no error if it already exists)
output_dir = "utts_by_locale"
os.makedirs(output_dir, exist_ok=True)

# Function to extract and save utterances for each locale for all dataset partitions
def extr_utt(dataset_dict, locales, output_dir, deaccent):
    """Write one text file per (locale, partition) with one utterance per line.

    Each partition is scanned once and its utterances are bucketed by locale,
    instead of re-filtering the whole partition for every locale.

    Args:
        dataset_dict: Mapping with 'train', 'validation' and 'test' entries; each
            entry must expose parallel 'locale' and 'utt' columns via ``entry[name]``.
        locales: Locale codes to export. A file is written for every code listed,
            even when the partition holds no utterance for it (the file is empty).
        output_dir: Directory receiving the ``{locale}_{partition}.txt`` files;
            created if missing.
        deaccent: When truthy, each utterance is passed through ``unidecode``
            before being written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for partition in ('train', 'validation', 'test'):
        dataset = dataset_dict[partition]

        # Single pass over the partition; dataset order is preserved per locale.
        utts_by_locale = {locale: [] for locale in locales}
        for row_locale, utt in zip(dataset['locale'], dataset['utt']):
            bucket = utts_by_locale.get(row_locale)
            if bucket is not None:
                bucket.append(utt)

        for locale in locales:
            file_path = os.path.join(output_dir, f"{locale}_{partition}.txt")
            with open(file_path, 'w', encoding='utf-8') as file:
                for utt in utts_by_locale[locale]:
                    if deaccent:
                        utt = unidecode(utt)
                    file.write(utt + "\n")       # 1 utterance/line

            print(f"Saved {locale} utterances from {partition} partition to {file_path}")

In [None]:
# Export de-accented utterances for every configured locale and partition.
extr_utt(data1, locales, output_dir, deaccent=True)

## Task 2

In [46]:
# Task 2 reads columns directly (e.g. train_data['utt']), so each split must be a
# plain Dataset. Wrapping a split in a DatasetDict under the key 'train' turns
# train_data['utt'] into a split lookup, which raises KeyError.
task2_columns = ['locale', 'utt', 'tokens']

train_data = data1['train'].select_columns(task2_columns)
val_data = data1['validation'].select_columns(task2_columns)
test_data = data1['test'].select_columns(task2_columns)

In [None]:
# Feature extraction (TF-IDF vectorization)
# The vocabulary and IDF weights are learned from the training utterances only;
# validation and test utterances are projected onto that fitted vocabulary.
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_data['utt'])
X_val, X_test = (
    vectorizer.transform(split['utt']) for split in (val_data, test_data)
)

# Target labels: the locale code of each utterance.
y_train, y_val, y_test = (
    split['locale'] for split in (train_data, val_data, test_data)
)

In [None]:
# Multinomial Naive Bayes Model
nb_model = MultinomialNB()

# Grid search to tune 'alpha' (5-fold cross-validation, scored on accuracy)
alpha_candidates = [0.1, 0.5, 1.0, 1.5, 2.0]
param_grid = {'alpha': alpha_candidates}
grid_search = GridSearchCV(
    estimator=nb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Training model: nb_model is fitted with its default alpha, while the tuned
# model used for evaluation is the one selected by the grid search.
nb_model.fit(X_train, y_train)
best_nb_model = grid_search.best_estimator_

In [None]:
# Performance Metrics

def _report_split(split_label, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one split.

    Args:
        split_label: Name shown in the printed headings (e.g. "Training").
        y_true: Ground-truth labels of the split.
        y_pred: Labels predicted for the split.

    Returns:
        The accuracy score of the split.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # Headings carry the split name so the three reports can be told apart.
    print(f"{split_label} Accuracy: {accuracy}")
    print(f"{split_label} Performance:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    return accuracy


# Evaluate on training set
y_train_pred = best_nb_model.predict(X_train)
train_accuracy = _report_split("Training", y_train, y_train_pred)

# Evaluate on validation set
y_val_pred = best_nb_model.predict(X_val)
val_accuracy = _report_split("Validation", y_val, y_val_pred)

# Evaluate on test set
y_test_pred = best_nb_model.predict(X_test)
test_accuracy = _report_split("Test", y_test, y_test_pred)

## Task 3

In [None]:
# Define the mapping of languages to continents
continent_mapping = {
    'af-ZA': 'Africa',
    'da-DK': 'Europe',
    'de-DE': 'Europe',
    'en-US': 'North America',
    'es-ES': 'Europe',
    'fr-FR': 'Europe',
    'fi-FI': 'Europe',
    'hu-HU': 'Europe',
    'is-IS': 'Europe',
    'it-IT': 'Europe',
    'jv-ID': 'Asia',
    'lv-LV': 'Europe',
    'ms-MY': 'Asia',
    'nb-NO': 'Europe',
    'nl-NL': 'Europe',
    'pl-PL': 'Europe',
    'pt-PT': 'Europe',
    'ro-RO': 'Europe',
    'ru-RU': 'Europe',
    'sl-SL': 'Europe',
    'sv-SE': 'Europe',
    'sq-AL': 'Europe',
    'sw-KE': 'Africa',
    'tl-PH': 'Asia',
    'tr-TR': 'Asia',
    'vi-VN': 'Asia',
    'cy-GB': 'Europe'
}

# Create a dictionary to hold content for each continent.
# dict.fromkeys de-duplicates while keeping first-seen order, so the continent
# files are always produced in the same order (a set would not guarantee that).
continent_files = {
    continent: [] for continent in dict.fromkeys(continent_mapping.values())
}

# Path where the language files are stored (adjust as necessary)
# NOTE(review): Task 1 writes its files to "utts_by_locale" using the name pattern
# "{locale}_{partition}.txt", which does not match the "{lang_code}.txt" files
# expected here -- confirm which layout is intended.
language_files_path = './languages'  # Example path

# Read each language file and append its content to the corresponding continent list
for lang_code, continent in continent_mapping.items():
    file_path = os.path.join(language_files_path, f"{lang_code}.txt")

    # Check if the file exists before reading
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.readlines()  # Read lines from the file
        # A file without a trailing newline would otherwise have its last
        # utterance fused with the first utterance of the next file.
        if content and not content[-1].endswith('\n'):
            content[-1] += '\n'
        continent_files[continent].extend(content)  # Append lines to the continent list
    else:
        print(f"File {file_path} does not exist.")

# Save the combined content for each continent into new files.
# The directory is created first so writing does not fail when it is missing.
os.makedirs(language_files_path, exist_ok=True)
for continent, lines in continent_files.items():
    output_file_path = os.path.join(language_files_path, f"{continent}_dataset.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(lines)  # Write all lines to the continent file

print("Datasets have been collapsed into continent groups.")

In [None]:
# Build the Task 3 frame: 'text' is the utterance and 'label' is the continent of
# its locale. The raw DatasetDict has no 'text' or 'label' column, so they are
# derived here from the training split's 'utt' and 'locale' columns.
#
# The RDA code below works on dense arrays and on n_features x n_features
# covariance matrices, so both the number of rows and the vocabulary size are
# capped. NOTE(review): the two caps are tunable starting points, not tuned values.
RDA_MAX_ROWS = 20_000
RDA_MAX_FEATURES = 200

rda_rows = data1['train'].select_columns(['locale', 'utt']).to_pandas()
rda_rows = rda_rows[rda_rows['locale'].isin(list(continent_mapping))]
if len(rda_rows) > RDA_MAX_ROWS:
    rda_rows = rda_rows.sample(n=RDA_MAX_ROWS, random_state=42)

df = pd.DataFrame({
    'text': rda_rows['utt'].to_numpy(),
    'label': rda_rows['locale'].map(continent_mapping).to_numpy(),
})

# Feature extraction using CountVectorizer
# NOTE: this rebinds `vectorizer`, `X_train`, `X_test`, `y_train` and `y_test`
# from Task 2; re-run the Task 2 cells before re-using those names for Task 2.
vectorizer = CountVectorizer(max_features=RDA_MAX_FEATURES)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def fit_rda(X, y, alpha=0.5):
    """Estimate per-class statistics for regularized discriminant analysis.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        y: 1-D array of n_samples class labels.
        alpha: Accepted for interface compatibility; it is not used during
            fitting (the blending weight is applied in ``predict_rda``).

    Returns:
        A tuple ``(means, covariances, priors, overall_cov)`` where the first
        three are dicts keyed by class label (mean vector, covariance matrix of
        shape (n_features, n_features), and class frequency) and ``overall_cov``
        is the unweighted average of the per-class covariance matrices.

    Raises:
        ValueError: If ``X`` is not a non-empty 2-D array or ``y`` does not
            hold one label per row of ``X``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError("y must be a 1-D array with one label per row of X")

    n_samples, n_features = X.shape
    classes = np.unique(y)
    means = {}
    covariances = {}
    priors = {}

    # Calculate means, covariances, and priors for each class
    for cls in classes:
        X_cls = X[y == cls]
        means[cls] = np.mean(X_cls, axis=0)
        if X_cls.shape[0] > 1:
            # atleast_2d: with a single feature np.cov returns a 0-d value,
            # which would break the matrix operations done at prediction time.
            covariances[cls] = np.atleast_2d(np.cov(X_cls, rowvar=False))
        else:
            # The sample covariance of one point is undefined (NaN from the
            # n - 1 divisor) and would poison overall_cov; use zero scatter.
            covariances[cls] = np.zeros((n_features, n_features))
        priors[cls] = X_cls.shape[0] / n_samples

    # Calculate the overall covariance
    overall_cov = np.mean(list(covariances.values()), axis=0)

    return means, covariances, priors, overall_cov

def predict_rda(X, means, covariances, priors, overall_cov, alpha=0.5):
    """Assign each row of ``X`` to the class with the highest discriminant score.

    For every class the covariance used is
    ``(1 - alpha) * covariances[cls] + alpha * overall_cov`` and the score is
    ``log(prior) - 0.5 * log|cov| - 0.5 * (x - mu)^T cov^{-1} (x - mu)``.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        means: Dict mapping class label to its mean vector.
        covariances: Dict mapping class label to its covariance matrix.
        priors: Dict mapping class label to its prior probability.
        overall_cov: Covariance matrix blended into every class covariance.
        alpha: Blending weight given to ``overall_cov``.

    Returns:
        A NumPy array with one predicted class label per row of ``X``.

    Raises:
        numpy.linalg.LinAlgError: If a blended covariance matrix is singular or
            has a non-positive determinant.
    """
    X = np.asarray(X)
    classes = list(means.keys())
    scores = np.zeros((X.shape[0], len(classes)))

    for i, cls in enumerate(classes):
        centered = X - means[cls]
        cov = np.atleast_2d((1 - alpha) * covariances[cls] + alpha * overall_cov)

        # slogdet returns the log-determinant directly; log(det(cov)) under- or
        # overflows for high-dimensional matrices and flattens every score.
        sign, log_det = np.linalg.slogdet(cov)
        if sign <= 0:
            raise np.linalg.LinAlgError(
                f"Covariance for class {cls!r} is not positive definite; "
                "increase alpha or reduce the number of features."
            )

        # Solve cov @ z = (x - mu) rather than forming the explicit inverse,
        # then take the row-wise dot product to get the quadratic form.
        solved = np.linalg.solve(cov, centered.T).T
        quadratic = np.einsum('ij,ij->i', centered, solved)

        scores[:, i] = np.log(priors[cls]) - 0.5 * log_det - 0.5 * quadratic

    # `classes` is a Python list, which cannot be indexed with an index array;
    # convert it so the per-row argmax can be mapped back to labels.
    return np.asarray(classes)[np.argmax(scores, axis=1)]

# Fit the RDA model and predict the held-out rows, using one shared blending weight.
rda_alpha = 0.5
means, covariances, priors, overall_cov = fit_rda(X_train, y_train, alpha=rda_alpha)

# Make predictions
y_pred = predict_rda(
    X_test,
    means,
    covariances,
    priors,
    overall_cov,
    alpha=rda_alpha,
)

# Evaluation
rda_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", rda_accuracy)
rda_report = classification_report(y_test, y_pred)
print("Classification Report:\n", rda_report)