## 1) Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
import math
from typing import Dict, List, Tuple, Optional
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

## 2) Dataset Setup Instructions

Before executing the following cells, you need to download the dataset from the Tatoeba Project:

### Step 1: Download the Dataset

- Go to: https://downloads.tatoeba.org/exports/sentences.tar.bz2
- Download the compressed file (approximately 194 MB)
- Extract the archive - you'll get a file called sentences.csv (approximately 678 MB)

### Step 2: Place the File

Put the extracted sentences.csv file in the same directory as this notebook

The file structure should look like:

```
your-project-folder/
├── language_id.ipynb  # This notebook
└── sentences.csv      # The dataset
```

### What's in the Dataset

- Over 14 million sentences in many different languages
- Tab-separated format with columns: ID, Language Code, Sentence
- Languages are coded using ISO 639-3 format (e.g., 'eng' for English)
- Highly imbalanced - some languages have millions of sentences, others just a few hundred


In [3]:
# Load the CSV dataset
file_path = "sentences.csv"

try:
    # The dataset is tab-separated with no header, so we define column names
    df = pd.read_csv(file_path, sep = "\t", header = None, names = ["ID", "Lang", "Sentence"])
    print(f"Total rows: {len(df):,}")
    print(f"Available languages: {df['Lang'].nunique()}")
    print(f"Sample languages:\n{df['Lang'].value_counts().head()}")
    
except FileNotFoundError:
    print("Error: 'sentences.csv' file not found!")
    raise

# Show basic dataset information
print(f"\nDataset Overview:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

Total rows: 12,775,409
Available languages: 423
Sample languages:
eng    1978526
rus    1127493
ita     918424
epo     789476
kab     764433
Name: Lang, dtype: int64

Dataset Overview:
Shape: (12775409, 3)
Columns: ['ID', 'Lang', 'Sentence']


## 3) Filter, Sample, and Prepare Training Data

This step processes the raw dataset into training-ready format:
- Filters to target languages (13 diverse languages)
- Samples equal sentences per language (balanced dataset)
- Maps language codes (ISO 639-3 → ISO 639-1)
- Creates dictionary structure for model training

The balanced approach ensures no language dominates during training.

In [6]:
# List of target languages we want to include
target_languages = [
    "eng",  # English
    "deu",  # German  
    "fra",  # French
    "spa",  # Spanish
    "ita",  # Italian
    "por",  # Portuguese
    "rus",  # Russian
    "jpn",  # Japanese
    "cmn",  # Chinese (Mandarin)
    "tur",  # Turkish
    "pol",  # Polish
    "kor",  # Korean
    "fin"   # Finnish
]

# Friendly names for display
language_names = {
    "eng": "English", 
    "deu": "German", 
    "fra": "French", 
    "spa": "Spanish",
    "ita": "Italian", 
    "por": "Portuguese", 
    "rus": "Russian", 
    "jpn": "Japanese", 
    "cmn": "Chinese", 
    "tur": "Turkish", 
    "pol": "Polish", 
    "kor": "Korean", 
    "fin": "Finnish"}

# Filter the DataFrame
df_filtered = df[df["Lang"].isin(target_languages)]

print(f"Total rows after filtering: {len(df_filtered):,}")
print(f"Language distribution:")

# Check availability and show counts
available_languages = []
for lang in target_languages:
    count = len(df_filtered[df_filtered["Lang"] == lang])
    if count > 0:
        available_languages.append(lang)
        print(f"{lang} ({language_names[lang]}): {count:,} sentences")
    else:
        print(f"{lang} ({language_names[lang]}): Not available")

# Update target languages to only include available ones
target_languages = available_languages
print(f"\nWill proceed with {len(target_languages)} languages")

# Determine sample size based on smallest language
min_samples = min([len(df_filtered[df_filtered["Lang"] == lang]) for lang in target_languages])
sample_size = min(800, min_samples)

print(f"\nSampling {sample_size} sentences per language.")

samples = (df_filtered[df_filtered["Lang"].isin(target_languages)]
           .groupby("Lang")
           .apply(lambda x: x.sample(n=min(sample_size, len(x)), random_state=42))
           .reset_index(drop=True))

# Check final sample counts
print("Final sample distribution:")
sample_counts = samples["Lang"].value_counts().sort_index()
for lang in target_languages:
    count = sample_counts.get(lang, 0)
    print(f"{lang} ({language_names[lang]}): {count}")

# Print one example per language (first 6 to avoid too much output)
# print(f"\nSample sentences:")
# for lang in target_languages[:6]:
#     if lang in samples["Lang"].values:
#         sample_text = samples[samples["Lang"] == lang]["Sentence"].iloc[0]
#         print(f"{lang}: {sample_text}")

Total rows after filtering: 7,621,439
Language distribution:
eng (English): 1,978,526 sentences
deu (German): 721,388 sentences
fra (French): 666,311 sentences
spa (Spanish): 414,431 sentences
ita (Italian): 918,424 sentences
por (Portuguese): 436,229 sentences
rus (Russian): 1,127,493 sentences
jpn (Japanese): 245,837 sentences
cmn (Chinese): 77,801 sentences
tur (Turkish): 738,242 sentences
pol (Polish): 131,575 sentences
kor (Korean): 13,984 sentences
fin (Finnish): 151,198 sentences

Will proceed with 13 languages

Sampling 800 sentences per language.
Final sample distribution:
eng (English): 800
deu (German): 800
fra (French): 800
spa (Spanish): 800
ita (Italian): 800
por (Portuguese): 800
rus (Russian): 800
jpn (Japanese): 800
cmn (Chinese): 800
tur (Turkish): 800
pol (Polish): 800
kor (Korean): 800
fin (Finnish): 800


## 4) Save Filtered Dataset

The processed dataset is saved to a new CSV file.

The saved file `filtered_sentences.csv` will be used in the next steps for training and evaluation.


In [7]:
# Save the processed dataset
output_file = "filtered_sentences.csv"
samples.to_csv(output_file, index = False)

print(f"Filtered dataset saved as '{output_file}'")
print(f"Total rows saved: {len(samples):,}")
print(f"File size: {os.path.getsize(output_file) / 1024:.1f} KB")

Filtered dataset saved as 'filtered_sentences.csv'
Total rows saved: 10,400
File size: 571.5 KB


## 5) Load and Prepare Training Data

This function reads the filtered dataset and organizes it for model training:

- **Language Code Mapping**: Converts ISO 639-3 codes (e.g., 'eng') to shorter ISO 639-1 codes (e.g., 'en')
- **Data Structure**: Creates a dictionary where keys are language codes and values are lists of sentences
- **Statistics**: Shows the number of sentences and average length per language

The output dictionary is ready to be fed directly into the `LanguageIdentifier.train()` method.

In [8]:
def load_training_data(filepath: str) -> Dict[str, List[str]]:
    print(f"Loading training data from {filepath}")
    
    df = pd.read_csv(filepath)
    
    # Clean and normalize language codes
    df['Lang'] = df['Lang'].astype(str).str.strip().str.lower()
    
    # Extended language mapping (ISO 639-3 to ISO 639-1 where possible)
    lang_map = {
        'eng': 'en', 
        'deu': 'de', 
        'fra': 'fr', 
        'spa': 'es',
        'ita': 'it', 
        'por': 'pt', 
        'rus': 'ru', 
        'jpn': 'ja',
        'cmn': 'zh', 
        'tur': 'tr', 
        'pol': 'pl', 
        'kor': 'ko', 
        'fin': 'fi'}
    
    df['Lang'] = df['Lang'].map(lang_map)
    
    print(f"Languages found: {sorted(df['Lang'].unique())}")
    
    # Drop rows where mapping failed
    df = df.dropna(subset=['Lang'])
    
    # Convert to dictionary format
    training_data = defaultdict(list)
    for _, row in df.iterrows():
        training_data[row['Lang']].append(row['Sentence'])
    
    # Convert to regular dict and show statistics
    training_data = dict(training_data)
    print(f"Training data prepared for {len(training_data)} languages:")
    for lang, texts in sorted(training_data.items()):
        print(f"{lang}: {len(texts):,} sentences")
        # Show average sentence length
        avg_length = sum(len(text) for text in texts) / len(texts)
        print(f"Average length: {avg_length:.1f} characters")
    
    return training_data

## 6) Define the Language Identifier Class

This class implements the core language detection algorithm using character n-gram profiles and rank-order distance comparison.

### How it works:

- **Training**: Extract character n-grams from texts, build frequency profiles for each language
- **Prediction**: Compare input text's n-gram profile against all language profiles using distance metrics
- **Parameters**: Uses 3-character n-grams and top 300 features per language by default

The algorithm identifies languages by finding the closest matching n-gram profile based on statistical similarity.


In [10]:
"""
Language Identification using Character N-gram Profiles
Based on the rank-order statistical method described in:
Cavnar, W. B., & Trenkle, J. M. (1994). N-gram-based text categorization.
Proceedings of SDAIR-94, 3rd annual symposium on document analysis and information retrieval.
"""

class LanguageIdentifier:
    def __init__(self, n_gram_size: int = 3, max_features: int = 300):
        """
        Initialize the Language Identifier
        
        Args:
            n_gram_size: Size of character n-grams (default: 3)
            max_features: Maximum number of n-gram features to use per language
        """
        self.n_gram_size = n_gram_size
        self.max_features = max_features
        self.language_profiles = {}
        self.trained = False
    
    def _extract_ngrams(self, text: str) -> List[str]:
        """Extract character n-grams from text"""
        # Clean and normalize text
        text = re.sub(r'[^\w\s]', ' ', text.lower())  # remove punctuation
        text = re.sub(r'\s+', ' ', text.strip())      # normalize whitespace
        
        # Add padding to capture beginning and end of text
        padded_text = '_' * (self.n_gram_size - 1) + text + '_' * (self.n_gram_size - 1)
        
        # Extract n-grams
        ngrams = []
        for i in range(len(padded_text) - self.n_gram_size + 1):
            ngrams.append(padded_text[i:i + self.n_gram_size])
        
        return ngrams
    
    def _create_profile(self, ngrams: List[str]) -> Dict[str, int]:
        """Create frequency profile from n-grams"""
        counter = Counter(ngrams)
        # Return top features ranked by frequency
        return dict(counter.most_common(self.max_features))
    
    def train(self, training_data: Dict[str, List[str]]):
        """
        training_data: Dict with language codes as keys and list of texts as values
        
        building n-gram profiles for each language in the training data and stores them
        """
        for language, texts in training_data.items():
            print(f"Processing {language} - ({len(texts)} texts)")
            all_ngrams = []
            
            for text in texts:
                ngrams = self._extract_ngrams(text)
                all_ngrams.extend(ngrams)
            
            # Create language profile
            self.language_profiles[language] = self._create_profile(all_ngrams)
            print(f"Created profile with {len(self.language_profiles[language])} n-grams")
        
        self.trained = True
        print(f"Training completed for {len(self.language_profiles)} languages")
    
    def _calculate_distance(self, doc_profile: Dict[str, int], lang_profile: Dict[str, int]) -> float:
        """
        Compute the rank-order distance between a document's n-gram profile and a known language profile.
        (`doc_profile` and `lang_profile` are dictionaries containing the top n-grams for the input text 
        and each known language, ranked by frequency.)

        The distance is calculated by comparing the rank positions of shared n-grams.
        For n-grams missing in the language profile, a fixed penalty is added.
        Lower distance indicates higher similarity between the profiles.
        """
        distance = 0
        
        # Create rank dictionaries
        doc_ranks = {ngram: rank for rank, (ngram, _) in enumerate(doc_profile.items())}
        lang_ranks = {ngram: rank for rank, (ngram, _) in enumerate(lang_profile.items())}
        
        # Calculate distance
        for ngram in doc_ranks:
            if ngram in lang_ranks:
                distance += abs(doc_ranks[ngram] - lang_ranks[ngram])
            else:
                # Penalty for n-grams not in language profile
                distance += self.max_features
        
        return distance
    
    def identify_language(self, text: str, return_scores: bool = False):
        """
        Identify the language of given text
        
        Args:
            text: Input text to identify
            return_scores: If True, returns (language, scores_dict)
        
        Returns:
            Detected language code or tuple with scores if return_scores=True
        """
        if not self.trained:
            raise ValueError("Model must be trained before prediction")
        
        # Extract n-grams and create profile
        ngrams = self._extract_ngrams(text)
        doc_profile = self._create_profile(ngrams)
        
        # Calculate distances to all language profiles
        distances = {}
        for language, lang_profile in self.language_profiles.items():
            distances[language] = self._calculate_distance(doc_profile, lang_profile)
        
        # Find language with minimum distance
        predicted_language = min(distances, key = distances.get)
        
        if return_scores:
            # Convert distances to similarity scores (lower distance = higher similarity)
            max_distance = max(distances.values()) if distances.values() else 1
            scores = {lang: 1 - (dist / max_distance) for lang, dist in distances.items()}
            return predicted_language, scores
        
        return predicted_language

# print("LanguageIdentifier class defined successfully!")

## 7) Initialize and Train the Model

Here we create an instance of `LanguageIdentifier`, set it to use 3-character n-grams,  
and train it on the filtered dataset using the top 300 most frequent n-grams per language.


In [11]:
training_data = load_training_data("filtered_sentences.csv")

# Initialize the identifier with optimal parameters
identifier = LanguageIdentifier(n_gram_size = 3, max_features = 300)

print(f"N-gram size: {identifier.n_gram_size}")
print(f"Max features per language: {identifier.max_features}")

# Train the model
print("\n" + "="*50)
identifier.train(training_data)

Loading training data from filtered_sentences.csv
Languages found: ['de', 'en', 'es', 'fi', 'fr', 'it', 'ja', 'ko', 'pl', 'pt', 'ru', 'tr', 'zh']
Training data prepared for 13 languages:
de: 800 sentences
Average length: 46.7 characters
en: 800 sentences
Average length: 41.9 characters
es: 800 sentences
Average length: 38.7 characters
fi: 800 sentences
Average length: 35.4 characters
fr: 800 sentences
Average length: 40.9 characters
it: 800 sentences
Average length: 34.7 characters
ja: 800 sentences
Average length: 18.1 characters
ko: 800 sentences
Average length: 15.3 characters
pl: 800 sentences
Average length: 32.8 characters
pt: 800 sentences
Average length: 43.5 characters
ru: 800 sentences
Average length: 33.0 characters
tr: 800 sentences
Average length: 33.7 characters
zh: 800 sentences
Average length: 11.3 characters
N-gram size: 3
Max features per language: 300

Processing zh - (800 texts)
Created profile with 300 n-grams
Processing de - (800 texts)
Created profile with 300 n-

## 8) Evaluate and Test

In [12]:
class EfficientEvaluation:
    
    def __init__(self, trained_identifier, test_size = 0.2, random_state = 42):
        self.identifier = trained_identifier
        self.test_size = test_size
        self.random_state = random_state
        self.results = {}
        
        # Verify the model is trained
        if not self.identifier.trained:
            raise ValueError("The provided identifier must be already trained!")
        
        print(f"Using pre-trained model with {len(self.identifier.language_profiles)} languages")
        print(f"Languages: {', '.join(sorted(self.identifier.language_profiles.keys()))}")
    
    def prepare_test_data(self, data_path = "filtered_sentences.csv"):        
        # Load data
        df = pd.read_csv(data_path)
        
        # Language mapping (same as training)
        lang_map = {
            'eng': 'en', 
            'deu': 'de', 
            'fra': 'fr', 
            'spa': 'es',
            'ita': 'it', 
            'por': 'pt', 
            'rus': 'ru', 
            'jpn': 'ja',
            'cmn': 'zh', 
            'tur': 'tr', 
            'pol': 'pl', 
            'kor': 'ko', 
            'fin': 'fi'
        }
        
        df['Lang'] = df['Lang'].map(lang_map)
        df = df.dropna(subset=['Lang'])
        
        # Only keep languages that our model supports
        model_languages = set(self.identifier.language_profiles.keys())
        df = df[df['Lang'].isin(model_languages)]
        
        print(f"Available test samples: {len(df)}")
        print(f"Languages in test data: {sorted(df['Lang'].unique())}")
        
        # Create train/test split for evaluation
        X = df['Sentence'].values
        y = df['Lang'].values
        
        # Use stratified split to maintain class balance
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = self.test_size, stratify = y, random_state = self.random_state)
        
        print(f"Test set size: {len(X_test)} samples")
        
        # Show test set distribution
        test_dist = pd.Series(y_test).value_counts().sort_index()
        
        print("Test set distribution:")
        
        for lang, count in test_dist.items():
            print(f"{lang}: {count} samples")
        
        return X_test, y_test
    
    def evaluate_model(self, X_test, y_test):
        print(f"\nEvaluating model on {len(X_test)} test samples")
        
        predictions = []
        confidences = []
        prediction_scores = []
        
        # Get predictions for all test samples
        for i, text in enumerate(X_test):
#             if (i + 1) % 100 == 0:
#                 print(f"Processed {i + 1}/{len(X_test)} samples")       
            try:
                pred, scores = self.identifier.identify_language(text, return_scores = True)
                predictions.append(pred)
                
                # Get confidence as max score
                max_confidence = max(scores.values()) if scores else 0
                confidences.append(max_confidence)
                prediction_scores.append(scores)
                
            except Exception as e:
                print(f"Error processing sample {i}: {e}")
                predictions.append('unknown')
                confidences.append(0.0)
                prediction_scores.append({})
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average = 'weighted')
        
        # Store results
        self.results = {
            'test_accuracy': accuracy,
            'test_f1': f1,
            'predictions': predictions,
            'true_labels': y_test,
            'confidences': confidences,
            'prediction_scores': prediction_scores,
            'test_texts': X_test
        }
        
        print(f"Evaluation completed!")
        print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        print(f"F1-Score: {f1:.3f}")
        print(f"Average Confidence: {np.mean(confidences):.3f}")
        
        return accuracy, f1
    
    def detailed_report(self):
        """
        Generate detailed classification report
        """
        if not self.results:
            print("No results available. Run evaluate_model first.")
            return
        
        print(f"\nDETAILED CLASSIFICATION REPORT")
        print("=" * 60)
        
        # Overall classification report
        report = classification_report(self.results['true_labels'], self.results['predictions'], digits = 3)
        print(report)
        
        # Per-language detailed analysis
        print(f"\nPER-LANGUAGE ANALYSIS:")
        print("-" * 40)
        
        languages = sorted(set(self.results['true_labels']))
        
        for lang in languages:
            # Get indices for this language
            lang_mask = np.array(self.results['true_labels']) == lang
            lang_indices = np.where(lang_mask)[0]
            
            if len(lang_indices) > 0:
                lang_true = np.array(self.results['true_labels'])[lang_indices]
                lang_pred = np.array(self.results['predictions'])[lang_indices]
                lang_conf = np.array(self.results['confidences'])[lang_indices]
                
                # Calculate metrics
                lang_accuracy = accuracy_score(lang_true, lang_pred)
                correct_mask = lang_true == lang_pred
                correct_conf = lang_conf[correct_mask]
                incorrect_conf = lang_conf[~correct_mask]
                
                print(f"\n{lang.upper()}:")
                print(f"Accuracy: {lang_accuracy:.3f} ({len(lang_indices)} samples)")
                print(f"Avg confidence (correct): {np.mean(correct_conf):.3f}")
                if len(incorrect_conf) > 0:
                    print(f"Avg confidence (incorrect): {np.mean(incorrect_conf):.3f}")
                    
                    # Show most common errors for this language
                    wrong_predictions = lang_pred[~correct_mask]
                    if len(wrong_predictions) > 0:
                        error_counts = pd.Series(wrong_predictions).value_counts()
                        print(f"Most confused with: {error_counts.head(3).to_dict()}")
    
    def confusion_matrix_analysis(self):
        """Analyze confusion matrix"""
        if not self.results:
            print("No results available.")
            return
        
        print(f"\nCONFUSION MATRIX ANALYSIS")
        print("=" * 40)
        
        # Calculate confusion matrix
        cm = confusion_matrix(self.results['true_labels'], self.results['predictions'])
        languages = sorted(set(self.results['true_labels']))
        
        # Create DataFrame for better visualization
        cm_df = pd.DataFrame(cm, index = languages, columns = languages)
        print("Confusion Matrix:")
        print(cm_df.to_string())
        
        # Find most problematic pairs
        print(f"\nMOST CONFUSED LANGUAGE PAIRS:")
        print("-" * 35)
        
        confusion_pairs = []
        for i, lang1 in enumerate(languages):
            for j, lang2 in enumerate(languages):
                if i != j and cm[i][j] > 0:
                    confusion_pairs.append((lang1, lang2, cm[i][j]))
        
        # Sort by confusion count
        confusion_pairs.sort(key=lambda x: x[2], reverse=True)
        
        for lang1, lang2, count in confusion_pairs[:8]:  # Top 8 confusions
            total_lang1 = sum(cm[languages.index(lang1)])
            error_rate = (count / total_lang1) * 100 if total_lang1 > 0 else 0
            print(f"  {lang1} → {lang2}: {count} errors ({error_rate:.1f}%)")
    
    def error_analysis(self, n_examples=10):
        """Detailed analysis of prediction errors"""
        if not self.results:
            print("No results available.")
            return
        
        print(f"\nERROR ANALYSIS")
        print("=" * 30)
        
        # Find all errors
        errors = []
        for i, (true_lang, pred_lang, text, conf) in enumerate(
            zip(self.results['true_labels'], self.results['predictions'], 
                self.results['test_texts'], self.results['confidences'])
        ):
            if true_lang != pred_lang:
                errors.append({
                    'index': i,
                    'true_lang': true_lang,
                    'pred_lang': pred_lang,
                    'text': text,
                    'confidence': conf,
                    'text_length': len(text)
                })
        
        error_rate = len(errors) / len(self.results['true_labels']) * 100
        print(f"Total errors: {len(errors)}/{len(self.results['true_labels'])} ({error_rate:.1f}%)")
        
        if errors:
            # Sort errors by confidence (high confidence errors are more interesting)
            errors.sort(key=lambda x: x['confidence'], reverse=True)
            
            print(f"\nSample errors (showing {min(n_examples, len(errors))}):")
            print("-" * 60)
            
            for i, error in enumerate(errors[:n_examples], 1):
                print(f"{i}. TRUE: {error['true_lang'].upper()} | PRED: {error['pred_lang'].upper()} | CONF: {error['confidence']:.3f}")
                text_display = error['text'][:120] + "..." if len(error['text']) > 120 else error['text']
                print(f"Text ({error['text_length']} chars): {text_display}")
                print()
            
            # Analyze error patterns
            print(f"ERROR PATTERNS:")
            print("-" * 20)
            
            # Errors by text length
            error_lengths = [e['text_length'] for e in errors]
            print(f"Average error text length: {np.mean(error_lengths):.1f} chars")
            print(f"Shortest error text: {min(error_lengths)} chars")
            print(f"Longest error text: {max(error_lengths)} chars")
            
            # High confidence errors (model was wrong but confident)
            high_conf_errors = [e for e in errors if e['confidence'] > 0.8]
            if high_conf_errors:
                print(f"High confidence errors (>0.8): {len(high_conf_errors)}")
    
    def save_results(self, filepath="evaluation_results.json"):
        """Save evaluation results"""
        if not self.results:
            print("No results to save.")
            return
        
        # Prepare results for JSON serialization
        save_data = {
            'test_accuracy': float(self.results['test_accuracy']),
            'test_f1': float(self.results['test_f1']),
            'average_confidence': float(np.mean(self.results['confidences'])),
            'total_samples': len(self.results['predictions']),
            'model_languages': list(self.identifier.language_profiles.keys()),
            'evaluation_timestamp': pd.Timestamp.now().isoformat()
        }
        
        import json
        with open(filepath, 'w') as f:
            json.dump(save_data, f, indent=2)
        
        print(f"Results saved to {filepath}")

# ================================
# USAGE EXAMPLE
# ================================

def run_evaluation(trained_identifier, data_path="filtered_sentences.csv"):

    print("STARTING EVALUATION")
    print("=" * 50)
    
    # Initialize evaluator with pre-trained model
    evaluator = EfficientEvaluation(trained_identifier, test_size=0.3)
    
    # Prepare test data
    X_test, y_test = evaluator.prepare_test_data(data_path)
    
    # Evaluate model
    accuracy, f1 = evaluator.evaluate_model(X_test, y_test)
    
    # Generate detailed reports
    evaluator.detailed_report()
    evaluator.confusion_matrix_analysis()
    evaluator.error_analysis(n_examples=5)
    
    # Save results
    evaluator.save_results()
    
    print(f"\nEVALUATION COMPLETED!")
    print(f"Final Results: Accuracy={accuracy:.3f}, F1={f1:.3f}")
    
    return evaluator

# Run evaluation using trained model
print("Starting evaluation with pre-trained model")
evaluator = run_evaluation(identifier, "filtered_sentences.csv")

Starting evaluation with pre-trained model
STARTING EVALUATION
Using pre-trained model with 13 languages
Languages: de, en, es, fi, fr, it, ja, ko, pl, pt, ru, tr, zh
Available test samples: 10400
Languages in test data: ['de', 'en', 'es', 'fi', 'fr', 'it', 'ja', 'ko', 'pl', 'pt', 'ru', 'tr', 'zh']
Test set size: 3120 samples
Test set distribution:
de: 240 samples
en: 240 samples
es: 240 samples
fi: 240 samples
fr: 240 samples
it: 240 samples
ja: 240 samples
ko: 240 samples
pl: 240 samples
pt: 240 samples
ru: 240 samples
tr: 240 samples
zh: 240 samples

Evaluating model on 3120 test samples
Evaluation completed!
Accuracy: 0.968 (96.8%)
F1-Score: 0.968
Average Confidence: 0.340

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

          de      0.975     0.983     0.979       240
          en      0.959     0.979     0.969       240
          es      0.919     0.896     0.907       240
          fi      0.979     0.992     0.986       240
          f

## 9) Model Analysis and Statistics

In this part, we explore which character n-grams are most frequent in each language profile.  
We also calculate overlap between languages to see how similar or different their profiles are.


In [13]:
# Analyze the language profiles
for language, profile in identifier.language_profiles.items():
    print(f"\n Top 15 n-grams for {language.upper()}:")
    
    # Sort by frequency and display top n-grams
    sorted_ngrams = sorted(profile.items(), key = lambda x: x[1], reverse = True)[:15]
    
    for i, (ngram, freq) in enumerate(sorted_ngrams, 1):
        # Replace underscores with ⎵ for better visualization
        display_ngram = ngram.replace('_', '⎵')
        print(f"    {i:2d}. '{display_ngram}' ({freq:,} occurrences)")

# Show model statistics
print(f"\n MODEL STATISTICS:")
print(f"Total languages: {len(identifier.language_profiles)}")
print(f"N-gram size: {identifier.n_gram_size}")
print(f"Max features per language: {identifier.max_features}")

total_ngrams = sum(len(profile) for profile in identifier.language_profiles.values())
print(f"Total unique n-grams across all languages: {total_ngrams:,}")

# Calculate overlap between language profiles
print(f"\n N-GRAM OVERLAP ANALYSIS:")
languages = list(identifier.language_profiles.keys())
for i, lang1 in enumerate(languages):
    for lang2 in languages[i+1:]:
        set1 = set(identifier.language_profiles[lang1].keys())
        set2 = set(identifier.language_profiles[lang2].keys())
        overlap = len(set1.intersection(set2))
        total_unique = len(set1.union(set2))
        overlap_pct = (overlap / total_unique) * 100 if total_unique > 0 else 0
        print(f"   {lang1}-{lang2}: {overlap} shared n-grams ({overlap_pct:.1f}% overlap)")


 Top 15 n-grams for ZH:
     1. '⎵⎵我' (199 occurrences)
     2. '⎵⎵你' (77 occurrences)
     3. '了⎵⎵' (63 occurrences)
     4. '⎵⎵他' (62 occurrences)
     5. '的⎵⎵' (33 occurrences)
     6. '⎵⎵她' (32 occurrences)
     7. '嗎⎵⎵' (31 occurrences)
     8. '⎵⎵這' (26 occurrences)
     9. '⎵⎵这' (25 occurrences)
    10. '⎵⎵汤' (24 occurrences)
    11. '⎵汤姆' (24 occurrences)
    12. '人⎵⎵' (20 occurrences)
    13. '⎵⎵那' (19 occurrences)
    14. '⎵我們' (18 occurrences)
    15. '⎵我们' (17 occurrences)

 Top 15 n-grams for DE:
     1. 'en ' (580 occurrences)
     2. 'ich' (510 occurrences)
     3. 'er ' (419 occurrences)
     4. 'ch ' (404 occurrences)
     5. 'ein' (360 occurrences)
     6. 'ie ' (291 occurrences)
     7. 'n⎵⎵' (287 occurrences)
     8. 'sch' (245 occurrences)
     9. 'cht' (232 occurrences)
    10. 'st ' (230 occurrences)
    11. 'en⎵' (228 occurrences)
    12. 'in ' (217 occurrences)
    13. ' ge' (213 occurrences)
    14. ' de' (206 occurrences)
    15. ' ei' (200 occurrences)

 To

## 10) Save the Trained Model

To avoid retraining each time, we save the trained model as a JSON file.  
The file includes the n-gram settings and the frequency profiles for all trained languages.


In [14]:
def save_model(identifier, filepath: str):
    if not identifier.trained:
        raise ValueError("Cannot save untrained model")
    
    model_data = {
        'n_gram_size': identifier.n_gram_size,
        'max_features': identifier.max_features,
        'language_profiles': identifier.language_profiles,
        'languages': list(identifier.language_profiles.keys()),
        'total_features': sum(len(profile) for profile in identifier.language_profiles.values())
    }
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(model_data, f, ensure_ascii=False, indent=2)
    
    return model_data

# Save the model
model_file = "language_identification_model.json"
print(f"Saving trained model")

try:
    model_info = save_model(identifier, model_file)
    file_size = os.path.getsize(model_file) / 1024  # Size in KB
    
    print(f"Model saved successfully!")
    print(f"File: {model_file}")
    print(f"Size: {file_size:.1f} KB")
    print(f"Languages: {', '.join(model_info['languages'])}")
    print(f"Total features: {model_info['total_features']:,}")
    
except Exception as e:
    print(f"Error saving model: {e}")

Saving trained model
Model saved successfully!
File: language_identification_model.json
Size: 73.3 KB
Languages: zh, de, en, fi, fr, it, ja, ko, pl, pt, ru, es, tr
Total features: 3,900


## 11) Load a Saved Model

Load a previously saved model (from a JSON file).  
You can use it in a new session without retraining everything from scratch.


In [18]:
def load_saved_model(filepath: str) -> LanguageIdentifier:
    print(f"Loading model from {filepath}")
    
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)
        
        # Create new identifier instance
        loaded_identifier = LanguageIdentifier(
            n_gram_size=model_data['n_gram_size'],
            max_features=model_data['max_features']
        )
        
        # Load the language profiles
        loaded_identifier.language_profiles = model_data['language_profiles']
        loaded_identifier.trained = True
        
        print(f"Model loaded successfully!")
        print(f"Languages: {', '.join(model_data.get('languages', []))}")
        print(f"Total features: {model_data.get('total_features', 'Unknown'):,}")
        
        return loaded_identifier
        
    except FileNotFoundError:
        print(f"Model file '{filepath}' not found!")
        return None
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# load the model
loaded_model = load_saved_model("language_identification_model.json")
if loaded_model:
    test_text = "This is a test sentence."
    result = loaded_model.identify_language(test_text)
    print(f"Test result: {result}")

Loading model from language_identification_model.json
Model loaded successfully!
Languages: zh, de, en, fi, fr, it, ja, ko, pl, pt, ru, es, tr
Total features: 3,900
Test result: en
The model loading function is ready!
