In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import re

# Data loading and preprocessing
def load_and_preprocess_data(train):
    data = pd.read_csv(train)
    
    data['cleaned_text'] = data['text'].apply(lambda x: x.lower())
    data['cleaned_text'] = data['cleaned_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    
    return data

data_identify = load_and_preprocess_data("train_updated.csv")
x_identify = data_identify["cleaned_text"]
y_identify = data_identify["labels"]

label_encoder = LabelEncoder()
y_identify_encoded = label_encoder.fit_transform(y_identify)

# Feature extraction
# Use TF-IDF with word and character n-grams
tfidf_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),  
    max_features=10000,
    min_df=5
)

tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),  
    max_features=10000,
    min_df=5
)

X_word = tfidf_word.fit_transform(x_identify)
X_char = tfidf_char.fit_transform(x_identify)

X_combined = np.hstack((X_word.toarray(), X_char.toarray()))

X_train, X_val, y_train, y_val = train_test_split(
    X_combined, y_identify_encoded, test_size=0.2, random_state=42, stratify=y_identify_encoded
)
svm_model = LinearSVC(C=1.0, random_state=42)

# Train SVM model
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_val)
print(f"Validation accuracy (SVM): {accuracy_score(y_val, svm_preds):.4f}")

final_model = svm_model

# Re-train on full training data
final_model.fit(X_combined, y_identify_encoded)

# Load and evaluate on test data
data_identify_test = load_and_preprocess_data("test_updated.csv")
x_identify_test = data_identify_test["cleaned_text"]
y_identify_test = data_identify_test["labels"]
y_identify_test_encoded = label_encoder.transform(y_identify_test)


Validation accuracy (SVM): 0.9949


In [2]:

# Transform test data
X_test_word = tfidf_word.transform(x_identify_test)
X_test_char = tfidf_char.transform(x_identify_test)
X_test_combined = np.hstack((X_test_word.toarray(), X_test_char.toarray()))

# Evaluate on test set
test_preds = final_model.predict(X_test_combined)
accuracy_identify = accuracy_score(y_identify_test_encoded, test_preds)
print(f'Language Identification Model Accuracy: {accuracy_identify:.4f}')
print("\nClassification Report:")
print(classification_report(
    y_identify_test_encoded, 
    test_preds,
    target_names=label_encoder.classes_
))

# Create function for language identification
def identify_language(text):
    # Preprocess input text
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    
    # Transform using both vectorizers
    text_word_features = tfidf_word.transform([cleaned_text])
    text_char_features = tfidf_char.transform([cleaned_text])
    
    # Combine features
    text_combined = np.hstack((text_word_features.toarray(), text_char_features.toarray()))
    
    # Predict
    predicted_label = final_model.predict(text_combined)
    predicted_language = label_encoder.inverse_transform(predicted_label)
    confidence = np.max(final_model.predict_proba(text_combined)) if hasattr(final_model, 'predict_proba') else None
    
    return predicted_language[0], confidence

# Interactive language identification
if __name__ == "__main__":

    user_input_identify = input("\nEnter a text for language identification: ")
        
    predicted_lang, confidence = identify_language(user_input_identify)
    print(f'Identified Language: {predicted_lang}')
    if confidence:
        print(f'Confidence: {confidence:.2f}')

Language Identification Model Accuracy: 0.9931

Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       500
   Bulgarian       1.00      1.00      1.00       500
     Chinese       0.96      1.00      0.98       500
       Dutch       1.00      0.99      0.99       500
     English       1.00      1.00      1.00       500
      French       1.00      1.00      1.00       500
      German       1.00      1.00      1.00       500
       Greek       1.00      1.00      1.00       500
       Hindi       1.00      0.97      0.98       500
     Italian       1.00      0.99      0.99       500
    Japanese       1.00      1.00      1.00       500
      Polish       1.00      1.00      1.00       500
  Portuguese       0.99      0.99      0.99       500
     Russian       1.00      1.00      1.00       500
     Spanish       1.00      1.00      1.00       500
     Swahili       0.97      1.00      0.98       500
        Th