# Imports and Dependencies

In [1]:
import string
import re
import pandas as pd
import json
import numpy as np
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
import scipy

In [2]:
df = pd.read_csv('./dataset/Language Detection.csv')
# df = pd.read_csv('./archive/dataset.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
X = df['Text']
y= df['Language']

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [5]:
df_list = []
for text in X:
     
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    df_list.append(text)   

df_list[:5]

['nature, in the broadest sense, is the natural, physical, material world or universe.',
 '"nature" can refer to the phenomena of the physical world, and also to life in general.',
 'the study of nature is a large, if not the only, part of science.',
 'although humans are part of nature, human activity is often understood as a separate category from other natural phenomena.',
 '[1] the word nature is borrowed from the old french nature and is derived from the latin word natura, or "essential qualities, innate disposition", and in ancient times, literally meant "birth".']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9000, dtype='int32')
X = cv.fit_transform(df_list) 
print(X.shape) 
len(df_list)

(10337, 9000)


10337

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 41)

In [8]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear') 
y_pred = model.fit(X_train, y_train).predict(X_test)

In [9]:
from sklearn.metrics import f1_score, confusion_matrix
ac = f1_score(y_test, y_pred, average = 'macro')

# Prediction function for a given piece of text

In [37]:
def prediction(text, show_all_percentages=True, top_n=6):
    x = cv.transform([text]).toarray() 
    lang = model.predict(x)
    
    # Get prediction probabilities
    probabilities = model.predict_proba(x)[0]
    
    # Get the confidence percentage for the predicted language
    predicted_lang_index = lang[0]
    confidence_percentage = probabilities[predicted_lang_index] * 100

    print('The text:', text)
    print(f'Predicted language: {lang[0]} with {confidence_percentage:.1f}% confidence')
    
    if show_all_percentages:
        print('All language percentages:')
        # Get all language names and their probabilities
        all_languages = le.classes_
        language_probs = [(all_languages[i], probabilities[i] * 100) for i in range(len(all_languages))]
        
        # Sort by probability (highest first)
        language_probs.sort(key=lambda x: x[1], reverse=True)
        
        # Show top N or all languages
        languages_to_show = language_probs if not top_n else language_probs[:top_n]
        
        for lang_name, prob in languages_to_show:
            print(f'  {lang_name}: {prob:.1f}%')
    
    print('-' * 50)

In [38]:
print("Original language labels:", df['Language'].unique()) 

Original language labels: ['English' 'Malayalam' 'Hindi' 'Tamil' 'Portugeese' 'French' 'Dutch'
 'Spanish' 'Greek' 'Russian' 'Danish' 'Italian' 'Turkish' 'Sweedish'
 'Arabic' 'German' 'Kannada']


In [39]:
testSentences = ['Je voudrais une tasse de café, sil vous plaît.','De-acuma nu te-oi mai vedea, Rămâi, rămâi, cu bine! Mă voi feri în calea mea De tine.','நான் ஒரு புதிய மொழியை கற்றுக்கொண்டு இருக்கிறேன். இது மிகவும் விருப்பமானதாக உள்ளது.','मैं एक नई भाषा सीख रहा हूँ। यह बहुत रोचक है।', 'أنا أتعلم لغة جديدة. إنها ممتعة جدًا.','bu yüzden bu, bir sohbetin ortasındayken yine neredeydik pratik yapmanın harika bir yolu?','Я изучаю новый язык. Это очень интересно.']
for language in testSentences:
    prediction(language)

The text: Je voudrais une tasse de café, sil vous plaît.
Predicted language: 4 with 89.3% confidence
All language percentages:
  French: 89.3%
  Dutch: 6.9%
  Portugeese: 0.7%
  Spanish: 0.4%
  Turkish: 0.4%
  Sweedish: 0.3%
--------------------------------------------------
The text: De-acuma nu te-oi mai vedea, Rămâi, rămâi, cu bine! Mă voi feri în calea mea De tine.
Predicted language: 13 with 38.7% confidence
All language percentages:
  Spanish: 38.7%
  Dutch: 28.5%
  French: 17.5%
  Portugeese: 7.9%
  Turkish: 1.8%
  Danish: 1.4%
--------------------------------------------------
The text: நான் ஒரு புதிய மொழியை கற்றுக்கொண்டு இருக்கிறேன். இது மிகவும் விருப்பமானதாக உள்ளது.
Predicted language: 15 with 86.1% confidence
All language percentages:
  Tamil: 86.1%
  Russian: 1.2%
  Turkish: 1.2%
  Arabic: 1.0%
  Spanish: 1.0%
  Kannada: 1.0%
--------------------------------------------------
The text: मैं एक नई भाषा सीख रहा हूँ। यह बहुत रोचक है।
Predicted language: 7 with 65.7% confidence


In [36]:
prediction('Ik leer Nederlands en ik vind het leuk.')

The text: Ik leer Nederlands en ik vind het leuk.
Predicted language: 2 with 94.2% confidence
All language percentages:
  Dutch: 94.2%
  Spanish: 0.8%
  Sweedish: 0.8%
  Danish: 0.5%
  Turkish: 0.4%
--------------------------------------------------
