# Imports and Dependencies

In [4]:
import string
import re
import pandas as pd
import json
import numpy as np
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
import scipy

In [5]:
df = pd.read_csv('./archive/dataset.csv')
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [6]:
X = df['Text']
y= df['language']

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [8]:
df_list = []
for text in X:
     
    text = re.sub(r'([^a-zA-Z0-9\s])', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()  
    text = text.lower()
    df_list.append(text)   

df_list[:5]

['klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundem rke aastal viidi ta surnukeha mausoleumist ra ja kremeeriti zl ni linn kandis aastatel nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel nime gotvald',
 'sebes joseph pereira thomas p eng the jesuits and the sino russian treaty of nerchinsk the diary of thomas pereira bibliotheca instituti historici s i rome libris',
 'thanon charoen krung',
 '',
 'de spons behoort tot het geslacht haliclona en behoort tot de familie chalinidae de wetenschappelijke naam van de soort werd voor het eerst geldig gepubliceerd in door kudelin']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, dtype='int32')
X = cv.fit_transform(df_list[:22000]) 
print(X.shape) 
len(df_list)

(22000, 5000)


22000

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 41)

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')  # 'liblinear' or 'saga'
y_pred = model.fit(X_train, y_train).predict(X_test)

In [12]:
from sklearn.metrics import f1_score, confusion_matrix
ac = f1_score(y_test, y_pred, average = 'macro')

In [15]:
def prediction(text):
    x = cv.transform([text]).toarray()  # Transform input text
    lang = model.predict(x)   # Use lowercase 'x' (not 'X')
    try:
        decoded = le.inverse_transform(lang)
    except ValueError:
        decoded = lang  # fallback if unseen
    print('The language is:', decoded)



In [18]:
prediction('Salutare Baiatul meu tocmai a crescut si trebuie sa se angajeze la firma dumneavoastră')

The language is: ['Urdu']


In [20]:
prediction('Je mens en militant avec laide de Belzébuth')

The language is: ['Japanese']


In [21]:
print("Original language labels:", df['language'].unique())  # Should show ['en', 'fr', 'de', ...]

Original language labels: ['Estonian' 'Swedish' 'Thai' 'Tamil' 'Dutch' 'Japanese' 'Turkish' 'Latin'
 'Urdu' 'Indonesian' 'Portugese' 'French' 'Chinese' 'Korean' 'Hindi'
 'Spanish' 'Pushto' 'Persian' 'Romanian' 'Russian' 'English' 'Arabic']
