# Imports and Dependencies

In [24]:
import string
import re
import pandas as pd
import json
import numpy as np
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
import scipy

In [25]:
df = pd.read_csv('./dataset/Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [26]:
X = df['Text']
y= df['Language']

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [28]:
df_list = []
for text in X:
     
    text = re.sub(r'([^a-zA-Z0-9\s])', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()  
    text = text.lower()
    df_list.append(text)   

df_list[:5]

['nature in the broadest sense is the natural physical material world or universe',
 'nature can refer to the phenomena of the physical world and also to life in general',
 'the study of nature is a large if not the only part of science',
 'although humans are part of nature human activity is often understood as a separate category from other natural phenomena',
 '1 the word nature is borrowed from the old french nature and is derived from the latin word natura or essential qualities innate disposition and in ancient times literally meant birth']

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, dtype='int32')
X = cv.fit_transform(df_list) 
print(X.shape) 
len(df_list)

(10337, 5000)


10337

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 41)

In [33]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')  # 'liblinear' or 'saga'
y_pred = model.fit(X_train, y_train).predict(X_test)

In [34]:
from sklearn.metrics import f1_score, confusion_matrix
ac = f1_score(y_test, y_pred, average = 'macro')

In [35]:
def prediction(text):
    x = cv.transform([text]).toarray()  # Transform input text
    lang = model.predict(x)   # Use lowercase 'x' (not 'X')
    try:
        decoded = le.inverse_transform(lang)
    except ValueError:
        decoded = lang  # fallback if unseen
    print('The language is:', decoded)



In [38]:
prediction('"Il gatto dormiva tranquillamente sul davanzale mentre fuori pioveva')

The language is: ['Italian']


In [39]:
prediction('The old bicycle creaked as it rolled down the quiet, cobblestone street')

The language is: ['English']


In [46]:
prediction('bu yüzden bu, bir sohbetin ortasındayken yine neredeydik pratik yapmanın harika bir yolu?')

The language is: ['Turkish']


In [45]:
prediction('')

The language is: ['Russian']


In [41]:
print("Original language labels:", df['Language'].unique()) 

Original language labels: ['English' 'Malayalam' 'Hindi' 'Tamil' 'Portugeese' 'French' 'Dutch'
 'Spanish' 'Greek' 'Russian' 'Danish' 'Italian' 'Turkish' 'Sweedish'
 'Arabic' 'German' 'Kannada']


In [74]:
prediction('Ett gammalt fotografi låg gömt i lådan.')

The language is: ['Russian']
