# Language Detection Model


In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive/ so that files
# stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [2]:
import numpy as np
import glob
import pandas as pd

In [3]:
# Read the language-detection CSV from the mounted Drive folder, then preview
# the first two rows to confirm the load worked.
dataset_path = "/content/gdrive/MyDrive/GSSOC-ML-CaPsule/Language Detection.csv"
df = pd.read_csv(dataset_path)
df.head(2)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English


In [4]:
# Separate the raw text (model input) from the language name (target label).
X, y = df['Text'], df['Language']

In [5]:
# Encode the nominal language names as integer class ids.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit on the original column instead of on `y` itself. With `y = le.fit_transform(y)`
# a second run of this cell would encode the already-encoded integers, and
# `le.classes_` would then hold integers rather than the language names.
y = le.fit_transform(df['Language'])
# Displayed as the cell output: the class names, indexed by their encoded id.
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [6]:
import re

# Every character that gets blanked out: the listed punctuation, the ASCII
# digits and both square brackets, folded into a single character class.
symbols_and_digits = r'[!@#$().,"%^*?:;~`0-9\[\]]'

processed_list = []
for content in X:
    # Replace each symbol/digit with a space, trim the ends, then lower-case.
    content = re.sub(symbols_and_digits, ' ', content).strip().lower()
    processed_list.append(content)

# Preview the first two cleaned phrases.
processed_list[0:2]

['nature  in the broadest sense  is the natural  physical  material world or universe',
 'nature  can refer to the phenomena of the physical world  and also to life in general']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

def word_tokenizer(phrase):
  """Split a phrase into its non-empty word tokens.

  Args:
    phrase: The text to tokenize (a ``str``).

  Returns:
    A list of tokens in their original order. Leading/trailing whitespace and
    runs of consecutive whitespace never produce empty tokens, so an empty or
    all-whitespace phrase yields ``[]``.
  """
  # str.split() with no separator splits on any run of whitespace and drops
  # empty pieces. Splitting on a literal " " left words separated only by a
  # tab or newline glued together as one token.
  return phrase.split()


In [8]:
# Tokenize every cleaned phrase: one token list per document, in input order.
phrase_count = len(processed_list)
tokenized_docset = [word_tokenizer(phrase) for phrase in processed_list]
# Sanity check: the number of tokenized documents.
print(len(tokenized_docset))

10337


In [9]:
# Build the vocabulary: every distinct token, in order of first appearance.
# dict.fromkeys de-duplicates using hashed lookups while keeping insertion
# order, so the result is the same list the `w not in vocabulary` scan produced
# without rescanning the whole list for every token (which was quadratic in
# the vocabulary size).
vocabulary = list(dict.fromkeys(w for arr in tokenized_docset for w in arr))

len(vocabulary)

48270

In [10]:
# Map each vocabulary word to its position in `vocabulary`; that position is
# later used as the word's slot in the count vectors.
vocab_size = len(vocabulary)
val = np.arange(vocab_size)
word_dic = {word: index for word, index in zip(vocabulary, val)}
# Lookup example: word_dic[<token>] returns that token's index.

In [11]:
# Unigram count vectors: for each document, a length-`vocab_size` array whose
# entry at a word's index is the number of times that word occurs.
count_vec = []
for doc_index in range(phrase_count):
  doc_counts = np.zeros(vocab_size)
  for token in tokenized_docset[doc_index]:
    doc_counts[word_dic[token]] += 1
  count_vec.append(doc_counts)

In [12]:
# Stack the per-document count arrays into a single 2-D array. Note that this
# rebinds `X`, which previously held the raw text column.
X=np.array(count_vec)
# Displayed as the cell output: the array's shape.
X.shape

(10337, 48270)

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the documents for evaluation. The shuffle is seeded so the
# split, and every metric computed from it, is identical on each
# Restart & Run All; without `random_state` the reported accuracy drifts
# from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [14]:
#using Multinomial Naive Bayes for classification
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the encoded language id for every held-out document.
y_pred = model.predict(x_test)

# The confusion matrix is kept in `cm` for inspection; only the overall
# accuracy is reported below.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy of Model :",accuracy*100,"%")

Accuracy of Model : 97.16312056737588 %


In [20]:
# Show one held-out example: its count vector, the predicted encoded label and
# the true encoded label.
sample_index = 0
sample_report = (
    ("Test Content (Count Vector of the content) : ", x_test[sample_index]),
    ("Predicted label(Encoded) : ", y_pred[sample_index]),
    ("Actual Label (Encoded) : ", y_test[sample_index]),
)
for caption, value in sample_report:
    print(caption, value)

Test Content (Count Vector of the content) :  [0. 2. 2. ... 0. 0. 0.]
Predicted label(Encoded) :  3
Actual Label (Encoded) :  3
