In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/ so files
# stored on Drive can be opened through ordinary paths.
from google.colab import drive
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import numpy as np
import glob
import pandas as pd

In [None]:
# Read the language-detection corpus from the mounted Drive folder and
# display the resulting frame.
DATASET_PATH = "/content/gdrive/MyDrive/GSSOC-ML-CaPsule/Language Detection.csv"
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [None]:
# Separate the raw sentences (features) from their language names (targets).
X, y = df['Text'], df['Language']

In [None]:
# Encode the language names as integer class ids.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on the original column rather than on `y` itself: the cell then stays
# idempotent, so re-running it cannot re-encode already-encoded integers and
# silently replace the language names stored in `le.classes_`.
y = le.fit_transform(df['Language'])
le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [None]:
import re
# Clean every sentence: replace symbols, digits and square brackets with a
# space, trim surrounding whitespace, then lower-case.
# One precompiled character class replaces the three separate re.sub passes;
# each matched character still becomes a single space, so the cleaned text is
# unchanged.
_SYMBOLS_AND_DIGITS = re.compile(r'[!@#$().,"%^*?:;~`0-9\[\]]')

# Read from df['Text'] rather than `X`: `X` is rebound to the numeric count
# matrix later in the notebook, which would break this cell on a re-run.
processed_list = [
    _SYMBOLS_AND_DIGITS.sub(' ', content).strip().lower()
    for content in df['Text']
]

# Preview only: echoing the whole corpus floods the notebook output.
processed_list[:5]

['nature  in the broadest sense  is the natural  physical  material world or universe',
 'nature  can refer to the phenomena of the physical world  and also to life in general',
 'the study of nature is a large  if not the only  part of science',
 'although humans are part of nature  human activity is often understood as a separate category from other natural phenomena',
 'the word nature is borrowed from the old french nature and is derived from the latin word natura  or  essential qualities  innate disposition   and in ancient times  literally meant  birth',
 'in ancient philosophy  natura is mostly used as the latin translation of the greek word physis  φύσις   which originally related to the intrinsic characteristics that plants  animals  and other features of the world develop of their own accord',
 'the concept of nature as a whole  the physical universe  is one of several expansions of the original notion     it began with certain core applications of the word φύσις by pre-socra

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def word_tokenizer(phrase):
  """Split a phrase into its whitespace-separated words.

  Args:
    phrase: The text to tokenize.

  Returns:
    A list of the non-empty words in ``phrase``, in their original order.
    An empty or all-whitespace phrase yields ``[]``.
  """
  # str.split() with no separator splits on any run of whitespace (spaces,
  # tabs, newlines) and never yields empty strings, so words separated by a
  # tab or newline are no longer glued into one token, and no manual
  # strip/filter pass or counter is needed.
  return phrase.split()


In [None]:
# Tokenize every cleaned sentence. Iterating directly over the list replaces
# the manual index/while loop and no longer leaves a stray global counter `i`.
phrase_count = len(processed_list)
tokenized_docset = [word_tokenizer(phrase) for phrase in processed_list]
print(len(tokenized_docset))

10337


In [None]:
# Create the vocabulary: every distinct word, in order of first appearance.
# dict.fromkeys de-duplicates by hashing while preserving insertion order;
# the previous `w not in vocabulary` test rescanned the whole list for each
# token, which is quadratic in the vocabulary size.
vocabulary = list(dict.fromkeys(w for arr in tokenized_docset for w in arr))

len(vocabulary)

48270

In [None]:
# Build a lookup table from each vocabulary word to its integer position.
vocab_size = len(vocabulary)
val = np.arange(vocab_size)
word_dic = {word: index for word, index in zip(vocabulary, val)}
# Example lookup: word_dic['टारण्टीनो']

In [None]:
# Count vector creation (unigrams): one vector of per-word counts per sentence.
# Iterating over the tokenized documents directly replaces the manual
# index/while loop and its global counter `i`.
# NOTE: every vector is a dense float64 array of length vocab_size, so memory
# grows as (number of sentences) * vocab_size * 8 bytes.
count_vec = []
for tokens in tokenized_docset:
  count_arr = np.zeros(vocab_size)
  for w in tokens:
    # word_dic gives the slot of `w` inside the count vector.
    count_arr[word_dic[w]] += 1
  count_vec.append(count_arr)

In [None]:
# Stack the per-sentence count vectors into a single feature matrix
# (one row per sentence) and show its dimensions.
X = np.asarray(count_vec)
X.shape

(10337, 48270)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. A fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across
# Restart & Run All; without it every run shuffles differently.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
# Train a Multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(x_train, y_train)  # fit() returns the estimator itself
model

MultinomialNB()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the language of every held-out sentence.
y_pred = model.predict(x_test)

# Compare the predictions with the true labels.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy of Model :", accuracy * 100, "%")

Accuracy of Model : 97.29206963249516 %
