In [100]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [15]:
def readtags(doc):
    tree = ET.parse(doc)
    root = tree.getroot()
    name = doc.replace('.xml','')
    name = []
    for description in root.iter('seg'):
        p = description.text
        p=p.replace('\t','')
        p=p.replace('\n','')
        p = [p]
        name.append(p)
    return name

def readother(doc):
    tree = ET.parse(doc)
    root = tree.getroot()
    name = doc.replace('.xml','')
    name = []
    for description in root.iter('seg'):
        p = [description.text]
        name.append(p)
    return name

In [16]:
chinese = readtags('Chinese.xml')
english = readtags('English.xml')
arabic = readtags('Arabic.xml')

In [17]:
for i in range(len(english)):
    english[i] = [word.lower() for word in english[i] if re.match('^[a-zA-Z]+', word)]
    english[i] = (' ').join(english[i])
    english[i] = [(re.sub(r'[^\w\s]','',english[i]))]
    english[i] = (' ').join(english[i])
df = pd.DataFrame()
lan = ["English"] * len(english)
data_1 = pd.DataFrame({"Text": english,"Language": lan})
df.append(data_1)
data_1.head(1)
for i in range(len(arabic)):
    arabic[i] = (' ').join(arabic[i]).replace('.','')
df = pd.DataFrame()
lan = ["Arabic"] * len(arabic)
data_2 = pd.DataFrame({"Text": arabic,"Language": lan})
df.append(data_2)
data_2.head(1)
for i in range(len(chinese)):
    chinese[i] = ('').join(chinese[i])
    chinese[i] = (re.sub(r'[^\w\s]','',chinese[i]))
df = pd.DataFrame()
lan = ["Chinese"] * len(chinese)
data_5 = pd.DataFrame({"Text": chinese,"Language": lan})
df.append(data_5)
data_5.head(1)
data = pd.concat([data_1, data_2,data_5], ignore_index=True)

In [23]:
data = shuffle(data)
data.head()

Unnamed: 0,Language,Text
25748,English,and he said unto him well thou good servant be...
28455,English,it is reported commonly that there is fornicat...
63061,Chinese,拉 班對 他 說 我 若 在 你 眼前 蒙恩 請 你 仍與 我 同住 因為 我 已 算...
42033,Arabic,وكانت ايضا حرب مع الفلسطينيين فقتل الحانان بن ...
34294,Arabic,وان اضطجع معها رجل فكان طمثها عليه يكون نجسا س...


# Bag of words based Probabilistic model

In [148]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train = data['Text'][:9000]
test =  data['Text'][3305:]
X_train_counts = count_vect.fit_transform(train)
test = count_vect.transform(test)
X_train_counts.shape

(9000, 28450)

In [149]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_counts, data['Language'][:9000])

In [150]:
predicted = clf.predict(test)
np.mean(predicted == data['Language'][3305:])

0.9980666666666667

In [151]:
s = count_vect.transform(["god is"])

### Probability that the text is ARABIC, CHINESE, ENGLISH

In [153]:
clf.predict_proba(s)

array([[7.68671446e-06, 1.32082217e-05, 9.99979105e-01]])

### If i use a combination of both English and Arabic we can see that the probability of the text being arabic increases whereas the probability if ut being english decreases

In [154]:
s = count_vect.transform(["god is الله"])

- English = 99 %
- Arabic = 0.002 %

In [155]:
clf.predict_proba(s)

array([[2.36333889e-03, 2.50012128e-05, 9.97611660e-01]])

## Now that I use all the three languages the same number of times, the probabilites are comparable 

In [156]:
s = count_vect.transform(["god 神 الله"])

- English = 44 %
- Arabic = 55 %
- Chinese = 0.0005 %

In [157]:
clf.predict_proba(s)

array([[0.50724477, 0.00411136, 0.48864387]])

Chinese predictions are weak

In [158]:
s = count_vect.transform(["拉 班對"])
clf.predict_proba(s)

array([[0.23112025, 0.6085612 , 0.16031855]])

# TFIDF based Probabilistic model

In [140]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(9000, 28450)

In [141]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, data['Language'][:9000])

In [143]:
s = count_vect.transform(["god is"])

In [146]:
clf.predict_proba(s)

array([[2.12596145e-04, 2.39232286e-04, 9.99548172e-01]])

- English = 99.99 %
- Arabic = 0.0008 %

In [127]:
s = count_vect.transform(["god is الله"])
clf.predict_proba(s)c

array([[8.04955296e-04, 3.17871283e-06, 9.99191866e-01]])

- English = 60.302607 %
- Arabic = 39.556258 %
- Chinese = 0.141135 %

In [128]:
s = count_vect.transform(["god 神 الله"])
clf.predict_proba(s)

array([[0.39556258, 0.00141135, 0.60302607]])

In [131]:
s = count_vect.transform(["拉"])
clf.predict_proba(s)

array([[0.33333691, 0.33332619, 0.33333691]])

## Chinese Probability is weak in both models