# <span style="background-color: #EB824F; padding: 10px"><strong>        Language Detection     </strong></span>

# import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# <span style="background-color: #FFFFFF; padding: 10px"><strong>**LOADING DATASET**</strong></span>

In [2]:
data = pd.read_csv('dataset.csv')
data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


# 1. Information about dataset

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      22000 non-null  object
 1   language  22000 non-null  object
dtypes: object(2)
memory usage: 343.9+ KB


#  **Observations:**
    
1. Dataset contains 22000 rows and 2 columns.
2. There are 2 columns which are is object datatype.

#  2. check null values

In [4]:
#first check null values
data.isnull().sum().to_frame().rename(columns={0:'No.of null values'})

Unnamed: 0,No.of null values
Text,0
language,0


# 3. count total languages in data

In [5]:
data["language"].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

#  **Observations:**
1. Total 22 Languages in dataset.
2. Each language have 1000 text.

# 4. To check languages Text

In [6]:
data["Text"]

0        klement gottwaldi surnukeha palsameeriti ning ...
1        sebes joseph pereira thomas  på eng the jesuit...
2        ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...
3        விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...
4        de spons behoort tot het geslacht haliclona en...
                               ...                        
21995    hors du terrain les années  et  sont des année...
21996    ใน พศ  หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...
21997    con motivo de la celebración del septuagésimoq...
21998    年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...
21999     aprilie sonda spațială messenger a nasa și-a ...
Name: Text, Length: 22000, dtype: object

# <span style="background-color: #EB824F; padding: 10px"><strong>**Separating Indepenent (x) and Dependent Variable (y)**</strong></span>

In [7]:
#to separate input and output variable
x = np.array(data["Text"])
y = np.array(data["language"])

# <span style="background-color: #EB824F; padding: 10px"><strong>**Splitting Data for Model Training and testing:**</strong></span>

In [8]:
#CountVectorizer is a great tool provided by the scikit-learn library in Python. 
#It is used to transform a given text into a vector on the basis of the frequency (count) 
#of each word that occurs in the entire text. 
#This is helpful when we have multiple such texts, and 
#we wish to convert each word in each text into vectors (for using in further text analysis).
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=42)

# <span style="background-color: #EB824F; padding: 10px"><strong>**MODEL BUILDING:**</strong></span>

# 1. Multinomial Naïve Bayes algorithm 

In [9]:
#multiclass classification, so I will be using the Multinomial Naïve Bayes algorithm 
#to train the language detection model as this algorithm always performs very well on the problems
#based on multiclass classification:
model = MultinomialNB()
model.fit(X_train,y_train)
Y_pred=model.predict(X_test)

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print(classification_report(y_test,Y_pred))
#confusion matrix
print(confusion_matrix(y_test,Y_pred))
model.score(X_test,y_test)

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       341
     Chinese       0.93      0.50      0.65       317
       Dutch       0.99      0.99      0.99       346
     English       0.70      1.00      0.82       333
    Estonian       0.99      0.96      0.97       338
      French       0.95      0.99      0.97       324
       Hindi       1.00      0.98      0.99       341
  Indonesian       0.99      0.97      0.98       318
    Japanese       0.69      0.87      0.77       328
      Korean       1.00      0.98      0.99       325
       Latin       0.99      0.91      0.95       346
     Persian       1.00      1.00      1.00       323
   Portugese       1.00      0.96      0.98       318
      Pushto       1.00      0.97      0.98       328
    Romanian       0.99      0.98      0.99       324
     Russian       0.99      0.99      0.99       332
     Spanish       0.97      0.99      0.98       322
     Swedish       0.99    

0.953168044077135

# Multinomial Naïve Bayes algorithm  is our final model as its accuracy is good which is 95% Accuracy Score

# check the model using user input text

In [10]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = model.predict(data)
print(output)

Enter a Text: sertyghgvh
['Chinese']


# Deployment of model using Tkinter windows based application Library of python

In [11]:
# Create the main window
import tkinter as tk


def language():
    data=(entry_data.get())
     #To change list in numpy array (2D)
    import numpy as np
    data=np.array([data]) #[[]] means 2D
    #Apply StandardScaler n input exp
    X=cv.transform(data)
    
    try:
        prediction = model.predict(X)[0]
        result_label.config(font =("Courier", 14),text=f"Detected Language is {prediction}")
    except Exception as e:
        result_label.config(text="Error: " + str(e))




def reset_fields():
    entry_data.delete(0, tk.END)

    
    
root = tk.Tk()
root.geometry("600x500")
root.title("Language Detection")




# Create labels and entry widgets for user input
label_data = tk.Label(root, text="Enter a Text ",width=500)
label_data.config(font =("Courier", 14))
label_data.pack(pady=20)

entry_data = tk.Entry(root,width=60)
entry_data.pack(pady=10)


# Create a button to trigger the prediction
predict_button = tk.Button(root, text="Detect Language ",bg="yellow", command=language,width=20, height=2)
predict_button.pack(pady=20)


# Create a button to reset the fields
reset_button = tk.Button(root, text="Reset", bg="red",command=reset_fields,width=10, height=1)
reset_button.pack(pady=10)

# Create a label to display the prediction result
result_label = tk.Label(root, text="")
result_label.pack(pady=30)



root.mainloop()