### Import Libraries

In [68]:
import pandas as pd 

#### Load data

In [69]:
lang_data = pd.read_csv("Language Detection.csv")
lang_data

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [70]:
lang_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


#### Data Preprocessing

In [71]:
lang_data["Text"] = lang_data["Text"].str.replace('r[^a-zA-Z\s]', '', regex = True).str.lower()
lang_data["Text"]

0         nature, in the broadest sense, is the natural...
1        "nature" can refer to the phenomena of the phy...
2        the study of nature is a large, if not the onl...
3        although humans are part of nature, human acti...
4        [1] the word nature is borrowed from the old f...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

#### Train Dataset

In [72]:
# Convert text into numbers...
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer()

Transform = CV.fit_transform(lang_data["Text"]).toarray()
print(Transform, "\n")

print(transform.shape)
len(lang_data["Text"])


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 

(10337, 40077)


10337

In [73]:
from sklearn.model_selection import train_test_split

X = Transform
y = lang_data["Language"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)

In [74]:
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

Training samples: 6202
Testing samples: 4135


In [75]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [76]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
NB = model.fit(X_train, y_train)
NB


In [77]:
y_pred = model.predict(X_test)
y_pred

array(['Russian', 'Italian', 'English', ..., 'English', 'Turkish',
       'Dutch'], dtype='<U10')

In [80]:
from sklearn.metrics import accuracy_score, f1_score, precision_score,recall_score
y_pred = model.predict(X_test)

# Accuracy 
Accuracy = accuracy_score(y_test, y_pred)
print('Acurracy: ', Accuracy.round(2))

# F1 
F1 = f1_score(y_test, y_pred, average = 'weighted')
print('F1 Score: ', F1.round(2))

# Precision
Precision = precision_score(y_test, y_pred, average = 'weighted')
print('Precision: ', Precision.round(2))

# Recall
Recall = recall_score(y_test, y_pred, average = 'weighted')
print('Recall: ', Recall.round(2))

Acurracy:  0.98
F1 Score:  0.98
Precision:  0.98
Recall:  0.98


In [105]:
sample_text = "الذكاء الاصطناعي لديه القدرة على إحداث ثورة في العديد من القطاعات، من الرعاية الصحية إلى التعليم، من خلال أتمتة المهام وتقديم حلول أكثر فعالية."
sample_text_tfidf = CV.transform([sample_text])
predicted_language = model.predict(sample_text_tfidf)
print("Detected Language:", predicted_language[0])


Detected Language: Arabic


In [109]:
import joblib
joblib.dump(NB, 'Language_detector.pkl')
joblib.dump(CV, 'CountVectorizer.pkl')

['CountVectorizer.pkl']

In [None]:
lang_data = pd.read_csv("Language Detection.csv")
lang_data

lang_data.info()

lang_data["Text"] = lang_data["Text"].str.replace('r[^a-zA-Z\s]', '', regex = True).str.lower()
lang_data["Text"]

# Convert text into numbers...
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer()

Transform = CV.fit_transform(lang_data["Text"]).toarray()
print(Transform, "\n")

print(transform.shape)
len(lang_data["Text"])

from sklearn.model_selection import train_test_split

X = Transform
y = lang_data["Language"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)


from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
NB = model.fit(X_train, y_train)
NB

y_pred = model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score, f1_score, precision_score,recall_score
y_pred = model.predict(X_test)

# Accuracy 
Accuracy = accuracy_score(y_test, y_pred)
print('Acurracy: ', Accuracy.round(2))

# F1 
F1 = f1_score(y_test, y_pred, average = 'weighted')
print('F1 Score: ', F1.round(2))

# Precision
Precision = precision_score(y_test, y_pred, average = 'weighted')
print('Precision: ', Precision.round(2))

# Recall
Recall = recall_score(y_test, y_pred, average = 'weighted')
print('Recall: ', Recall.round(2))