In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Read the language-identification corpus into a DataFrame.
# Replace the file name below with your dataset path.
data = pd.read_csv(filepath_or_buffer="language_dataset.csv")


In [7]:
# Pull the raw sentences and their language tags out of the frame in one step.
Text, labels = data["Text"], data["Language"]

In [3]:
# Per-column count of missing values.
data.isna().sum()

Text        0
Language    0
dtype: int64

In [5]:
# Number of samples per language label, most frequent first.
data.loc[:, "Language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [10]:
import re

# Punctuation, digits and square brackets that are replaced by a space.
# The brackets are escaped inside the set on purpose: the earlier pattern
# r'[[]]' parsed as the set "[[]" followed by a literal "]", so it only
# matched the two-character sequence "[]" and left lone brackets untouched.
_NOISE_CHARS = re.compile(r'[!@#$(),"%^*?:;~`0-9\[\]]')


def clean_text(text):
    """Return `text` with noise characters replaced by spaces, lowercased.

    Parameters
    ----------
    text : str
        One raw sentence.

    Returns
    -------
    str
        The cleaned sentence; every matched character becomes one space.
    """
    return _NOISE_CHARS.sub(' ', text).lower()


# Cleaned copy of every sentence in `Text`, in the same order.
data_list = [clean_text(text) for text in Text]

In [None]:
# Rebind `Text` to the cleaned sentences so the train/test split below
# operates on the cleaned strings rather than the raw column.
# NOTE(review): this cell has no execution count in the saved notebook —
# confirm it is run before the split when reproducing the results.
Text = data_list

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    Text,
    labels,
    test_size=0.33,
    random_state=42,
)

In [15]:
# TF-IDF features: the vocabulary and IDF weights are learned from the
# training sentences only, then reused unchanged on the held-out sentences.
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(raw_documents=X_train)
X_test_features = vectorizer.transform(raw_documents=X_test)

In [16]:
# Baseline classifier: multinomial Naive Bayes on the TF-IDF features.
model = MultinomialNB()
model.fit(X=X_train_features, y=y_train)

In [17]:
# Predicted language for every held-out sentence (Naive Bayes baseline).
predictions = model.predict(X=X_test_features)

In [20]:
# Display the Naive Bayes predictions for the held-out sentences.
predictions

array(['Russian', 'Italian', 'English', ..., 'Arabic', 'English',
       'Italian'], dtype='<U10')

In [21]:
# Display the true labels of the held-out sentences for comparison.
y_test

6662       Russian
7362       Italian
765        English
6192       Russian
562        English
           ...    
2631    Portugeese
8570      Sweedish
9082        Arabic
401        English
7632       Italian
Name: Language, Length: 3412, dtype: object

In [22]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [23]:
# Confusion matrix for the Naive Bayes baseline.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[171   0   0  11   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 124   0   5   2   0   0   0   0   0   0   0   0   0   8   0   0]
 [  0   0 176   8   3   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 486   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2 336   0   0   0   1   0   0   0   0   1   0   0   0]
 [  0   0   0   6   1 140   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   6   0   0 106   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  10   0   0   0  10   0   0   0   0   0   0   0   0   0]
 [  0   0   0   5   2   0   0   0 228   0   0   0   0   2   0   0   0]
 [  0   0   0   6   0   0   0   0   0 103   0   0   0   0   0   0   0]
 [  0   0   0   5   0   0   0   0   0   0 195   0   0   0   0   0   0]
 [  0   0   0   5   0   0   0   0   0   0   0 231   0   5   0   0   0]
 [  0   0   0  15   0   0   0   0   0   0   0   0 202   0   0   0   0]
 [  0   0   0   5   1   0   0   0   0   0   0   1   0 262   0   0   0]
 [  0 

In [24]:
# Per-language precision / recall / F1 for the Naive Bayes baseline.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

      Arabic       1.00      0.94      0.97       182
      Danish       0.99      0.89      0.94       139
       Dutch       0.99      0.94      0.97       187
     English       0.80      1.00      0.89       486
      French       0.95      0.99      0.97       340
      German       1.00      0.95      0.98       147
       Greek       1.00      0.95      0.97       112
       Hindi       1.00      0.50      0.67        20
     Italian       1.00      0.96      0.98       237
     Kannada       1.00      0.94      0.97       109
   Malayalam       1.00      0.97      0.99       200
  Portugeese       1.00      0.96      0.98       241
     Russian       1.00      0.93      0.96       217
     Spanish       0.97      0.97      0.97       269
    Sweedish       0.96      0.96      0.96       215
       Tamil       1.00      0.98      0.99       155
     Turkish       1.00      0.80      0.89       156

    accuracy              

In [25]:
# Overall accuracy of the Naive Bayes baseline on the held-out set.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.9533997655334114


In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Second model: TF-IDF followed by a linear SVM, wrapped in a Pipeline so the
# classifier accepts raw strings directly.
# random_state is pinned (same seed as the train/test split) because
# LinearSVC's solver draws on a pseudo-random number generator; without a
# seed, repeated runs of this cell can give slightly different models.
lang_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC(random_state=42))])

lang_clf.fit(X_train, y_train)

In [31]:
# Pipeline predictions on the held-out sentences. The capitalised name keeps
# them separate from the Naive Bayes `predictions` computed earlier.
Predictions = lang_clf.predict(X=X_test)

In [32]:
# Confusion matrix for the TF-IDF + LinearSVC pipeline.
print(confusion_matrix(y_true=y_test, y_pred=Predictions))

[[177   0   0   0   0   0   0   0   0   0   0   0   4   0   0   0   1]
 [  0 129   0   0   1   1   0   0   1   0   0   0   2   0   5   0   0]
 [  0   0 179   0   1   0   0   0   0   0   0   0   4   3   0   0   0]
 [  0   0   0 480   0   0   0   0   0   0   0   0   5   1   0   0   0]
 [  0   0   1   0 334   0   0   0   1   0   0   0   3   1   0   0   0]
 [  0   1   0   0   0 143   0   0   0   0   0   0   3   0   0   0   0]
 [  0   0   0   1   0   0 107   0   0   0   0   0   4   0   0   0   0]
 [  0   0   0   0   0   0   0  20   0   0   0   0   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0 231   0   0   0   3   2   0   0   0]
 [  0   0   0   0   0   0   0   0   0 107   0   0   2   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 195   0   4   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0 231   5   4   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 217   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0   0   1   5 262   0   0   0]
 [  0 

In [33]:
# Per-language precision / recall / F1 for the TF-IDF + LinearSVC pipeline.
print(classification_report(y_true=y_test, y_pred=Predictions))

              precision    recall  f1-score   support

      Arabic       1.00      0.97      0.99       182
      Danish       0.96      0.93      0.95       139
       Dutch       0.99      0.96      0.98       187
     English       0.99      0.99      0.99       486
      French       0.99      0.98      0.98       340
      German       0.99      0.97      0.98       147
       Greek       1.00      0.96      0.98       112
       Hindi       1.00      1.00      1.00        20
     Italian       0.99      0.97      0.98       237
     Kannada       1.00      0.98      0.99       109
   Malayalam       1.00      0.97      0.99       200
  Portugeese       1.00      0.96      0.98       241
     Russian       0.79      1.00      0.88       217
     Spanish       0.96      0.97      0.97       269
    Sweedish       0.98      0.96      0.97       215
       Tamil       1.00      0.98      0.99       155
     Turkish       0.99      0.94      0.97       156

    accuracy              

In [34]:
# Overall accuracy of the TF-IDF + LinearSVC pipeline on the held-out set.
print(accuracy_score(y_true=y_test, y_pred=Predictions))

0.9721570926143025


In [35]:
# Spot check: an Arabic sentence passed straight to the pipeline.
lang_clf.predict(X=["مرحبا كيف حالك؟"])

array(['Arabic'], dtype=object)

In [37]:
# Spot check: an English sentence.
lang_clf.predict(X=["how are you"])

array(['English'], dtype=object)

In [39]:
# Spot check: a French sentence.
lang_clf.predict(X=["Comment vas-tu"])

array(['French'], dtype=object)