In [5]:
import pandas as pd

# Read the whole dataset from the CSV file (path relative to the notebook's
# working directory) into a DataFrame. Later cells read its "Text" and
# "Lang" columns.
df = pd.read_csv(filepath_or_buffer="all_data.csv")

In [6]:
# Bag-of-words (BOW): split the data, then count token occurrences in the training texts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["Lang"],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training texts only and encode those texts
# as a sparse document-term count matrix.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)

# Cell result: (number of training documents, vocabulary size).
X_train_count.shape

(7920, 45581)

In [7]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF. IDF is left enabled (the default) so
# that this matrix matches its name and the "tfidf" step of the pipeline;
# with use_idf=False the result would only be normalised term frequencies.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

# Cell result: same shape as the count matrix, since the transformer only
# rescales the stored values.
X_train_tfidf.shape

(7920, 45581)

In [8]:
# Pipeline: token counts -> TF-IDF weighting -> multinomial naive Bayes, fitted as one estimator
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Chain the three stages so that raw text goes in and a predicted language
# label comes out. The step names ("vect", "tfidf", "clf") are the keys under
# which each stage can be looked up on the fitted pipeline.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

# Fit every stage on the training split only; the test split stays unseen
# until evaluation.
text_clf.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# scikit-learn orders the report rows and the confusion-matrix axes by the
# `labels` argument (sorted unique labels when it is omitted), whereas
# `unique()` returns values in order of first appearance. Sort the categories
# and pass them explicitly so every displayed name is attached to the right
# class regardless of the row order of the CSV.
categories = sorted(df["Lang"].unique().tolist())

# Predict the test set results
y_pred = text_clf.predict(X_test)

# Print the classification report (one row per entry of `categories`)
print(classification_report(y_test, y_pred, labels=categories, target_names=categories))

# Print the confusion matrix (rows: true label, columns: predicted label,
# both in `categories` order)
print(confusion_matrix(y_test, y_pred, labels=categories))


              precision    recall  f1-score   support

         ARA       0.97      0.18      0.30       188
         CHI       0.72      0.41      0.52       194
         FRE       0.37      0.81      0.51       160
         GER       0.41      0.90      0.56       188
         HIN       0.54      0.71      0.61       177
         ITA       0.74      0.60      0.66       180
         JPN       0.84      0.28      0.43       186
         KOR       0.61      0.48      0.54       170
         SPA       0.55      0.40      0.46       184
         TEL       0.81      0.57      0.67       178
         TUR       0.50      0.65      0.56       175

    accuracy                           0.54      1980
   macro avg       0.64      0.55      0.53      1980
weighted avg       0.65      0.54      0.53      1980

[[ 34   7  33  34  15   5   3   6  17   5  29]
 [  0  80  31  38   5   1   3  12   3   0  21]
 [  0   1 129  19   0   1   1   0   7   0   2]
 [  0   1   8 169   1   1   0   0   6   0   2]