In [1]:
import pandas as pd
import json
import numpy as np

Read the jsonl file.

In [14]:
def open_jsonl(filename):
    """Load a JSON Lines file into a DataFrame, one row per JSON record.

    Parameters
    ----------
    filename : str or path-like
        Path to a UTF-8 encoded file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order. Blank lines are ignored.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(filename, encoding="utf8") as json_file:
        # Stream the file line by line rather than holding every raw line in
        # a list before parsing.
        for line in json_file:
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads would raise on them.
            if not line.strip():
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

In [15]:
# Labelled training sentences and the separate test sentences, both read
# from JSON Lines files one directory above the working directory.
train = open_jsonl("../train.jsonl")
test = open_jsonl("../test.jsonl")

Separate the training data into the text sentences (features) and the language label (target).

In [16]:
# X holds the raw sentences, y the language code of each sentence.
X, y = train['text'], train['lang']

Split the data into a training set (80%) and a held-out validation set (20%).

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled sentences for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is the same on every run
)

Convert the text to token counts using CountVectorizer.

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training sentences only and turn them into a
# token-count matrix in the same pass.
cv = CountVectorizer()
X_train_counts = cv.fit_transform(X_train)

In [19]:
# Show the fitted vocabulary as a one-row frame: one column per token, with the
# value being the index CountVectorizer assigned to that token.
# NOTE(review): this builds a frame with one column per vocabulary entry, which
# can be very wide and slow to construct — consider previewing a small sample
# of the mapping instead; confirm before changing the displayed output.
pd.DataFrame([cv.vocabulary_])

Unnamed: 0,وجہٴ,تسمیہ,یہ,ہے,کہ,مہارجٹ,برادری,کی,ایک,ذات,...,對稱性破缺終止了弱電統一,在低能量狀況,電磁力與弱力變得不一樣,因為傳遞弱力的w及z玻色子的非零質量分別為與,而傳遞電磁力的光子的質量為零,在高能量,w及z玻色子可以很容易地被製成,兩種力變得一樣,izmjenivačima,štetniji
0,1793124,1761198,1811339,1810522,1806047,1788928,1756954,1806141,1755506,1768583,...,1960520,1930099,2151449,1924465,2090015,1937048,1328612,1876539,521168,1391889


Calculate the TF-IDF weights from the token counts.

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and re-weight them in the
# same pass.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)

Train the model (Multinomial Naive Bayes).

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Build the classifier, then fit it on the TF-IDF features of the training split.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

### Testing using train_test_split

In [23]:
# Evaluate on the held-out split. Only transform() is called here, so the
# vocabulary and TF-IDF weighting fitted on the training split are reused
# rather than re-learned.
test_sentences = X_test
X_new_counts = cv.transform(test_sentences)
X_new_tfidf = tfidf.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)


In [24]:
from sklearn.metrics import classification_report

# The labels are already language-code strings, so the report names every row
# from the labels it finds in y_test / predicted. Do not pass target_names
# derived from y_train: target_names are paired with rows purely by position,
# so they mislabel rows (or raise ValueError) whenever the set of labels in
# y_test / predicted differs from the set in y_train.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

          af       0.99      1.00      1.00      3936
          az       0.99      1.00      0.99      3933
          bg       1.00      1.00      1.00      4016
          cs       0.99      1.00      1.00      3995
          da       0.99      1.00      0.99      3983
          de       0.99      1.00      0.99      3977
          el       1.00      1.00      1.00      3979
          en       0.89      1.00      0.94      4020
          es       0.99      1.00      1.00      3977
          fi       0.99      1.00      0.99      4039
          fr       0.86      1.00      0.92      3927
          hr       1.00      1.00      1.00      4028
          it       1.00      1.00      1.00      4055
          ko       1.00      1.00      1.00      4036
          nl       1.00      1.00      1.00      3970
          no       1.00      1.00      1.00      4016
          pl       1.00      1.00      1.00      3968
          ru       1.00    

### Testing using the test.jsonl

In [26]:
# Predict the language of every sentence in test.jsonl using the vectorizer,
# TF-IDF weighting and classifier fitted above.
# NOTE(review): this rebinds X_test and predicted, which previously held the
# held-out split from train_test_split and its predictions. Re-running the
# earlier evaluation cells after this one would pair these values with y_test.
X_test = test['text']
X_new_counts = cv.transform(X_test)
X_new_tfidf = tfidf.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)


In [33]:
# Side-by-side view of each test sentence and its predicted language code.
pd.concat([test['text'], pd.Series(predicted, name='predicted')], axis=1)

Unnamed: 0,text,predicted
0,Численность населения агломерации Парижа в 201...,ru
1,Наиболее выдающийся вклад в создание современн...,ru
2,С началом франко-прусской войны 1870—1871 годо...,ru
3,"Основное преимущество офсетных антенн в том, ч...",ru
4,"Выручка от реализации продукции, товаров, рабо...",ru
...,...,...
99133,Under den tyske invasionen af Norge i 1940 ble...,da
99134,Verdens ældste eksisterende marineinfanterikor...,da
99135,Ved folketingsvalget i 2011 var Lone Loklindt ...,da
99136,Omøgade er ca. 400 meter lang og er beliggende...,da
