In [38]:
import numpy as np
import pandas as pd

import pickle
import multiprocessing
from datetime import datetime

from nltk.corpus import stopwords
from morfeusz2 import Morfeusz

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases
from gensim.utils import simple_preprocess

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
# reading DataFrame from pickle
with open('../data/newsData.pickle', 'rb') as pickleInput:
    articlesData = pickle.load(pickleInput)
    
articlesData.head()

Unnamed: 0_level_0,articleUrl,category,publicationTime,articleContent,articleLength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,https://www.rmf24.pl/fakty/polska/news-piontko...,kraj,2019-08-26T18:37:00,Szef MEN odniósł się do informacji podanych p...,2025
2,https://www.rmf24.pl/fakty/polska/news-rzeczni...,kraj,2019-08-26T17:47:28,"Jak podkreślił Müller, Janusz Wojciechowski to...",983
3,https://www.rmf24.pl/fakty/polska/news-kto-bed...,kraj,2019-08-26T17:20:00,Z nieoficjalnych rozmów reportera RMF FM z prz...,884
4,https://www.rmf24.pl/fakty/polska/news-mieszka...,kraj,2019-08-26T17:09:20,Powód to groźne dla zdrowia bakterie - enter...,583
5,https://www.rmf24.pl/fakty/polska/news-ponad-3...,kraj,2019-08-26T17:01:00,"Filip, Tymon, Zosia, Kaja, Malwina i Nela to p...",746


In [47]:
# article transformation/tokenisation
morf = Morfeusz()

def morfLemats(text, morf):
    analysis = morf.analyse(text)
    lemats = []
    lineCount = 0
    for line in analysis:
        if line[0] == lineCount:
            lineCount += 1
            lemats.append(line[2][1].split(":")[0])
            
    return lemats

def textProcess(mess):
    # remove punctuation
    mess = " ".join(simple_preprocess(mess))

    # change words to lemmats
    mess = morfLemats(mess, morf)
    
    # remove stopwords
    mess = [word for word in mess if word not in stopwords.words("polish")]
    
    return mess

In [52]:
articlesData["articleContent"] = articlesData.progress_apply(lambda article : textProcess(article["articleContent"]), axis = 1)

HBox(children=(IntProgress(value=0, max=12583), HTML(value='')))

In [77]:
articlePhrases = Phrases(tqdm(articlesData["articleContent"]), min_count=1, threshold=1)
articlesData["articleContent"] = articlesData.progress_apply(lambda article : articlePhrases[article["articleContent"]] + article["articleContent"], axis = 1)

HBox(children=(IntProgress(value=0, max=12583), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12583), HTML(value='')))

In [78]:
articlesTagged = articlesData.progress_apply(lambda article: TaggedDocument(words=article['articleContent'], 
                                                                            tags=[article["category"]]), axis=1)

HBox(children=(IntProgress(value=0, max=12583), HTML(value='')))

In [133]:
articlesTrain, articlesTest = train_test_split(articlesTagged, test_size=0.2)

In [134]:
cores = multiprocessing.cpu_count()

In [135]:
articlesModel = Doc2Vec(vector_size=300,min_count = 1, window_size = 10, sample = 0, workers = cores)
articlesModel.build_vocab([word for word in tqdm(articlesTrain.values)])

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

In [136]:
for epoch in range(20):
    articlesModel.train(tqdm(articlesTrain), total_examples=articlesModel.corpus_count, epochs=1)

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

In [137]:
def inferVectors(model, tagged_docs, phrases):
    sents = tqdm(tagged_docs.values)
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(phrases[doc.words], steps=20)) for doc in sents])
    return targets, regressors 

In [138]:
yTrain, xTrain = inferVectors(articlesModel, articlesTrain, articlePhrases)
yTest, xTest = inferVectors(articlesModel, articlesTest, articlePhrases)

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2517), HTML(value='')))

In [139]:
clsModels = [LogisticRegression(n_jobs = cores,  C = 0.001, solver = "sag", multi_class = "auto"), 
            RandomForestClassifier(n_estimators = 1000, n_jobs = -1), 
             SVC(C = 1e5, gamma = "auto")]

In [140]:
for clsModel in tqdm(clsModels):
    print(type(clsModel).__name__)
    
    funcTime = datetime.now()
    clsModel.fit(xTrain, yTrain)
    print("Training took: ", datetime.now() - funcTime)
    
    funcTime = datetime.now()
    yPred = clsModel.predict(xTest)
    print("Prediction took: ", datetime.now() - funcTime)
    
    print("Accuracy: ", accuracy_score(yTest, yPred))
    print()
    print("Confusion matrix:\n", confusion_matrix(yTest, yPred))
    print()
    print("Classification report:\n", classification_report(yTest, yPred))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

LogisticRegression
Training took:  0:00:06.462138
Prediction took:  0:00:00.264695
Accuracy:  0.8263806118394914

Confusion matrix:
 [[345  17   0   3   1   4  16]
 [ 21 333   5   8   2   3  12]
 [  3  13 304   5  37   3   3]
 [ 12   9   7 337   7   3   9]
 [  4  10  53   5  70   6  10]
 [  3  11   1   2   5 382   8]
 [ 19  60   6   8  13  10 309]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.85      0.89      0.87       386
        kraj       0.74      0.87      0.80       384
     kultura       0.81      0.83      0.82       368
       nauka       0.92      0.88      0.90       384
    rozrywka       0.52      0.44      0.48       158
       sport       0.93      0.93      0.93       412
       świat       0.84      0.73      0.78       425

   micro avg       0.83      0.83      0.83      2517
   macro avg       0.80      0.79      0.80      2517
weighted avg       0.83      0.83      0.82      2517

RandomForestClassifier
Trai

In [141]:
newText = """
Jak popularyzować wiedzę o kosmosie? Skąd brać na to pieniądze? Między innymi o tym rozmawiali w Olsztynie pasjonaci astronomii.

– Zainteresowanie kosmosem jest wciągające – uważa profesor Marek Sarna z Polskiego Towarzystwa Astronomicznego.
– Space Forum 2019 zgromadziło organizacje i osoby, które starają się zarazić innych swoją pasją związaną z poznawaniem wszechświata – mówi organizator wydarzenia Robert Szaj z Fundacji Nicolaus Copernicus.
Podczas Forum można m.in. poznać zasady pisania wniosków o dofinansowania na projekty związane z popularyzacją wiedzy o kosmosie, omawiane są metody współpracy z mediami i przedstawiane najciekawsze wydarzenia upowszechniające astronomię w Polsce.

W poniedziałek rozpocznie się 39. Zjazd Polskiego Towarzystwa Astronomicznego, który odbywa się w tym roku w Olsztynie. Space Forum organizują Fundacja Nicolaus Copernicus i Fundacja Winnice Północy.
"""

In [142]:
clsModel.predict([articlesModel.infer_vector(articlePhrases[textProcess(newText)], steps=20)])

array(['nauka'], dtype='<U8')

In [143]:
# Not bad! But does it get the context?

In [144]:
articlesModel.wv.most_similar(articlePhrases[["Jarosław"]])

[('Leszek', 0.7429794073104858),
 ('Sławomir', 0.7065836191177368),
 ('Jarosława', 0.6906055212020874),
 ('Mirosław', 0.675957441329956),
 ('Aleksander', 0.6708194017410278),
 ('kołtun', 0.6673728227615356),
 ('Jakubik', 0.66184401512146),
 ('PiS_Ryszard', 0.6575844287872314),
 ('Jakub', 0.6574205756187439),
 ('Henryk', 0.6567180752754211)]

In [145]:
articlesModel.wv.most_similar(articlePhrases[["Jarosław", "Kaczyński"]])

[('klub_Kukiz', 0.8389977216720581),
 ('PSL_Władysław', 0.8230854868888855),
 ('podkreślać_zabójca', 0.8207553625106812),
 ('marszałek_sejm', 0.8206610083580017),
 ('prezes_PiS', 0.8199045658111572),
 ('marek_Kuchciński', 0.8197516202926636),
 ('Robert_Biedroń', 0.8171521425247192),
 ('Kidawa_błoński', 0.813496470451355),
 ('Andrzej_duda', 0.812714159488678),
 ('premier_europosłanka', 0.8105580806732178)]