In [1]:
import numpy as np
import pandas as pd

import pickle
import multiprocessing
from datetime import datetime

from nltk.corpus import stopwords
from morfeusz2 import Morfeusz

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
# reading DataFrame from pickle
with open('../data/newsData.pickle', 'rb') as pickleInput:
    articlesData = pickle.load(pickleInput)
    
articlesData.head()

Unnamed: 0_level_0,articleUrl,category,publicationTime,articleContent,articleLength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,https://www.rmf24.pl/fakty/polska/news-piontko...,kraj,2019-08-26T18:37:00,Szef MEN odniósł się do informacji podanych p...,2025
2,https://www.rmf24.pl/fakty/polska/news-rzeczni...,kraj,2019-08-26T17:47:28,"Jak podkreślił Müller, Janusz Wojciechowski to...",983
3,https://www.rmf24.pl/fakty/polska/news-kto-bed...,kraj,2019-08-26T17:20:00,Z nieoficjalnych rozmów reportera RMF FM z prz...,884
4,https://www.rmf24.pl/fakty/polska/news-mieszka...,kraj,2019-08-26T17:09:20,Powód to groźne dla zdrowia bakterie - enter...,583
5,https://www.rmf24.pl/fakty/polska/news-ponad-3...,kraj,2019-08-26T17:01:00,"Filip, Tymon, Zosia, Kaja, Malwina i Nela to p...",746


In [4]:
# article transformation/tokenisation
morf = Morfeusz()

def morfLemats(text, morf):
    analysis = morf.analyse(text)
    lemats = []
    lineCount = 0
    for line in analysis:
        if line[0] == lineCount:
            lineCount += 1
            lemats.append(line[2][1].split(":")[0])
            
    return lemats

def textProcess(mess):
    # remove punctuation
    mess = " ".join(simple_preprocess(mess))

    # change words to lemmats
    mess = morfLemats(mess, morf)
    
    # remove stopwords
    mess = [word for word in mess if word not in stopwords.words("polish")]
    
    return mess

In [5]:
articlesTagged = articlesData.progress_apply(lambda article: TaggedDocument(words=textProcess(article['articleContent']), 
                                                                            tags=[article["category"]]), axis=1)

HBox(children=(IntProgress(value=0, max=12583), HTML(value='')))




In [6]:
articlesTrain, articlesTest = train_test_split(articlesTagged, test_size=0.2)

In [7]:
cores = multiprocessing.cpu_count()

In [8]:
articlesModel = Doc2Vec(vector_size=300, min_count = 1, sample = 0, workers = cores)
articlesModel.build_vocab([word for word in tqdm(articlesTrain.values)])

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




In [9]:
for epoch in range(20):
    articlesModel.train(tqdm(articlesTrain), total_examples=articlesModel.corpus_count, epochs=1)

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




In [10]:
def inferVectors(model, tagged_docs):
    sents = tqdm(tagged_docs.values)
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors 

In [11]:
yTrain, xTrain = inferVectors(articlesModel, articlesTrain)
yTest, xTest = inferVectors(articlesModel, articlesTest)

HBox(children=(IntProgress(value=0, max=10066), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2517), HTML(value='')))




In [20]:
clsModels = [LogisticRegression(n_jobs = cores,  C = 0.001, solver = "sag", multi_class = "auto"), 
            RandomForestClassifier(n_estimators = 1000, n_jobs = -1), 
             SVC(C = 1e5, gamma = "auto")]

In [21]:
for clsModel in tqdm(clsModels):
    print(type(clsModel).__name__)
    
    funcTime = datetime.now()
    clsModel.fit(xTrain, yTrain)
    print("Training took: ", datetime.now() - funcTime)
    
    funcTime = datetime.now()
    yPred = clsModel.predict(xTest)
    print("Prediction took: ", datetime.now() - funcTime)
    
    print("Accuracy: ", accuracy_score(yTest, yPred))
    print()
    print("Confusion matrix:\n", confusion_matrix(yTest, yPred))
    print()
    print("Classification report:\n", classification_report(yTest, yPred))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

LogisticRegression
Training took:  0:00:01.337106
Prediction took:  0:00:00.008674
Accuracy:  0.8112832737385777

Confusion matrix:
 [[343  34   4   1   2   3  24]
 [ 15 308   6   3   4   4  17]
 [  7  18 348   5  26   1   8]
 [  8  16   8 311  13   3  20]
 [  6  11  49   5  56  11  15]
 [  3  20   0   2   5 377  14]
 [ 13  48   4   6   6   7 299]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.87      0.83      0.85       411
        kraj       0.68      0.86      0.76       357
     kultura       0.83      0.84      0.84       413
       nauka       0.93      0.82      0.87       379
    rozrywka       0.50      0.37      0.42       153
       sport       0.93      0.90      0.91       421
       świat       0.75      0.78      0.77       383

   micro avg       0.81      0.81      0.81      2517
   macro avg       0.78      0.77      0.77      2517
weighted avg       0.82      0.81      0.81      2517

RandomForestClassifier
Trai

In [32]:
newText = """W Chojnicach Stomil chciał się zrehabilitować za ubiegłotygodniową porażkę z Zagłębiem Sosnowiec, no i udało się to mu w 100 procentach!
Reklama
* Chojniczanka Chojnice - Stomil Olsztyn 1:3 (1:1)
1:0 - Krystian Wachowiak (12), 1:1 - Grzegorz Lech (29), 1:2 - Artur Siemaszko (70), 1:3 - Oktawian Skrzecz (90)

Chojniczanka: Radosław Janukiewicz - Jan Mudra, Seweryn Michalski, Hubert Wołąkiewicz, Kamil Sylwestrzak - Aghwan Papikjan, Michal Obročník, Bartosz Wolski (82 Toni Conejo), Emil Drozdowicz, Krystian Wachowiak (68 Mateusz Żukowski) - Tomasz Mikołajczak (56 Mateusz Kuzimski).
Stomil: Piotr Skiba - Janusz Bucholc, Wiktor Biedrzycki, Ołeh Tarasenko, Lukáš Kubáň - Bartłomiej Niedziela (46 Oktawian Skrzecz), Maciej Pałaszewski, Wojciech Hajda, Grzegorz Lech (76 Waldemar Gancarczyk), Artur Siemaszko (88 Mateusz Gancarczyk) - Szymon Sobczak.

Tydzień temu olsztynianie wygrywali 1:0, by ostatecznie przegrać 1:3, a tym razem role się odwróciły, bo Chojniczanka od 12. min prowadziła 1:0, ale na koniec z wygranej 3:1 cieszył się Stomil.
Warto zauważyć, że wynik meczu ustalił Oktawian Skrzecz, ostatni nabytek olsztyńskiego pierwszoligowca. Przypominamy, że Skrzecz jest wychowankiem Lechii Gdańsk, występował w juniorskich zespołach Schalke 04 Gelsenkirchen, a w sezonie 2017/18 zagrał w 24 spotkaniach I ligi w barwach GKS Katowice. Do Stomilu został wypożyczony z Korony Kielce, w której w tym sezonie dwa razy wystąpił na boiskach Ekstraklasy. """

In [33]:
clsModel.predict([articlesModel.infer_vector(textProcess(newText), steps=20)])

array(['sport'], dtype='<U8')

In [None]:
:D 