In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from morfeusz2 import Morfeusz

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from datetime import datetime

In [2]:
# reading DataFrame from pickle
with open('../data/newsData.pickle', 'rb') as pickleInput:
    articlesData = pickle.load(pickleInput)
    
articlesData.head()

Unnamed: 0_level_0,articleUrl,category,publicationTime,articleContent,articleLength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,https://www.rmf24.pl/fakty/polska/news-piontko...,kraj,2019-08-26T18:37:00,Szef MEN odniósł się do informacji podanych p...,2025
2,https://www.rmf24.pl/fakty/polska/news-rzeczni...,kraj,2019-08-26T17:47:28,"Jak podkreślił Müller, Janusz Wojciechowski to...",983
3,https://www.rmf24.pl/fakty/polska/news-kto-bed...,kraj,2019-08-26T17:20:00,Z nieoficjalnych rozmów reportera RMF FM z prz...,884
4,https://www.rmf24.pl/fakty/polska/news-mieszka...,kraj,2019-08-26T17:09:20,Powód to groźne dla zdrowia bakterie - enter...,583
5,https://www.rmf24.pl/fakty/polska/news-ponad-3...,kraj,2019-08-26T17:01:00,"Filip, Tymon, Zosia, Kaja, Malwina i Nela to p...",746


In [3]:
# article transformations
morf = Morfeusz()

def morfLemats(text, morf):
    analysis = morf.analyse(text)
    lemats = []
    lineCount = 0
    for line in analysis:
        if line[0] == lineCount:
            lineCount += 1
            lemats.append(line[2][1].split(":")[0])
            
    return lemats

def textProcess(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Change words to lemmats
    3. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # remove all punctuation
    mess = " ".join(RegexpTokenizer(r'\w+').tokenize(mess))

    # change words to lemmats
    mess = morfLemats(mess, morf)
    
    # Now just remove any stopwords
    mess = [word for word in mess if word not in stopwords.words("polish")]
    
    return " ".join(mess)

In [4]:
articlesData["articleContent"] = articlesData["articleContent"].apply(textProcess)

# training and testing set
xTrain, xTest, yTrain, yTest = train_test_split(articlesData["articleContent"], 
                                                    articlesData["category"], 
                                                    test_size = 0.2)

In [5]:
def testPipeline(pipeline):
    print(pipeline)
    print()
    
    funcTime = datetime.now()
    pipeline.fit(xTrain, yTrain)
    print("Training took: ", datetime.now() - funcTime)
    
    funcTime = datetime.now()
    yPred = pipeline.predict(xTest)
    print("Prediction took: ", datetime.now() - funcTime)
    
    print("Accuracy: ", accuracy_score(yTest, yPred))
    print()
    print("Confusion matrix:\n", confusion_matrix(yTest, yPred))
    print()
    print("Classification report:\n", classification_report(yTest, yPred))

In [6]:
# building simple vectorizer (bag of words) + classifier
for classifier in [MultinomialNB(), 
                   RandomForestClassifier(n_estimators = 200, n_jobs = -1 ), 
                   SVC(C = 1)]:
    
    pipeline = Pipeline([("BOW", CountVectorizer()), 
                         ("cls", classifier) 
                        ])
    
    testPipeline(pipeline)

Pipeline(memory=None,
     steps=[('BOW', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

Training took:  0:00:01.985190
Prediction took:  0:00:00.495949
Accuracy:  0.8299562971791815

Confusion matrix:
 [[339   9   2   6   1   7  12]
 [ 39 348  10  14   3   3  16]
 [  6   6 352   6   9   1   9]
 [  7   2  12 363   1   3   2]
 [  4  14  71  12  25  17  12]
 [  2  10   2   3   1 372   6]
 [ 23  38   7  10   1   9 290]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.81      0.90      0.85       376
        kraj       0.81      0.80      0.81 



Training took:  0:03:09.454951
Prediction took:  0:00:27.767245
Accuracy:  0.36829558998808104

Confusion matrix:
 [[100   0   0   2   0   2 272]
 [ 14  15   2   2   0   2 398]
 [  3   1 109   2   0   0 274]
 [  2   2   0 156   0   0 230]
 [  0   1  11   2   0   0 141]
 [  4   0   0   1   0 176 215]
 [  3   1   0   0   0   3 371]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.79      0.27      0.40       376
        kraj       0.75      0.03      0.07       433
     kultura       0.89      0.28      0.43       389
       nauka       0.95      0.40      0.56       390
    rozrywka       0.00      0.00      0.00       155
       sport       0.96      0.44      0.61       396
       świat       0.20      0.98      0.33       378

   micro avg       0.37      0.37      0.37      2517
   macro avg       0.65      0.34      0.34      2517
weighted avg       0.71      0.37      0.37      2517



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
# version with TF-IDF
for classifier in [MultinomialNB(), 
                   RandomForestClassifier(n_estimators = 200, n_jobs = -1 ), 
                   SVC(C = 1)]:
    
    pipeline = Pipeline([("BOW", CountVectorizer(analyzer = textProcess)), 
                         ("TF-IDF", TfidfTransformer()),
                         ("cls", classifier) 
                        ])
    
    testPipeline(pipeline)

Pipeline(memory=None,
     steps=[('BOW', CountVectorizer(analyzer=<function textProcess at 0x7fbcaf78d730>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=Non...inear_tf=False, use_idf=True)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

Training took:  0:05:22.378218
Prediction took:  0:01:20.170866
Accuracy:  0.3635280095351609

Confusion matrix:
 [[230  19   3   0   0  26  98]
 [135 154   7   0   0  24 113]
 [ 64  19  84   0   0  73 149]
 [ 92  33   7   0   0   9 249]
 [ 15  13  32   0   0  23  72]
 [ 60  19   2   0   0 200 115]
 [ 78  32   2   1   0  18 247]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.34      0.61      0.44       376
        kraj       0.53      0.36      0.43       433
     kultura       0.61      0.22      0.32    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Training took:  0:05:32.921631
Prediction took:  0:01:20.143461
Accuracy:  0.5915772745331744

Confusion matrix:
 [[207  48  12  29   0  30  50]
 [ 52 302  11  16   1  16  35]
 [ 14  35 234  20  17  43  26]
 [ 34  42  20 247   2  19  26]
 [ 10  24  57  11  10  24  19]
 [  9  28  21  11   2 298  27]
 [ 39  85  18  22   0  23 191]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.57      0.55      0.56       376
        kraj       0.54      0.70      0.61       433
     kultura       0.63      0.60      0.61       389
       nauka       0.69      0.63      0.66       390
    rozrywka       0.31      0.06      0.11       155
       sport       0.66      0.75      0.70       396
       świat       0.51      0.51      0.51       378

   micro avg       0.59      0.59      0.59      2517
   macro avg       0.56      0.54      0.54      2517
weighted avg       0.58      0.59      0.58      2517

Pipeline(memory=None,
     steps=[('BOW', Coun



Training took:  0:05:57.158683
Prediction took:  0:01:25.700916
Accuracy:  0.14938418752483115

Confusion matrix:
 [[376   0   0   0   0   0   0]
 [433   0   0   0   0   0   0]
 [389   0   0   0   0   0   0]
 [390   0   0   0   0   0   0]
 [155   0   0   0   0   0   0]
 [396   0   0   0   0   0   0]
 [378   0   0   0   0   0   0]]

Classification report:
               precision    recall  f1-score   support

    ekonomia       0.15      1.00      0.26       376
        kraj       0.00      0.00      0.00       433
     kultura       0.00      0.00      0.00       389
       nauka       0.00      0.00      0.00       390
    rozrywka       0.00      0.00      0.00       155
       sport       0.00      0.00      0.00       396
       świat       0.00      0.00      0.00       378

   micro avg       0.15      0.15      0.15      2517
   macro avg       0.02      0.14      0.04      2517
weighted avg       0.02      0.15      0.04      2517



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
#TODO Word2Vec representation ;)