# Import

In [17]:
import os
import sys
import dill
# Make the parent of the current working directory importable so that the
# project-local ``Import`` package can be found from this notebook.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)


# Project modules that are dropped from the module cache so the imports
# that follow pick up the current source instead of a stale cached copy.
ownModules = [
    'Import.CorpusImporter',
    'Import.Preprocessor',
    'Import.LinguisticVectorizer',
    'Import.NamedEntityVectorizer',
    'Import.CrawlerImporter',
]

for module in ownModules:
    # pop() with a default is a no-op when the module was never imported.
    sys.modules.pop(module, None)
    
from Import.CorpusImporter import CorpusImporter
from Import.CrawlerImporter import CrawlerImporter
from Import.Preprocessor import Preprocessor
from Import.LinguisticVectorizer import LinguisticVectorizer
from Import.NamedEntityVectorizer import NamedEntityVectorizer

# Pickle Settings

In [2]:
# On-disk caches, relative to the notebook's working directory:
# the fitted ensemble classifier and the imported article collection.
eclf_file, corpus_file = './eclf.pickle', './corpus.pickle'

# Corpus Import

In [12]:
from sklearn.model_selection import train_test_split

# Load the article collection from the dill cache when it exists; otherwise
# crawl the NYT corpus and write the cache for the next run.
if os.path.isfile(corpus_file):
    # NOTE(review): dill.load can execute arbitrary code; only load cache
    # files that were produced by this notebook.
    with open(corpus_file, 'rb') as handle:
        Collection = dill.load(handle)
else:
    corpus = CorpusImporter()
    corpus.clearMemory()
    corpus.crawlNYT(per_tag=1001, is_multilabel=False, nytPaths = ["2007","2006","2005","2004","2003","2002","2001","2000","1999","1998","1997","1996","1995","1994"])

    # Keep the freshly crawled articles for the rest of the notebook. Without
    # this assignment Collection would be empty on a cache miss and the
    # Collection[0] access further down would raise IndexError.
    Collection = corpus._Collection

    # Write to the same path the cache check above reads from.
    with open(corpus_file, 'wb') as handle:
        dill.dump(Collection, handle, protocol=dill.HIGHEST_PROTOCOL)

print(str(len(Collection)) + " Artikel eingelesen.")
print(Collection[0].titles)

# Articles without text cannot be vectorized; filter once so that texts and
# labels are guaranteed to stay aligned.
articles_with_text = [news for news in Collection if news.text is not None]
X_train, X_test, Y_train, Y_test = train_test_split(
    [news.text for news in articles_with_text],
    [news.tags[0] for news in articles_with_text],  # first tag is used as the single label
    test_size=0.11,
    random_state=42,
)
print(len(X_train), " Newspaper Articles in the Training Set")
print(len(X_test), " Newspaper Articles in the Test Set")

9006 Artikel eingelesen.
Categorized, Compared and Displayed: Social Ills as Museum Specimens
        
8010  Newspaper Articles in the Training Set
991  Newspaper Articles in the Test Set


# Crawler Corpus

In [19]:
# Create the project's CrawlerImporter and run its importAllFromDB() step.
# Later cells read the imported items from crawler._Collection.
# NOTE(review): presumably this loads the crawled articles from a database;
# confirm against Import.CrawlerImporter.
crawler = CrawlerImporter()
crawler.importAllFromDB()

In [39]:
# Split the crawled items into parallel lists: article text and the first
# tag of each item (used as its label).
crawled_items = crawler._Collection
crawler_text = [entry.text for entry in crawled_items]
crawler_label = [entry.tags[0] for entry in crawled_items]

In [49]:
# Translate the crawler's lower-case section names into the capitalised label
# names used for evaluation. Labels without an entry are passed through
# unchanged.
crawler_label_map = {
    'business': 'Business',
    'lifestyle': 'Style',
    'opinion': 'Opinion',
    'science': 'Science',
    'sport': 'Sports',
    'tech': 'Technology',
    'economy': 'Business',
    'politics': 'Politics',
}
crawler_LabelNew = [crawler_label_map.get(label, label) for label in crawler_label]

In [50]:
# Preview the first 50 normalised labels as a sanity check of the mapping.
crawler_LabelNew[:50]

['Style',
 'Style',
 'Style',
 'Style',
 'Style',
 'Opinion',
 'Opinion',
 'Opinion',
 'Opinion',
 'Opinion',
 'Politics',
 'Politics',
 'Politics',
 'Politics',
 'Politics',
 'Science',
 'Science',
 'Science',
 'Science',
 'Science',
 'Sports',
 'Sports',
 'Sports',
 'Sports',
 'Sports',
 'Technology',
 'Technology',
 'Technology',
 'Technology',
 'Technology',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Business',
 'Style',
 'Style',
 'Style',
 'Style',
 'Opinion',
 'Opinion',
 'Opinion',
 'Opinion',
 'Opinion',
 'Politics']

In [52]:
# Summarise the crawler data set. Note that the first line prints the set of
# distinct labels itself, not its size.
distinct_crawler_labels = set(crawler_LabelNew)
print("Anzahl verschiedene Labels", distinct_crawler_labels)
print("Anzahl Texts", len(crawler_text))

Anzahl verschiedene Labels {'Style', 'Opinion', 'Technology', 'Science', 'Sports', 'Politics', 'Business'}
Anzahl Texts 214


# All Imports

In [3]:
# general
import gensim

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif

# nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk import PorterStemmer, LancasterStemmer


# Numpy
import numpy as np
import re
from collections import Counter

# Shared text preprocessor for the CountVectorizer steps below, configured
# with NLTK's English stop-word list and the English Snowball stemmer.
preprocessor = Preprocessor(
    stopwords=stopwords.words('english'),
    stemmer=SnowballStemmer("english"),
)

# Word2Vec

In [9]:
# Despite the "Training" wording of the messages, nothing is trained here:
# the vectors are read from a text file in word2vec format.
print("Training Word2Vec...")
glove_path = os.getcwd() + "/../Word2Vec/glove_model2.txt"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(glove_path, binary=False)
# NOTE(review): per the gensim docs init_sims(replace=True) normalises the
# vectors in place; it is deprecated in gensim 4 -- confirm the installed version.
word2vec.init_sims(replace=True)
print("Finished Training Word2Vec")

In [7]:
class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its word embeddings.

    A document is lower-cased, split on whitespace and stripped of stop
    words; the embeddings of the remaining tokens that are present in
    ``word2vec`` are averaged. A document with no known token maps to a
    zero vector of length ``dim``.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``token in word2vec`` and ``word2vec[token]``,
        e.g. a gensim ``KeyedVectors`` instance or a plain dict.
    dim : int, optional
        Length of the zero vector returned for documents without any known
        token. When omitted it is taken from ``word2vec.vector_size`` if
        that attribute exists, else 300 (the previously hard-coded value).
    """

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        if dim is None:
            dim = getattr(word2vec, 'vector_size', 300)
        self.dim = dim
        self.stopwordsList = stopwords.words('english')

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the object works as a pipeline step."""
        return self

    def _remove_stopwords(self, document):
        """Return the tokens of ``document`` that are not stop words."""
        # Build a set once per document so each lookup is O(1) rather than a
        # scan of the whole stop-word list.
        blocked = set(self.stopwordsList)
        return [word for word in document if word not in blocked]

    def _mean(self, X):
        """Return the mean embedding of the text ``X`` (zeros if none known)."""
        # split() without arguments splits on any whitespace run, so words
        # separated by newlines or tabs are tokenised correctly as well.
        tokens = self._remove_stopwords(X.lower().split())
        vectors = [self.word2vec[w] for w in tokens if w in self.word2vec]
        return np.mean(vectors or [np.zeros(self.dim)], axis=0)

    def transform(self, documents):
        """Return an array with one mean-embedding row per document."""
        return np.array([self._mean(d) for d in documents])

In [8]:
# Embedding-only model: mean word vectors -> standardisation -> RBF SVM.
# Requires `word2vec` to be loaded first; the recorded NameError in this
# export comes from running this cell before the loading cell.
w2v_fu = FeatureUnion(transformer_list=[
    ("word2vec vectorizer", MeanEmbeddingVectorizer(word2vec)),
])

w2v_svc_ppl = Pipeline(steps=[
    ('features', w2v_fu),
    ('scaler', StandardScaler(with_mean=True)),
    # probability=True is needed because the ensemble uses soft voting.
    ('classifier', SVC(kernel='rbf', probability=True, verbose=100, C=1000)),
])

NameError: name 'word2vec' is not defined

# General Feature Union

In [None]:
# Default feature set used by the Naive Bayes, random forest and logistic
# regression pipelines: TF-IDF weighted 1- to 3-gram counts (top 2000 terms,
# ignoring terms in more than 75% of documents) plus the project's
# LinguisticVectorizer features.
fu_ngram_branch = Pipeline(steps=[
    ('counts', CountVectorizer(max_df=0.75, ngram_range=(1, 3), max_features=2000,
                               preprocessor=preprocessor.get_preprocessed_text)),
    ('tf_idf', TfidfTransformer()),
])
fu_linguistic_branch = Pipeline(steps=[
    ('linguistic', LinguisticVectorizer()),
])

fu = FeatureUnion(transformer_list=[
    ('ngram_tf_idf', fu_ngram_branch),
    ('lv', fu_linguistic_branch),
])

# SVM

In [None]:
# Feature set for the SVM: TF-IDF weighted 1- to 3-grams (top 2000 terms),
# the project's LinguisticVectorizer features and mean word embeddings.
svm_fu = FeatureUnion([
    ('ngram_tf_idf', Pipeline([
      ('counts', CountVectorizer(max_df=0.75, ngram_range=(1,3), max_features=2000, preprocessor=preprocessor.get_preprocessed_text)),
      ('tf_idf', TfidfTransformer())
    ])),
    ('lv', Pipeline([
      ('linguistic', LinguisticVectorizer())
    ])),
    ('w2v', Pipeline([
        ("mean_embedding", MeanEmbeddingVectorizer(word2vec))
    ]))
])

# Densify -> standardise -> keep the 1800 best features by ANOVA F-score ->
# PCA to 1500 components -> RBF SVM with probability estimates (required
# for soft voting).
svc_high_ppl = Pipeline([
  ('features', svm_fu),
  # toarray() returns a plain ndarray. The previous todense() returned a
  # numpy.matrix, which recent scikit-learn releases no longer accept.
  ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
  ('scaler', StandardScaler(with_mean=True)),
  ('k_best', SelectKBest(score_func=f_classif, k=1800)),
  ('dim_red', PCA(n_components=1500)),
  ('classifier', SVC(kernel='rbf', probability=True, verbose=100, C=1000))
])

# MLP

In [None]:
# Feature set for the MLP: like the default union but with a larger n-gram
# vocabulary (top 4000 terms) and no dimensionality reduction afterwards.
mlp_fu = FeatureUnion([
    ('ngram_tf_idf', Pipeline([
      ('counts', CountVectorizer(max_df=0.75, ngram_range=(1,3), max_features=4000, preprocessor=preprocessor.get_preprocessed_text)),
      ('tf_idf', TfidfTransformer())
    ])),
    ('lv', Pipeline([
      ('linguistic', LinguisticVectorizer())
    ]))
])

# Two hidden layers of 300 ReLU units each, trained with Adam.
mlp_clf = MLPClassifier(hidden_layer_sizes=(300,300),solver='adam',activation='relu',learning_rate_init=0.01,max_iter=750,verbose=True)

mlp1_pipeline = Pipeline([
  ('features', mlp_fu),
  # toarray() returns a plain ndarray. The previous todense() returned a
  # numpy.matrix, which recent scikit-learn releases no longer accept.
  ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
  ('scaler', StandardScaler(with_mean=True)),
  ('classifier', mlp_clf)
])

# Naive Bayes

In [None]:
# Multinomial Naive Bayes on the default feature union `fu`.
# MultinomialNB needs non-negative input, so the PCA output (which can be
# negative) is rescaled with a second MinMaxScaler before classification.
mnb_ppl = Pipeline([
  ('features', fu),
  # toarray() returns a plain ndarray. The previous todense() returned a
  # numpy.matrix, which recent scikit-learn releases no longer accept.
  ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
  ('scaler', MinMaxScaler()),
  ('k_best', SelectKBest(score_func=f_classif, k=1500)),
  ('dim_red', PCA(n_components=1200)),
  ('scaler2', MinMaxScaler()),
  ('classifier', MultinomialNB())
])

# Random Forest

In [None]:
# Random forest with 200 trees on the default feature union `fu`:
# densify -> standardise -> 1500 best features by ANOVA F-score -> PCA(1200).
random_200_ppl = Pipeline([
  ('features', fu),
  # toarray() returns a plain ndarray. The previous todense() returned a
  # numpy.matrix, which recent scikit-learn releases no longer accept.
  ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
  ('scaler', StandardScaler(with_mean=True)),
  ('k_best', SelectKBest(score_func=f_classif, k=1500)),
  ('dim_red', PCA(n_components=1200)),
  ('classifier', RandomForestClassifier(verbose=100, n_estimators=200))
])

# Logistic Regression

In [None]:
# Logistic regression on the default feature union `fu`:
# densify -> standardise -> 1500 best features by ANOVA F-score -> PCA(1200).
logistic_pipeline = Pipeline([
  ('features', fu),
  # toarray() returns a plain ndarray. The previous todense() returned a
  # numpy.matrix, which recent scikit-learn releases no longer accept.
  ('dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
  ('scaler', StandardScaler(with_mean=True)),
  ('k_best', SelectKBest(score_func=f_classif, k=1500)),
  ('dim_red', PCA(n_components=1200)),
  ('classifier', LogisticRegression(verbose=100))
])

# Voting Classifier

# Fitting 

In [4]:
# Obtain the soft-voting ensemble: reuse the dill cache when it exists,
# otherwise build the ensemble from the six pipelines and fit it on the
# training split. The message is printed in both cases.
print("Starting Fitting ...")
eclf = None
if os.path.isfile(eclf_file):
    # NOTE(review): dill.load can execute arbitrary code; only load cache
    # files that were produced by this notebook.
    with open(eclf_file, 'rb') as handle:
        eclf = dill.load(handle)
else:
    ensemble_members = [
        ('word2vec', w2v_svc_ppl),
        ('svm', svc_high_ppl),
        ('mlp', mlp1_pipeline),
        ('nb', mnb_ppl),
        ('rf', random_200_ppl),
        ('lr', logistic_pipeline),
    ]
    # One weight per member, in the same order as above.
    ensemble_weights = [4, 2, 2, 1, 1, 1]
    eclf = VotingClassifier(estimators=ensemble_members, voting='soft', weights=ensemble_weights)
    eclf.fit(X_train, Y_train)

Starting Fitting ...


In [26]:
# Spot check on one crawled article: class probabilities, the article's own
# tags, and the class order that the probability columns refer to.
probe_article = crawler._Collection[1]
print(eclf.predict_proba([probe_article.text]))
print("echte Label", probe_article.tags)
print(eclf.classes_)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  1

In [32]:
# Rebind crawler_label to a NumPy array. The evaluation cells visible in this
# notebook compare predictions against crawler_LabelNew, not this array.
crawler_label = np.array(crawler_label)

# Evaluation

In [51]:
# Evaluate the ensemble on the crawled articles: accuracy, per-class report
# and Cohen's kappa are printed; the confusion matrix is the cell output.
eclf_predicted = eclf.predict(crawler_text)
for score_fn in (metrics.accuracy_score,
                 metrics.classification_report,
                 metrics.cohen_kappa_score):
    print(score_fn(crawler_LabelNew, eclf_predicted))
# Without `labels`, the matrix covers every class that appears in either the
# true or the predicted labels.
metrics.confusion_matrix(crawler_LabelNew, eclf_predicted)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  1


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.



array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 43,  3,  1,  0,  1,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0,  1, 16,  1,  0,  1,  2,  1],
       [ 0,  2,  2,  8, 16,  0,  0,  1,  1],
       [ 0,  1,  9,  2,  0,  9,  0,  0,  4],
       [ 1,  0,  0,  0,  0,  0, 28,  1,  0],
       [ 4,  0,  6,  3,  0,  1,  0,  6,  4],
       [ 1,  3,  0,  2,  0,  0,  0,  0, 24]])

In [54]:
# Confusion matrix restricted to the labels that actually occur in the
# crawler data, in alphabetical order.
crawler_classes = sorted(set(crawler_LabelNew))
metrics.confusion_matrix(crawler_LabelNew, eclf_predicted, labels=crawler_classes)

array([[43,  1,  0,  1,  0,  0,  2],
       [ 0, 16,  1,  0,  1,  2,  1],
       [ 2,  8, 16,  0,  0,  1,  1],
       [ 1,  2,  0,  9,  0,  0,  4],
       [ 0,  0,  0,  0, 28,  1,  0],
       [ 0,  3,  0,  1,  0,  6,  4],
       [ 3,  2,  0,  0,  0,  0, 24]])

# Export with Dill

In [None]:
# Write the ensemble to the dill cache, but never overwrite an existing file.
eclf_cache_missing = not os.path.isfile(eclf_file)
if eclf_cache_missing:
    with open(eclf_file, 'wb') as sink:
        dill.dump(eclf, sink, protocol=dill.HIGHEST_PROTOCOL)