In [28]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re

In [2]:
dataset = pd.read_csv('0-10000-labeled.csv')
dataset_docs = dataset['article'].tolist()
dataset_y = dataset['label'].astype('int').tolist()

In [3]:
print('dataset contains', np.sum(np.array(dataset_y) == 0), 'documents labeled 0')
print('dataset contains', np.sum(np.array(dataset_y) == 1), 'documents labeled 1')

dataset contains 9511 documents labeled 0
dataset contains 489 documents labeled 1


De gelabelde dataset wordt opgesplitst in train en test set:

In [4]:
docs_train, docs_test, y_train, y_test = train_test_split(dataset_docs, dataset_y, test_size=.25)

In [5]:
import huggingface_stats
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/erik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Een preprocessor wordt gebruikt om de data te preprocessen. Dit is een class met meerdere methods, bijvoorbeeld om alle woorden in een document te tellen.

In [7]:
preprocessor = huggingface_stats.Preprocessor(stemmer=EnglishStemmer(),
                                              stopwords=stopwords.words('english'),
                                              re_pattern=re.compile("[a-zA-Z]"))

Een 'scorer' berekent een score voor elk document:

In [8]:
scorer = huggingface_stats.tf_idf_document_scorer(preprocessor=preprocessor,
                                                  docs=docs_train,
                                                  labels=y_train,
                                                  show_progress=True)

100%|██████████| 7500/7500 [01:32<00:00, 80.88it/s] 
100%|██████████| 366/366 [00:03<00:00, 113.84it/s]
100%|██████████| 7134/7134 [01:27<00:00, 81.78it/s]


Dit zijn de woorden die het meest en minst wegen voor class 'wel ongeluk':

In [9]:
pd.DataFrame(list(scorer._weights.items())).sort_values(1, ascending=False)

Unnamed: 0,0,1
3009,crash,3.889671
1024,plane,3.541505
945,pilot,2.687521
2644,fire,2.117741
881,flight,2.057691
...,...,...
365,presid,-0.613435
702,court,-0.617461
331,attack,-0.618523
516,iraq,-0.695477


In [13]:
X_train = scorer.batch_score(docs_train, show_progress=True)

100%|██████████| 7500/7500 [01:07<00:00, 111.36it/s]


Score voor positief gelabelde documenten:

In [16]:
np.array(X_train)[np.array(y_train) == 1][:50]

array([ 88.53151078,  64.01887663, 158.98335723,  31.40018523,
        56.22986101,  68.10162414,  71.69435868, 232.20355457,
       190.41200838,  55.03464921,  56.99706555, 213.67608598,
        98.93747684,  45.91690413, 136.95955648, 175.81075971,
        40.35884847, 113.36562077,  56.0401985 , 150.08097168,
        29.70197175,  44.99525309,  76.33336374,  46.32199073,
        46.75371812,  94.35761881, 122.80042263, 167.53192898,
        71.65170911,  56.08722984,  49.28664893,  59.24526329,
       102.92239397,  78.10288666, 100.53407641,  51.17149401,
        32.41561417, -25.4835082 ,  19.02013875,  63.93358852,
        68.92915642,  25.8891323 ,  78.39510083,  77.35182064,
        22.48139252,  17.99116948,  90.07266389,  68.73098812,
        50.55077181,  86.26257556])

score voor negatief gelabelde documenten:

In [17]:
np.array(X_train)[np.array(y_train) == 0][:50]

array([-3.61955682e+00, -1.35272418e+01, -1.29034537e+02,  1.93394236e-02,
       -1.26147714e+01,  6.58372889e+01, -1.68765645e+01, -9.52393161e+00,
       -1.76928424e+01,  1.33443339e+01, -1.36274126e+01, -1.87921391e+01,
       -5.96307229e+00, -3.94181172e+01,  1.14830491e+00, -9.53361189e+01,
       -1.16579525e+00, -3.18168572e+01, -1.19764768e+01, -5.89337637e+01,
        3.26307045e+01, -3.92159424e+00, -5.78311292e+00, -2.60494030e+01,
       -9.02213395e+00,  9.04742714e+01, -2.05744738e+01, -2.15260701e+00,
       -3.81894031e+01, -2.24707874e+01, -1.29371951e+01, -3.83263549e+01,
       -5.12239052e+01,  5.53099438e+00,  1.20188267e+00,  1.52246568e+01,
        1.33868660e+01, -6.51025500e+01,  5.33306370e+00, -2.66979747e+00,
       -1.48947253e+01, -3.69628515e+00, -4.23843606e+01, -2.09175457e+01,
        4.12520420e+01, -7.81213385e+00, -6.43515510e+01,  1.56252888e+01,
       -1.75912831e+01, -1.88524259e+01])

Met die score kan vervolgens logistieke regressie gedaan worden:

In [21]:
model = LogisticRegression(random_state=0).fit(np.array([X_train]).T, y_train)

In [24]:
X_test = scorer.batch_score(docs_test, show_progress=True)

100%|██████████| 2500/2500 [00:22<00:00, 112.64it/s]


In [25]:
y_predict = model.predict(np.array([X_test]).T)

In [29]:
print(classification_report(y_test, y_predict, target_names=['not accident', 'accident']))

              precision    recall  f1-score   support

not accident       0.96      0.99      0.97      2377
    accident       0.41      0.19      0.26       123

    accuracy                           0.95      2500
   macro avg       0.68      0.59      0.61      2500
weighted avg       0.93      0.95      0.94      2500

