## traditional machine learning classifier - sklearn

In [1]:
# Read the pre-processed priority dataset, then drop every row whose
# 'priority' value is 'Unknown'. The row count is printed before and after
# the filter so the number of dropped rows is visible.
import pandas as pd

dataset = pd.read_csv("pre-processed data/dataset_aug_priority.csv", sep=',')
print(dataset.shape[0])

dataset = dataset.loc[dataset['priority'].ne('Unknown')]
print(dataset.shape[0])

42762
42761


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Cast the 'content' column to a unicode array once; both vectorizers below
# are fitted on exactly the same documents.
priority_docs = dataset['content'].values.astype('U')

# Create the vocabulary from raw term counts: unigrams only, capped at 3000
# features by max_features.
count_vec = CountVectorizer(ngram_range=(1, 1), max_features=3000)
tf = count_vec.fit_transform(priority_docs).toarray()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Prefer the new
# method and fall back to the old one so the cell runs on either side of the
# change. The result is normalised to a plain list in both cases.
if hasattr(count_vec, 'get_feature_names_out'):
    vocab = list(count_vec.get_feature_names_out())
else:
    vocab = list(count_vec.get_feature_names())

# tf-idf, restricted to the vocabulary selected above.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), min_df=1, vocabulary=vocab)
tfidf = tfidf_vec.fit_transform(priority_docs).toarray()

In [3]:
# Wrap the dense TF-IDF matrix in a DataFrame with one column per vocabulary
# term, then preview the first five rows.
dataset_tfidf = pd.DataFrame(data=tfidf, columns=vocab)

dataset_tfidf.head(5)

Unnamed: 0,ab,abasand,abc,ability,able,above,abroad,absolute,absolutely,abundance,...,your,yourself,youtube,yr,yukon,zambales,zero,zimbabwe,zone,zulu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
import numpy as np

# 80/20 train/test split.
# The features and the labels are sampled separately but with the same
# fraction and seed; this relies on both objects having the same number of
# rows so that the same row positions are drawn from each.
sample_kwargs = {'frac': 0.8, 'random_state': 2020}
tr_X = dataset_tfidf.sample(**sample_kwargs)
tr_y = dataset['priority'].sample(**sample_kwargs)

# Boolean array marking the feature rows that were NOT drawn for training.
# It is applied to `dataset` as a plain boolean mask (by position), because
# `dataset` kept its original index after filtering while `dataset_tfidf`
# was built from a bare array.
is_test_row = ~dataset_tfidf.index.isin(tr_X.index)
te_X = dataset_tfidf[is_test_row]
te_y = dataset.loc[is_test_row, 'priority']

# Convert everything to numpy arrays for scikit-learn.
tr_X, tr_y = np.array(tr_X), np.array(tr_y)
te_X, te_y = np.array(te_X), np.array(te_y)

# Sanity check: feature and label counts must match within each split.
for split in (tr_X, tr_y, te_X, te_y):
    print(len(split))

34209
34209
8552
8552


### multi-class classification for 'priority':

In [5]:
# Replace each priority name with its integer class id, in place, in both the
# training and the test label arrays.
priority_codes = {'Critical': 0, 'Low': 1, 'Medium': 2, 'High': 3}
for priority_name, priority_code in priority_codes.items():
    tr_y[tr_y == priority_name] = priority_code
    te_y[te_y == priority_name] = priority_code

In [6]:
# Cast both label arrays to an integer dtype.
# The original note on this cell records the error
# "Unknown label type: 'unknown'" -- presumably what the classifier raised
# when given the un-cast labels; TODO confirm.
tr_y, te_y = (encoded.astype('int') for encoded in (tr_y, te_y))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logistic regression on the TF-IDF features; fit() returns the estimator
# itself, so configuring and fitting are written as two steps here.
clf = LogisticRegression(random_state=0, solver='saga', multi_class='auto')
clf.fit(tr_X, tr_y)
preds_te = clf.predict(te_X)

# Report accuracy, followed by the macro-averaged precision / recall / F1 on
# the held-out split.
print('Accuracy:', accuracy_score(te_y, preds_te))
macro_metrics = (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-Score:', f1_score),
)
for heading, metric in macro_metrics:
    print(heading, metric(te_y, preds_te, average='macro'))



Accuracy: 0.7904583723105706
Precision: 0.8003093310868893
Recall: 0.8113226037861101
F1-Score: 0.8048295539715175


### multi-label classification for 'Categories': one-vs-rest

In [8]:
# Read the multi-label dataset and keep only the text column and the
# category-list column, then preview the first four rows.
dataset_cat = pd.read_csv("pre-processed data/new_label_dataset.csv").loc[:, ['content', 'categories']]

dataset_cat.head(4)

Unnamed: 0,content,categories
0,philippine flood worsen death toll hit wake ge...,"['ThirdPartyObservation', 'Factoid', 'News']"
1,philippine flood fatality hit,"['ThirdPartyObservation', 'Factoid', 'News']"
2,luzon dam release water flood warn up manila p...,"['ThirdPartyObservation', 'Factoid', 'News']"
3,pagasa advisory yellow warning metro manila oc...,"['ThirdPartyObservation', 'Factoid', 'News']"


In [9]:
# Turn the 'categories' column into a binary indicator matrix.
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# Each cell of 'categories' holds the text of a Python list literal
# (e.g. "['Factoid', 'News']"); parse every one into a real list.
cat_list = [ast.literal_eval(raw_categories) for raw_categories in dataset_cat['categories']]

# One output column per distinct category; `categories` keeps the column order.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(cat_list)
categories = mlb.classes_

print(labels.shape)

(37293, 25)


In [10]:
# Cast the 'content' column to a unicode array once; both vectorizers below
# are fitted on exactly the same documents.
category_docs = dataset_cat['content'].values.astype('U')

# Create the vocabulary from raw term counts: unigrams only, capped at 3000
# features by max_features.
count_vec = CountVectorizer(ngram_range=(1, 1), max_features=3000)
tf = count_vec.fit_transform(category_docs).toarray()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Prefer the new
# method and fall back to the old one so the cell runs on either side of the
# change. The result is normalised to a plain list in both cases.
if hasattr(count_vec, 'get_feature_names_out'):
    vocab = list(count_vec.get_feature_names_out())
else:
    vocab = list(count_vec.get_feature_names())

# tf-idf, restricted to the vocabulary selected above.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), min_df=1, vocabulary=vocab)
tfidf = tfidf_vec.fit_transform(category_docs).toarray()

tfidf.shape

(37293, 3000)

In [11]:
# Random 70/30 split of the TF-IDF features and the multi-label indicator
# matrix, with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

tr_X, te_X, tr_y, te_y = train_test_split(
    tfidf,
    labels,
    test_size=0.3,
    random_state=42,
)

# Show the shape of every split.
for caption, split in (
    ("tr_X size: ", tr_X),
    ("tr_y size: ", tr_y),
    ("te_X size: ", te_X),
    ("te_y size: ", te_y),
):
    print(caption, split.shape)

tr_X size:  (26105, 3000)
tr_y size:  (26105, 25)
te_X size:  (11188, 3000)
te_y size:  (11188, 25)


In [12]:
from sklearn.multiclass import OneVsRestClassifier

# The 'sag' solver shuffles the data, so without a fixed random_state the
# per-category accuracies change from run to run. Seed it (same seed as the
# priority classifier above) so the printed numbers are reproducible.
one_vs_rest = OneVsRestClassifier(
    LogisticRegression(solver='sag', random_state=0),
    n_jobs=1,
)

# Train and score one binary classifier per category: column i of the label
# matrix is the 0/1 target for categories[i].
for i, category in enumerate(categories):

    # Re-fitting replaces whatever was learned for the previous category.
    one_vs_rest.fit(tr_X, tr_y[:, i])

    preds_te = one_vs_rest.predict(te_X)

    print('Category:', category, ' Accuracy:', accuracy_score(te_y[:, i], preds_te))

Category: Advice  Accuracy: 0.9478012155881301
Category: CleanUp  Accuracy: 0.9948158741508759
Category: ContextualInformation  Accuracy: 0.9617447264926707
Category: Discussion  Accuracy: 0.9254558455488023
Category: Donations  Accuracy: 0.9803360743653915
Category: EmergingThreats  Accuracy: 0.9528065784769396
Category: Factoid  Accuracy: 0.8723632463353593
Category: FirstPartyObservation  Accuracy: 0.8800500536288881
Category: GoodsServices  Accuracy: 0.9956203074722917
Category: Hashtags  Accuracy: 0.7793171254915982
Category: InformationWanted  Accuracy: 0.9925813371469432
Category: Irrelevant  Accuracy: 0.8096174472649267
Category: Location  Accuracy: 0.8934572756524848
Category: MovePeople  Accuracy: 0.9915981408652127
Category: MultimediaShare  Accuracy: 0.7935287808366106
Category: NewSubEvent  Accuracy: 0.9786378262424026
Category: News  Accuracy: 0.8069360028602074
Category: Official  Accuracy: 0.961834107972828
Category: OriginalEvent  Accuracy: 0.9167858419735431
Category: