## traditional machine learning classifier - sklearn

In [1]:
# load processed dataset.
import pandas as pd
dataset = pd.read_csv("pre-processed data/dataset_aug_priority.csv", sep=',')
testset = pd.read_csv("pre-processed data/new_label_testset.csv", sep=',')

dataset = dataset[dataset['priority'] != 'Unknown']
testset = testset[testset['priority'] != 'Unknown']
print("train set: ", len(dataset))
print("test set: ", len(testset))

train set:  52508
test set:  8179


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# create vocab according to tf.
count_vec = CountVectorizer(ngram_range=(1, 1), max_features=3000)
tf = count_vec.fit_transform(dataset['content'].values.astype('U')).toarray()
vocab = count_vec.get_feature_names()

# train set.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), min_df=1, vocabulary=vocab)
tfidf= tfidf_vec.fit_transform(dataset['content'].values.astype('U')).toarray()

# test set.
tfidf_vec_test = TfidfVectorizer(ngram_range=(1, 1), min_df=1, vocabulary=vocab)
tfidf_test = tfidf_vec_test.fit_transform(testset['content'].values.astype('U')).toarray()

In [3]:
dataset_tfidf = pd.DataFrame(tfidf, columns=vocab)
testset_tfidf = pd.DataFrame(tfidf_test, columns=vocab)

testset_tfidf[:5]

Unnamed: 0,ab,abasand,abbott,abc,ability,able,above,abroad,absolute,absolutely,...,your,yourself,youtube,yr,zambales,zealand,zero,zimbabwe,zone,zulu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.309493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
import numpy as np

tr_X = dataset_tfidf
tr_y = dataset['priority']
te_X = testset_tfidf
te_y = testset['priority']

tr_X, tr_y = np.array(tr_X), np.array(tr_y)
te_X, te_y = np.array(te_X), np.array(te_y)

print(len(tr_X))
print(len(tr_y))
print(len(te_X))
print(len(te_y))

52508
52508
8179
8179


### multi-class classification for 'priority':

In [5]:
tr_y[tr_y == 'Critical'], te_y[te_y == 'Critical'] = 0, 0
tr_y[tr_y == 'Low'], te_y[te_y == 'Low'] = 1, 1
tr_y[tr_y == 'Medium'], te_y[te_y == 'Medium'] = 2, 2
tr_y[tr_y == 'High'], te_y[te_y == 'High'] = 3, 3

In [6]:
# error: Unknown label type: 'unknown'
tr_y=tr_y.astype('int')
te_y=te_y.astype('int')

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# logistic regression.
clf = LogisticRegression(random_state=0, solver='saga', multi_class='auto', max_iter=300).fit(tr_X, tr_y)
preds_te = clf.predict(te_X)

print('Accuracy:', accuracy_score(te_y,preds_te))
print('Precision:', precision_score(te_y,preds_te,average='macro'))
print('Recall:', recall_score(te_y,preds_te,average='macro'))
print('F1-Score:', f1_score(te_y,preds_te,average='macro'))


Accuracy: 0.5104536006846803
Precision: 0.3320920387771979
Recall: 0.6100083806810869
F1-Score: 0.3229423704779127


### TREC-IS metric: RMSE

In [8]:
prob = clf.predict_proba(te_X)
score = 0

for i in range(len(te_y)):
    if te_y[i] == 0: weight = 1
    elif te_y[i] == 1: weight = 0.25
    elif te_y[i] == 2: weight = 0.5
    elif te_y[i] == 3: weight = 0.75
    else: weight = 0
    
    score += (weight - prob[i, te_y[i]] * weight)**2
    
score /= len(te_y)

print("RMSE all: ", score)

RMSE all:  0.04017644775199684


### multi-label classification for 'Categories': one-vs-rest

In [9]:
# load processed dataset.
dataset_cat = pd.read_csv("pre-processed data/dataset_aug_cat.csv")[['content', 'categories']]
testset_cat = pd.read_csv("pre-processed data/new_label_testset.csv")[['content', 'categories']]

testset_cat[:4]

Unnamed: 0,content,categories
0,view shell gregoire,"['FirstPartyObservation', 'Location', 'Emergin..."
1,where one fire please stay safe,"['ThirdPartyObservation', 'MultimediaShare', '..."
2,ab emerge alrt live gregoire prepare evacuate ...,"['MovePeople', 'Location', 'Hashtags']"
3,view timberlea,"['MultimediaShare', 'Hashtags']"


In [10]:
# convert categories into matrix.
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# convert str to list.
cat_tr_list = []
cat_te_list = []

for i in dataset_cat['categories']:
    cat_tr_list.append(ast.literal_eval(i))
    
for i in testset_cat['categories']:
    cat_te_list.append(ast.literal_eval(i))

mlb_tr = MultiLabelBinarizer()
labels_tr = mlb_tr.fit_transform(cat_tr_list)
mlb_te = MultiLabelBinarizer()
labels_te = mlb_te.fit_transform(cat_te_list)

categories_tr = mlb_tr.classes_
categories_te = mlb_te.classes_

print(labels_tr.shape)
print(labels_te.shape)

(45606, 25)
(8179, 25)


In [11]:
# create vocab according to tf.
count_vec = CountVectorizer(ngram_range=(1, 1), max_features=3000)
tf = count_vec.fit_transform(dataset_cat['content'].values.astype('U')).toarray()
vocab = count_vec.get_feature_names()

# td.idf
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), min_df=1, vocabulary=vocab)

# train set.
tfidf_tr= tfidf_vec.fit_transform(dataset_cat['content'].values.astype('U')).toarray()

# test set.
tfidf_te= tfidf_vec.fit_transform(testset_cat['content'].values.astype('U')).toarray()

print(tfidf_tr.shape)
print(tfidf_te.shape)

(45606, 3000)
(8179, 3000)


In [12]:
# random split into train and test dataset.
from sklearn.model_selection import train_test_split

tr_X, te_X, tr_y, te_y = tfidf_tr, tfidf_te, labels_tr, labels_te

print("tr_X size: ", tr_X.shape)
print("tr_y size: ", tr_y.shape)
print("te_X size: ", te_X.shape)
print("te_y size: ", te_y.shape)

tr_X size:  (45606, 3000)
tr_y size:  (45606, 25)
te_X size:  (8179, 3000)
te_y size:  (8179, 25)


In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

one_vs_rest = OneVsRestClassifier(LogisticRegression(class_weight="balanced", random_state=0, solver='lbfgs', max_iter=200), n_jobs=1)
preds_te_arr = np.zeros(te_y.shape) 
accuracy = 0

for i, category in enumerate(categories_tr):
    
    one_vs_rest.fit(tr_X, tr_y[:, i])
    preds_te = one_vs_rest.predict(te_X)
    preds_te_arr[:, i] = preds_te
    res = accuracy_score(te_y[:, i], preds_te)
    print('Category:', category, ' F1-score:', f1_score(te_y[:, i], preds_te), ' Accuracy: ', res)
    accuracy+=res

Category: Advice  F1-score: 0.22666087711680422  Accuracy:  0.7822472184863676
Category: CleanUp  F1-score: 0.22448979591836737  Accuracy:  0.9814158210050128
Category: ContextualInformation  F1-score: 0.050031269543464665  Accuracy:  0.8142804743856217
Category: Discussion  F1-score: 0.13959085439229843  Accuracy:  0.6503240004890574
Category: Donations  F1-score: 0.30809399477806787  Accuracy:  0.9675999510942658
Category: EmergingThreats  F1-score: 0.23969213853765806  Accuracy:  0.8309084240127155
Category: Factoid  F1-score: 0.32144198272624863  Accuracy:  0.7790683457635408
Category: FirstPartyObservation  F1-score: 0.14710568242166755  Accuracy:  0.6072869543954028
Category: GoodsServices  F1-score: 0.07246376811594202  Accuracy:  0.9843501650568529
Category: Hashtags  F1-score: 0.6376294976049904  Accuracy:  0.602274116640176
Category: InformationWanted  F1-score: 0.09302325581395347  Accuracy:  0.9475486000733586
Category: Irrelevant  F1-score: 0.4218671152228763  Accuracy:  0

In [14]:
print('Accuracy all: ', accuracy/len(categories_tr))
print('F1-Score all: ', f1_score(te_y,preds_te_arr,average='macro'))

Accuracy all:  0.8191270326445776
F1-Score all:  0.24268340774030736
