In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
import os, sys
from sklearn.model_selection import train_test_split
import json
import pandas as pd
import os
import pickle

# Label the data

In [2]:
def read_data(file_name):
    data = []
    f = open(file_name,'r')
    for line in f:
        if '&apos;' in line: continue
        data.append(line.strip())
    return data
data_news = read_data('news_data.txt')
data_fed = read_data('fed_data.txt') 
labels = []
for i in range(len(data_news)):
    labels.append('geopolitical')
for i in range(len(data_fed)):
    labels.append('market')

df = pd.DataFrame(list(zip(data_news + data_fed, labels)), 
               columns =['title', 'classes'])

In [11]:
df.head()

Unnamed: 0,title,classes
0,daybreak,geopolitical
1,australia,geopolitical
2,full,geopolitical
3,show,geopolitical
4,daybreak,geopolitical


# Training the model with Naive Bayes

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df.classes)
X = df.title

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25,random_state = 1)



In [5]:
## Baseline model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import f1_score
import numpy as np

# LabelPowerset allows for multi-label classification
# Build a pipeline for multinomial naive bayes classification

text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                     #('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB()))])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Calculate accuracy
print('mean accuracy: {}'.format(np.mean(predicted == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted, average='weighted')  ))

mean accuracy: 0.996422893481717
f1 score: 0.9973926204321707


In [6]:
## with tfidf transformer:
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-2))),])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Calculate accuracy
print('mean accuracy: {}'.format(np.mean(predicted == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted, average='weighted')  ))

mean accuracy: 0.9844992050874404
f1 score: 0.9910872461217234


# Try more models

In [7]:
# Test if SVM performs better
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-svm', LabelPowerset(
                             SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=10, random_state=42)))])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

#Calculate accuracy
print('mean accuracy: {}'.format(np.mean(predicted_svm == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_svm, average='weighted')  ))

mean accuracy: 0.9952305246422893
f1 score: 0.9963897981451173




In [8]:
## random forest
from sklearn.ensemble import RandomForestClassifier
text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-rf', LabelPowerset(
                             RandomForestClassifier(n_estimators = 100)))])
_ = text_clf_rf.fit(X_train, y_train)
predicted_rf = text_clf_rf.predict(X_test)

#Calculate accuracy
print('mean accuracy: {}'.format(np.mean(predicted_rf == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_rf, average='weighted')  ))

mean accuracy: 0.996422893481717
f1 score: 0.9973926204321707


In [9]:
# GBDT
from sklearn.ensemble import GradientBoostingClassifier

text_clf_gbdt = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-gbdt', LabelPowerset(
                             GradientBoostingClassifier(n_estimators=200)))])
_ = text_clf_gbdt.fit(X_train, y_train)

predicted_gbdt = text_clf_gbdt.predict(X_test)

#Calculate accuracy
print('mean accuracy: {}'.format(np.mean(predicted_gbdt == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_gbdt, average='weighted')  ))

mean accuracy: 0.9988076311605724
f1 score: 0.9991820966353138


In [10]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1,1))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-lr', LabelPowerset(
                             LogisticRegression(C=10)))])
_ = text_clf_lr.fit(X_train, y_train)
predicted_lr = text_clf_lr.predict(X_test)

#Calculate accuracy
np.mean(predicted_lr == y_test)

print('mean accuracy: {}'.format(np.mean(predicted_lr == y_test)))
print('f1 score: {}'.format(f1_score(y_test, predicted_lr, average='weighted')  ))

mean accuracy: 0.996422893481717
f1 score: 0.9973926204321707


