# Baseline for BERT: TF-IDF + Naive Bayes or SVC

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE(review): `scipy.interp` is a deprecated alias of `numpy.interp` and may
# be unavailable in newer SciPy releases -- confirm against the pinned version.
from scipy import interp
from sklearn import svm, datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import label_binarize
from tqdm import tqdm

%matplotlib inline

In [35]:
# Directory holding the labeled CSV files. Defaults to the original
# user-local location; set the HIST_AWARE_DATA_DIR environment variable to
# point the notebook at another checkout without editing this cell.
DATA_DIR = (
    os.environ.get("HIST_AWARE_DATA_DIR")
    or "~/dev/hist-aware/notebooks/data/labeled"
)

## Load data

In [36]:
# Labeled article sets, one CSV per topic, all read from DATA_DIR.
# `df` (energy) is the frame used for the train/validation split below;
# `oil`, `gas` and `coal` are loaded alongside it.
_labeled_files = (
    "labeled_energy_1970_1990.csv",
    "labeled_oil_1970_1990.csv",
    "labeled_gas_1970_1990.csv",
    "labeled_coal_1970_1990.csv",
)
df, oil, gas, coal = (
    pd.read_csv(os.path.join(DATA_DIR, file_name))
    for file_name in _labeled_files
)

## Train / test

In [39]:
# Inputs are the raw values of the `text_clean` column; targets are the
# `labels` column expanded into one indicator column per class (0, 1, 2).
X = df.text_clean.values
y = label_binarize(df.labels.values, classes=[0, 1, 2])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
)

## Set GPU for training

In [40]:
import torch

# Pick the compute device once, then report what was found.
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _has_cuda else "cpu")

if _has_cuda:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


## TF-IDF and Naive Bayes

### Data preprocessing

In [41]:
import nltk
# Alias the corpus reader so the name `stopwords` refers to exactly one thing
# (the word list) instead of first the reader and then the list.
from nltk.corpus import stopwords as nltk_stopwords

# Dutch stop-word list consumed by `text_preprocessing`.
# NOTE(review): this presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand (e.g. `nltk.download("stopwords")`) -- confirm for
# fresh environments.
stopwords = nltk_stopwords.words("dutch")

In [42]:
def text_preprocessing(s, stop_words=None):
    """Normalise a text for TF-IDF vectorisation.

    Steps:
    - Lowercase the text.
    - Replace every character that is not a word character, whitespace or
      "?" with a space (this removes all punctuation except "?").
    - Isolate each "?" so it survives as its own token.
    - Drop stop words.
    - Collapse all whitespace runs to single spaces and strip the ends.

    @params    s (str): the text to clean.
    @params    stop_words (iterable of str, optional): tokens to remove.
               Defaults to the module-level `stopwords` list.
    @return    (str): the cleaned text, tokens separated by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership is O(1) per token, versus a linear scan of a list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    s = s.lower()
    # Everything except word characters, whitespace and '?' becomes a space.
    # This already covers characters such as ; : | • «, so no separate pass
    # is needed for them.
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Surround '?' with spaces so it is tokenised on its own.
    s = s.replace('?', ' ? ')
    # split() discards all whitespace (including newlines); joining with a
    # single space therefore also normalises and trims whitespace.
    return " ".join(word for word in s.split() if word not in stop_words)

In [43]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every document in both splits with `text_preprocessing`.
X_train_preprocessed = np.array(list(map(text_preprocessing, X_train)))
X_val_preprocessed = np.array(list(map(text_preprocessing, X_val)))

# TF-IDF over unigrams, bigrams and trigrams. `binary=True` makes the term
# frequency part 0/1, and `smooth_idf=False` disables the add-one smoothing
# of document frequencies.
tf_idf = TfidfVectorizer(
    smooth_idf=False,
    binary=True,
    ngram_range=(1, 3),
)
# The vocabulary and IDF weights are learned on the training split only and
# then reused unchanged for the validation split.
X_train_tfidf = tf_idf.fit_transform(X_train_preprocessed)
X_val_tfidf = tf_idf.transform(X_val_preprocessed)

CPU times: user 6.1 s, sys: 249 ms, total: 6.34 s
Wall time: 6.45 s


In [44]:
#clf = MultinomialNB().fit(X_train_tfidf, y_train)
#from sklearn.metrics import accuracy_score
#prediction = clf.predict(X_val_tfidf)
#print('Test accuracy is {}'.format(accuracy_score(y_val, prediction)))
#cross_val_score(clf, X_train_tfidf, y_train, scoring='accuracy', cv=10)

In [None]:
# One-vs-rest Multinomial Naive Bayes: one binary NB model per label column.
classifier = OneVsRestClassifier(MultinomialNB())
# The model has to be fitted before it can predict; the prediction also has
# to come from this fitted object rather than an undefined `clf`.
classifier.fit(X_train_tfidf, y_train)
prediction = classifier.predict(X_val_tfidf)
# y_val is a label-indicator matrix, so this is subset accuracy: a row only
# counts as correct when all of its indicator columns match.
print('Test accuracy is {}'.format(accuracy_score(y_val, prediction)))

## SVC and Naive Bayes classifier

In [None]:
# One-vs-rest linear SVC: one binary classifier per label column.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_train_tfidf, y_train)
# Per-class decision scores for the validation split, one column per class.
y_score = classifier.decision_function(X_val_tfidf)

# Compute ROC curve and ROC area for each class. The held-out labels are
# `y_val` (the split above never defines `y_test`), and the number of classes
# is taken from its indicator columns instead of being hard-coded.
n_classes = y_val.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area by treating every
# (sample, class) cell of the indicator matrix as one binary decision.
fpr["micro"], tpr["micro"], _ = roc_curve(y_val.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

### Plot the ROC curve for one class (class index 2)

In [None]:
# ROC curve for class index 2, drawn against the chance diagonal.
fig, ax = plt.subplots()
lw = 2
ax.plot(
    fpr[2],
    tpr[2],
    color='darkorange',
    lw=lw,
    label='ROC curve (area = %0.2f)' % roc_auc[2],
)
# Dashed diagonal: the curve a random classifier would produce.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

In [30]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def evaluate_roc(probs, y_true):
    """
    Report AUC and accuracy for a binary classifier and draw its ROC curve.

    The positive-class score is read from column 1 of `probs`; accuracy is
    computed after thresholding that score at 0.5.

    @params    probs (np.array): predicted probabilities, shape (len(y_true), 2)
    @params    y_true (np.array): true binary labels, shape (len(y_true),)
    """
    positive_scores = probs[:, 1]

    # ROC curve and the area under it
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, positive_scores)
    auc_value = auc(false_pos_rate, true_pos_rate)
    print(f'AUC: {auc_value:.4f}')

    # Accuracy of the hard predictions obtained with a 0.5 cut-off
    hard_preds = (positive_scores >= 0.5).astype(int)
    accuracy = accuracy_score(y_true, hard_preds)
    print(f'Accuracy: {accuracy*100:.2f}%')

    # ROC plot on the current axes, with the chance diagonal for reference
    ax = plt.gca()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % auc_value)
    ax.plot([0, 1], [0, 1], 'r--')
    ax.legend(loc='lower right')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

In [None]:
# Compute predicted probabilities
nb_model = OneVsRestClassifier(MultinomialNB(alpha=1.8))
nb_model.fit(X_train_tfidf, y_train)
probs = clf.predict_proba(X_val_tfidf)

# Evaluate the classifier
evaluate_roc(probs, y_val)