# Baseline for BERT: Tf-idf + Naive Bayes or SVC

In [107]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from scipy import interp
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import chi2

In [108]:
DATA_DIR = "~/dev/hist-aware/notebooks/data/labeled"

## Load data

In [109]:
df = pd.read_csv(os.path.join(DATA_DIR, "labeled_energy_1970_1990.csv"))
oil = pd.read_csv(os.path.join(DATA_DIR, "labeled_oil_1970_1990.csv"))
gas = pd.read_csv(os.path.join(DATA_DIR, "labeled_gas_1970_1990.csv"))
coal = pd.read_csv(os.path.join(DATA_DIR, "labeled_coal_1970_1990.csv"))

## Train / test

In [110]:
from sklearn.model_selection import train_test_split

X = df.text_clean.values
y = df.labels.values
#y = label_binarize(df.labels.values, classes=[0, 1, 2])

X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=2020)

## Set GPU for training

In [111]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
Device name: GeForce RTX 2080 Ti


## TF-IDF and Naive Bayes

### Data preprocessing

In [112]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words("dutch")

In [113]:
def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

### Preprocess already cleaned text

In [114]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text
X_train = np.array([text_preprocessing(text) for text in X_train])
X_val = np.array([text_preprocessing(text) for text in X_val])

CPU times: user 2.68 s, sys: 35 ms, total: 2.72 s
Wall time: 2.72 s


### Baseline Pipeline: TF-IDF vectorizer and MultinomialNB

In [121]:
text_clf_mnb = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf",  MultinomialNB())
])
text_clf_mnb.fit(X_train, y_train)

predicted = text_clf_mnb.predict(X_val)
np.mean(predicted == y_val)

0.7256140350877193

### Pipeline with SDG

In [122]:
from sklearn.linear_model import SGDClassifier
text_clf_sdg = Pipeline([
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 2), binary=True, smooth_idf=False)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
text_clf_sdg.fit(X_train, y_train)

predicted = text_clf_sdg.predict(X_val)
np.mean(predicted == y_val)

0.7656140350877193

### Predicted results

In [123]:
print(metrics.classification_report(y_val, predicted))
metrics.confusion_matrix(y_val, predicted)

              precision    recall  f1-score   support

           0       0.86      0.70      0.77       473
           1       0.72      0.96      0.82       719
           2       0.83      0.30      0.45       233

    accuracy                           0.77      1425
   macro avg       0.80      0.65      0.68      1425
weighted avg       0.79      0.77      0.74      1425



array([[331, 132,  10],
       [ 25, 689,   5],
       [ 27, 135,  71]])

### Parameters search

In [124]:
parameters = {
    'tf-idf__use_idf': (True, False),
    'tf-idf__norm': ('l1', 'l2', None),
    'tf-idf__max_df': (0.5, 0.75, 1.0),
    'tf-idf__max_features': (None, 5000, 10000, 50000),
    'tf-idf__ngram_range': ((1, 1), (1, 2), (1,3)),
    'clf__alpha': (0.00001, 0.000001),
}

In [125]:
gs_clf = GridSearchCV(text_clf_mnb, parameters, cv=10, n_jobs=-1, verbose=10)

In [126]:
gs_clf = gs_clf.fit(X_train, y_train)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

In [127]:
gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 1e-06
tf-idf__max_df: 0.5
tf-idf__max_features: None
tf-idf__ngram_range: (1, 3)
tf-idf__norm: 'l1'
tf-idf__use_idf: True


## Prediction

In [128]:
from sklearn.linear_model import SGDClassifier
text_clf_sdg = Pipeline([
    ("tf-idf", TfidfVectorizer(max_df = 0.5, max_features=None, ngram_range = (1, 3), norm = 'l1', use_idf = True)),
    ("clf",  MultinomialNB(alpha = 1e-06))
])
text_clf_sdg.fit(X_train, y_train)

predicted = text_clf_sdg.predict(X_val)
np.mean(predicted == y_val)

0.8336842105263158

Find all csv files

In [135]:
import os
for file in os.listdir("/home/leonardovida/hist-aware/data/to_label"):
    if file.endswith(".csv"):
        file_path = os.path.join("/home/leonardovida/hist-aware/data/to_label", file)
        df = pd.read_csv(file_path)
        df = df["text_clean"]

/home/leonardovida/hist-aware/data/to_label/1980s_olie_to_label_.csv
/home/leonardovida/hist-aware/data/to_label/1980s_kool_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1990s_olie_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1980s_gas_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1990s_kool_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1970s_kool_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1990s_gas_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1970s_olie_to_label.csv
/home/leonardovida/hist-aware/data/to_label/1970s_gas_to_label.csv
