# Baseline for BERT: Tf-idf + Naive Bayes or SVC

In [11]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from scipy import interp
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import chi2

In [12]:
DATA_DIR = "~/dev/hist-aware/notebooks/data/labeled"

## Load data

In [19]:
df = pd.read_csv(os.path.join(DATA_DIR, "labeled_energy_1960_1990.csv"))
oil = pd.read_csv(os.path.join(DATA_DIR, "labeled_oil_1960_1990.csv"))
gas = pd.read_csv(os.path.join(DATA_DIR, "labeled_gas_1960_1990.csv"))
coal = pd.read_csv(os.path.join(DATA_DIR, "labeled_coal_1960_1990.csv"))

## Train / test

In [20]:
from sklearn.model_selection import train_test_split

X = df.text_clean.values
y = df.labels.values
#y = label_binarize(df.labels.values, classes=[0, 1, 2])

X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=2020)

## Set GPU for training

In [6]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## TF-IDF and Naive Bayes

### Data preprocessing

In [21]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words("dutch")

In [22]:
def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove trailing whitespace
    """
    s = s.lower()
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

### Preprocess already cleaned text

In [23]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text
X_train = np.array([text_preprocessing(text) for text in X_train])
X_val = np.array([text_preprocessing(text) for text in X_val])

CPU times: user 1.81 s, sys: 26.5 ms, total: 1.83 s
Wall time: 1.85 s


### Baseline Pipeline: TF-IDF vectorizer and MultinomialNB

In [9]:
text_clf_mnb = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf",  MultinomialNB())
])
text_clf_mnb.fit(X_train, y_train)

predicted = text_clf_mnb.predict(X_val)
np.mean(predicted == y_val)

0.6460176991150443

### Pipeline with SDG

In [10]:
from sklearn.linear_model import SGDClassifier
text_clf_sdg = Pipeline([
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 2), binary=True, smooth_idf=False)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
text_clf_sdg.fit(X_train, y_train)

predicted = text_clf_sdg.predict(X_val)
np.mean(predicted == y_val)

0.7103781174577635

### Predicted results

In [63]:
print(metrics.classification_report(y_val, predicted))
metrics.confusion_matrix(y_val, predicted)

ValueError: Found input variables with inconsistent numbers of samples: [285, 528]

### Parameters search

In [124]:
parameters = {
    'tf-idf__use_idf': (True, False),
    'tf-idf__norm': ('l1', 'l2', None),
    'tf-idf__max_df': (0.5, 0.75, 1.0),
    'tf-idf__max_features': (None, 5000, 10000, 50000),
    'tf-idf__ngram_range': ((1, 1), (1, 2), (1,3)),
    'clf__alpha': (0.00001, 0.000001),
}

In [125]:
gs_clf = GridSearchCV(text_clf_mnb, parameters, cv=10, n_jobs=-1, verbose=10)

In [126]:
gs_clf = gs_clf.fit(X_train, y_train)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

In [127]:
gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 1e-06
tf-idf__max_df: 0.5
tf-idf__max_features: None
tf-idf__ngram_range: (1, 3)
tf-idf__norm: 'l1'
tf-idf__use_idf: True


## Prediction

In [25]:
from sklearn.linear_model import SGDClassifier
text_clf_sdg = Pipeline([
    ("tf-idf", TfidfVectorizer(max_df = 0.5, max_features=None, ngram_range = (1, 3), norm = 'l1', use_idf = True)),
    ("clf",  MultinomialNB(alpha = 1e-06))
])
text_clf_sdg.fit(X_train, y_train)

predicted = text_clf_sdg.predict(X_val)
np.mean(predicted == y_val)

0.6338406445837064

Find all csv files

---

# Predictions

In [9]:
DIR = "/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected"
import os
import imblearn

df = []
for root, dirs, files in os.walk(DIR):
    for file in files:
        if file.endswith('.csv'):
            if "oil" in file:
                df.append(pd.read_csv(os.path.join(root, file)))
                print(os.path.join(root, file))

/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected/1960s/to_label_oil.csv
/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected/1970s/to_label_oil.csv


## Oil

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb

# Load data
oil = pd.read_csv(os.path.join(DATA_DIR, "labeled_oil_1960_1990.csv"))
df = oil.copy()

# Copy vectors
X = df.text_clean.values
y = df.labels.values

# Split
X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=2020)

# Preprocess text
X_train = np.array([text_preprocessing(text) for text in X_train])
X_val = np.array([text_preprocessing(text) for text in X_val])

# Pipeline
model_oil = make_pipeline_imb(
    TfidfVectorizer(max_df = 0.5, max_features=None, ngram_range = (1, 3), norm = 'l1', use_idf = True),
    RandomOverSampler(sampling_strategy='minority'),
    MultinomialNB(alpha = 1e-06))
model_oil.fit(X_train, y_train)
y_pred = model_oil.predict(X_val)

print(classification_report_imbalanced(y_val, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.68      0.93      0.75      0.80      0.62       159
          1       0.75      0.92      0.71      0.83      0.80      0.66       230
          2       0.82      0.60      0.97      0.69      0.76      0.56        75

avg / total       0.79      0.78      0.83      0.78      0.79      0.63       464



In [27]:
df_oil = pd.read_csv("/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected/1970s/to_label_oil.csv")

In [28]:
df_oil.dropna(subset=["text_clean"], inplace=True)
preds = model_oil.predict(df_oil["text_clean"])
df_oil["prediction_sentiment_nb"] = preds

## Gas

In [29]:
# Load data
gas = pd.read_csv(os.path.join(DATA_DIR, "labeled_gas_1960_1990.csv"))
df = gas.copy()

# Copy vectors
X = df.text_clean.values
y = df.labels.values

# Split
X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=2020)

# Preprocess text
X_train = np.array([text_preprocessing(text) for text in X_train])
X_val = np.array([text_preprocessing(text) for text in X_val])

# Pipeline
model_gas = make_pipeline_imb(
    TfidfVectorizer(max_df = 0.5, max_features=None, ngram_range = (1, 3), norm = 'l1', use_idf = True),
    RandomOverSampler(sampling_strategy='minority'),
    MultinomialNB(alpha = 1e-06))
model_gas.fit(X_train, y_train)
y_pred = model_gas.predict(X_val)

print(classification_report_imbalanced(y_val, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.33      0.96      0.46      0.57      0.30        87
          1       0.57      0.88      0.32      0.69      0.53      0.30       186
          2       0.66      0.29      0.95      0.40      0.52      0.26        93

avg / total       0.63      0.60      0.63      0.56      0.54      0.29       366



In [30]:
df_gas = pd.read_csv("/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected/1970s/to_label_gas.csv")

In [31]:
df_gas.dropna(subset=["text_clean"], inplace=True)
preds = model_gas.predict(df_gas["text_clean"])
df_gas["prediction_sentiment_nb"] = preds

## Coal

In [41]:
# Load data
coal = pd.read_csv(os.path.join(DATA_DIR, "labeled_coal_1960_1990.csv"))
df = coal.copy()

# Copy vectors
X = df.text_clean.values
y = df.labels.values

# Split
X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=2020)

# Preprocess text
X_train = np.array([text_preprocessing(text) for text in X_train])
X_val = np.array([text_preprocessing(text) for text in X_val])

# Pipeline
model_coal = make_pipeline_imb(
    TfidfVectorizer(max_df = 0.5, max_features=None, ngram_range = (1, 3), norm = 'l1', use_idf = True),
    RandomOverSampler(sampling_strategy='minority'),
    MultinomialNB(alpha = 1e-06))
model_coal.fit(X_train, y_train)
y_pred = model_coal.predict(X_val)

print(classification_report_imbalanced(y_val, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.75      0.51      0.93      0.61      0.69      0.46        82
          1       0.55      0.67      0.66      0.60      0.66      0.44       111
          2       0.56      0.57      0.78      0.56      0.66      0.43        95

avg / total       0.61      0.59      0.77      0.59      0.67      0.44       288



In [33]:
df_coal = pd.read_csv("/Users/leonardovida/Dropbox/work/1_projects/2_histaware/data/raw/raw_selected/1970s/to_label_coal.csv")

In [34]:
df_coal.dropna(subset=["text_clean"], inplace=True)
preds = model_coal.predict(df_coal["text_clean"])
df_coal["prediction_sentiment_nb"] = preds

In [35]:
df_coal.to_csv("/Users/leonardovida/Desktop/df_coal_nb.csv")
df_oil.to_csv("/Users/leonardovida/Desktop/df_oil_.csv")
df_gas.to_csv("/Users/leonardovida/Desktop/df_gas_nb.csv")