# Create Dataset

In [1]:
from datasets import load_dataset
from datasets import Dataset as DT
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format='retina'

import math
from collections import defaultdict
from textwrap import wrap
import numpy as np

import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder

import wandb

import nltk.data
from nltk.tokenize import sent_tokenize
from nltk.corpus import alpino

from imblearn.over_sampling import RandomOverSampler

In [2]:
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
PRE_TRAINED_MODEL_NAME = 'wietsedv/bert-base-dutch-cased'
LEN_SENTS = 100

In [4]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241440.0, style=ProgressStyle(descripti…




## Load csv

In [5]:
list_cols = ['sentiment', 'text', 'energy', 'article_filepath', 'article_name', 'count', 'date', 'dir', 'index_article', 'index_metadata', 'metadata_filepath',
                    'newspaper_language', 'newspaper_publisher', 'newspaper_source', 'newspaper_title', 'newspaper_volume', 
             'newspaper_issuenumber', 'newspaper_city', 'text_clean', 'type']

In [6]:
def clean_df(df):
    """Function to clean df after concat"""
    df.text.replace('', np.nan, inplace=True)
    df.dropna(subset=['text'], inplace=True)
    df.labels.replace('', np.nan, inplace=True)
    df.dropna(subset=['labels'], inplace=True)
    return df

### Gas

In [7]:
gas_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_gas.csv")
gas_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_gas_labeled.csv")

gas_1980 = gas_1980[list_cols]
gas_1990 = gas_1990[list_cols]

gas = gas_1980.append(gas_1990, ignore_index=True)
gas = gas[gas.energy == "Y"]
gas = gas[gas.sentiment != None]
gas.rename(columns = {"sentiment": "labels"}, inplace=True)
gas = clean_df(gas)

### Oil

In [8]:
oil_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_oil.csv")
oil_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_olie_labeled.csv")

oil_1980 = oil_1980[list_cols]
oil_1990 = oil_1980[list_cols]

oil = oil_1980.append(oil_1990, ignore_index=True)
oil = oil[oil.energy == "Y"]
oil = oil[oil.sentiment != None]
oil.rename(columns = {"sentiment": "labels"}, inplace=True)
oil = clean_df(oil)

### Coal

In [9]:
coal_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_coal.csv")
coal_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_kool_labeled.csv")
coal_1990.drop(["sentiment_gas", "sentiment_oil"], axis=1, inplace=True)
coal_1990.rename(columns = {"sentiment": "accuracy_selection", "sentiment_coal": "sentiment"}, inplace=True)

coal_1980 = coal_1980[list_cols]
coal_1990 = coal_1990[list_cols]

coal = coal_1980.append(coal_1990, ignore_index=True)
coal = coal[coal.energy == "Y"]
coal = coal[coal.sentiment != None]
coal.rename(columns = {"sentiment": "labels"}, inplace=True)
coal = clean_df(coal)

### General df

In [10]:
df = pd.concat([gas, oil, coal], ignore_index=True)
df = clean_df(df)
df.shape

(2773, 20)

## Fix labels

In [11]:
cleanup_sentiment = {"labels": {"VN": 1, "NG": 2, "NE": 3, "PO": 4, "VP": 5}}
oil = oil.replace(cleanup_sentiment)
gas = gas.replace(cleanup_sentiment)
coal = coal.replace(cleanup_sentiment)
df = df.replace(cleanup_sentiment)

In [12]:
#ax = sns.countplot(df.sentiment)
#plt.xlabel('review sentiment')

Reduce from 5 labels to 3 because of lack of labels

In [13]:
def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

df['labels'] = df.labels.apply(to_sentiment)
gas['labels'] = gas.labels.apply(to_sentiment)
coal['labels'] = coal.labels.apply(to_sentiment)
oil['labels'] = oil.labels.apply(to_sentiment)

In [14]:
#ax = sns.countplot(df.sentiment)
#plt.xlabel('review sentiment')

### Split text and explode

In [15]:
def unite(l, n):
    """Unite sentences previously split using nltk.tokenize."""
    count = []
    chunks = []
    sents = []
    for s in l:
        count.append(len(s.split()))
    value = 0
    prev_idx = 0
    for i in range(0, len(count)):
        if value == 0:
            value = value + count[i]
        elif (i+1 == len(count)):
            chunks.append(l[prev_idx:i])
            value = 0
        elif value >= n:
            chunks.append(l[prev_idx:i])
            prev_idx = i
            value = 0
        else:
             value = value + count[i]
    for c in chunks:
        sents.append(' '.join(c))
    return(sents)

In [16]:
def splitter(s, n):
    """Split sentences only using the number of words."""
    pieces = s.split()
    return [" ".join(pieces[i:i+n]) for i in range(0, len(pieces), n)]

In [17]:
def apply_split_text(df):
    df["text_split"] = df["text"].apply(sent_tokenize)
    df["text_split"] = df["text_split"].apply(unite, n = LEN_SENTS)
    df.text_split.replace([], np.nan, inplace=True)
    df.dropna(subset=['text_split'], inplace=True)
    # Cancel all text_split == 0
    df.drop(df[df.text_split.map(len) == 0].index, inplace=True)
    # Currently not splitting the cleaned sentences
    #df["text_clean_split"] = df["text_clean"].apply(splitter, n = LEN_SENTS)
    return df

In [22]:
oil = apply_split_text(oil)
gas = apply_split_text(gas)
coal = apply_split_text(coal)
df = apply_split_text(df)

Explode the sentences that we created previously

In [23]:
df = df.explode('text_split')
gas = gas.explode('text_split')
coal = coal.explode('text_split')
oil = oil.explode('text_split')

In [57]:
df.to_csv("~/dev/hist-aware/notebooks/sentiment/df.csv")
#gas.to_csv("~/dev/hist-aware/notebooks/sentiment/gas.csv")
#coal.to_csv("~/dev/hist-aware/notebooks/sentiment/coal.csv")
#oil.to_csv("~/dev/hist-aware/notebooks/sentiment/oil.csv")

## Create Dataset

#### Using Dataset from HF

In [24]:
dataset = DT.from_pandas(oil)

In [56]:
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.2)
dataset = dataset.map(lambda par: tokenizer(
    par['text_split'],
    truncation=True,
    padding=True),
                      batched=True)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [49]:
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.2)
dataset = dataset.map(lambda par: tokenizer.encode_plus(
            par['text_split'],
            add_special_tokens=True,
            padding='max_length',
            max_length=350,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        ))

HBox(children=(FloatProgress(value=0.0, max=2686.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=672.0), HTML(value='')))




#### Manual dataset creation with Dataset class

In [37]:
DF = df
MAX_LEN = 350

# Parameters
batch_size = 8
num_workers = 0
max_epochs = 100

In [46]:
from sklearn.model_selection import train_test_split
# Divide into train and val
train_texts, val_texts, train_labels, val_labels = train_test_split(df["text_split"], df["labels"], test_size=.3)
# Divide into val and test
test_texts, val_texts, test_labels, val_labels = train_test_split(val_texts, val_labels, test_size=.5)

In [48]:
class HADataset(Dataset):
    # Characterizes a dataset for Pytorch
    def __init__(self, articles, labels, tokenizer, max_len):
        # Initialization
        self.articles = articles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        # Total number of articles
        return len(self.articles)

    def __getitem__(self, item):
        # Generates one sample of the data/article
        article = str(self.articles[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
          'article_text': article,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(label, dtype=torch.long)
        }

In [51]:
train_dataset = HADataset(train_texts.to_numpy(), train_labels.to_numpy(), tokenizer, MAX_LEN)
val_dataset = HADataset(val_texts.to_numpy(), val_labels.to_numpy(), tokenizer, MAX_LEN)
test_dataset = HADataset(test_texts.to_numpy(), test_labels.to_numpy(), tokenizer, MAX_LEN)

In [32]:
def create_data_loader(df, tokenizer, batch_size, num_workers, MAX_LEN):
    ds = HADataset(
        articles=df.text_split.to_numpy(),
        labels=df.labels.to_numpy(),
        tokenizer=tokenizer,
        max_len=MAX_LEN
      )
    
    return DataLoader(
        ds,
        batch_size,
        num_workers
      )

In [33]:
# Create dataloaders
train_data_loader = create_data_loader(DF, tokenizer, batch_size, num_workers, MAX_LEN)
val_data_loader = create_data_loader(DF, tokenizer, batch_size, num_workers, MAX_LEN)
test_data_loader = create_data_loader(DF, tokenizer, batch_size, num_workers, MAX_LEN)

## Train

In [55]:
from transformers import AutoModel, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = "hist-aware/notebooks/models",
    overwrite_output_dir = False,
    evaluation_strategy="steps",
    per_device_train_batch_size=4, # default is 8
    per_device_eval_batch_size=4, # default is 8
    
    logging_dir="hist-aware/notebooks/logging",
    logging_steps=10,
    
    warmup_steps=500,                # number of warmup steps for learning rate scheduler    
    eval_steps=500,
    weight_decay=0.01,               # strength of weight decay
    num_train_epochs=10,              # total number of training epochs
    
    label_names="labels", # check this
    disable_tqdm=False
)

model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/leonardovida/.cache/pypoetry/virtualenvs/histaware-NidRwJ64-py3.8/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/leonardovida/.cache/pypoetry/virtualenvs/histaware-NidRwJ64-py3.8/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
TypeError: forward() got an unexpected keyword argument 'labels'


### Automatic Training

In [51]:
from transformers import (AutoModel, AutoTokenizer, AutoConfig,
                          Trainer, TrainingArguments)

config = AutoConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)

bert_model = AutoModel.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels = 3)
    #output_attentions = False, # Whether the model returns attentions weights.
    #output_hidden_states = False, # Whether the model returns all hidden-states
    #return_dict=True)

In [52]:
training_args = TrainingArguments(
    output_dir = "hist-aware/notebooks/models",
    overwrite_output_dir = False,
    evaluation_strategy="steps",
    per_device_train_batch_size=4, # default is 8
    per_device_eval_batch_size=4, # default is 8
    logging_dir="hist-aware/notebooks/logging",
    eval_steps=500,
    seed=RANDOM_SEED,
    label_names="labels", # check this
    disable_tqdm=False
)

In [53]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def model_init():
    return bert_model

def compute_metrics_old(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Evaluate during training and a bit more often
# than the default to be able to prune bad trials early.
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    model_init=model_init,
    #compute_metrics=compute_metrics,
)

trainer.train()
# Defaut objective is the sum of all metrics
# when metrics are provided, so we have to maximize it.
#trainer.hyperparameter_search(
#    direction="maximize", 
#    backend="ray", 
#   n_trials=100, # deafult 100
#    # n_jobs=2  # number of parallel jobs, if multiple GPUs
#)

ValueError: type of [1, 0, 2358, 15730, 22307, 11, 6, 22693, 10537, 30, 23225, 29566, 11, 0, 15266, 11315, 13903, 12005, 8472, 13, 6, 3570, 11552, 13903, 11315, 15266, 22976, 10806, 25, 13041, 20722, 10537, 21953, 13644, 14742, 11, 0, 11281, 18883, 13, 7529, 11037, 13903, 22339, 10532, 5108, 13644, 11130, 17898, 17721, 28515, 21843, 14545, 16348, 22867, 11788, 13, 7778, 22795, 26709, 22384, 17795, 20722, 11130, 20722, 16760, 22331, 15963, 18312, 27102, 11, 13261, 8048, 11, 16804, 22300, 9645, 19732, 13, 3631, 13261, 16058, 20722, 3137, 17572, 11, 10516, 13903, 13261, 22339, 20722, 2511, 20407, 24660, 26751, 113, 8, 0, 13, 2003, 25128, 22777, 5114, 20183, 25108, 16804, 11130, 13136, 16804, 13261, 22795, 17179, 117, 7826, 25206, 27533, 117, 131, 117, 22834, 13644, 1503, 8394, 23984, 8227, 24770, 23967, 8421, 0, 12847, 128, 25099, 26457, 26148, 13, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] unknown: <class 'list'>. Should be one of a python, numpy, pytorch or tensorflow object.