# Process selected labeled data and create DFs

In [88]:
from datasets import load_dataset
from datasets import Dataset as DT
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format='retina'

import math
from collections import defaultdict
from textwrap import wrap
import numpy as np

import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModel, AutoTokenizer, AutoConfig,
                          Trainer, TrainingArguments)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder

import wandb

import nltk.data
from nltk.tokenize import sent_tokenize
from nltk.corpus import alpino

from imblearn.over_sampling import RandomOverSampler

In [62]:
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [63]:
PRE_TRAINED_MODEL_NAME = 'wietsedv/bert-base-dutch-cased'
LEN_SENTS = 100

## Load csv

In [65]:
list_cols = ['sentiment', 'text', 'energy', 'article_filepath', 'article_name', 'count', 'date', 'dir', 'index_article', 'index_metadata', 'metadata_filepath',
                    'newspaper_language', 'newspaper_publisher', 'newspaper_source', 'newspaper_title', 'newspaper_volume', 
             'newspaper_issuenumber', 'newspaper_city', 'text_clean', 'type']

In [66]:
def clean_df(df):
    """Function to clean df after concat"""
    df.text.replace('', np.nan, inplace=True)
    df.dropna(subset=['text'], inplace=True)
    df.labels.replace('', np.nan, inplace=True)
    df.dropna(subset=['labels'], inplace=True)
    return df

### Gas

In [67]:
gas_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_gas.csv")
gas_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_gas_labeled.csv")

gas_1980 = gas_1980[list_cols]
gas_1990 = gas_1990[list_cols]

gas = gas_1980.append(gas_1990, ignore_index=True)
gas = gas[gas.energy == "Y"]
gas = gas[gas.sentiment != None]
gas.rename(columns = {"sentiment": "labels"}, inplace=True)
gas = clean_df(gas)

### Oil

In [68]:
oil_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_oil.csv")
oil_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_olie_labeled.csv")

oil_1980 = oil_1980[list_cols]
oil_1990 = oil_1980[list_cols]

oil = oil_1980.append(oil_1990, ignore_index=True)
oil = oil[oil.energy == "Y"]
oil = oil[oil.sentiment != None]
oil.rename(columns = {"sentiment": "labels"}, inplace=True)
oil = clean_df(oil)

### Coal

In [69]:
coal_1980 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1980s_coal.csv")
coal_1990 = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/edo_1990s_kool_labeled.csv")
coal_1990.drop(["sentiment_gas", "sentiment_oil"], axis=1, inplace=True)
coal_1990.rename(columns = {"sentiment": "accuracy_selection", "sentiment_coal": "sentiment"}, inplace=True)

coal_1980 = coal_1980[list_cols]
coal_1990 = coal_1990[list_cols]

coal = coal_1980.append(coal_1990, ignore_index=True)
coal = coal[coal.energy == "Y"]
coal = coal[coal.sentiment != None]
coal.rename(columns = {"sentiment": "labels"}, inplace=True)
coal = clean_df(coal)

### General df

In [70]:
df = pd.concat([gas, oil, coal], ignore_index=True)
df = clean_df(df)
df.shape

(2773, 20)

## Fix labels

In [71]:
cleanup_sentiment = {"labels": {"VN": 1, "NG": 2, "NE": 3, "PO": 4, "VP": 5}}
oil = oil.replace(cleanup_sentiment)
gas = gas.replace(cleanup_sentiment)
coal = coal.replace(cleanup_sentiment)
df = df.replace(cleanup_sentiment)

In [72]:
#ax = sns.countplot(df.sentiment)
#plt.xlabel('review sentiment')

Reduce from 5 labels to 3 because of lack of labels

In [73]:
def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

df['labels'] = df.labels.apply(to_sentiment)
gas['labels'] = gas.labels.apply(to_sentiment)
coal['labels'] = coal.labels.apply(to_sentiment)
oil['labels'] = oil.labels.apply(to_sentiment)

In [74]:
#ax = sns.countplot(df.sentiment)
#plt.xlabel('review sentiment')

### Split text and explode

In [75]:
def unite(l, n):
    """Unite sentences previously split using nltk.tokenize."""
    count = []
    chunks = []
    sents = []
    for s in l:
        count.append(len(s.split()))
    value = 0
    prev_idx = 0
    for i in range(0, len(count)):
        if value == 0:
            value = value + count[i]
        elif (i+1 == len(count)):
            chunks.append(l[prev_idx:i])
            value = 0
        elif value >= n:
            chunks.append(l[prev_idx:i])
            prev_idx = i
            value = 0
        else:
             value = value + count[i]
    for c in chunks:
        sents.append(' '.join(c))
    return(sents)

In [76]:
def splitter(s, n):
    """Split sentences only using the number of words."""
    pieces = s.split()
    return [" ".join(pieces[i:i+n]) for i in range(0, len(pieces), n)]

In [77]:
def apply_split_text(df):
    df["text_split"] = df["text"].apply(sent_tokenize)
    df["text_split"] = df["text_split"].apply(unite, n = LEN_SENTS)
    df.text_split.replace([], np.nan, inplace=True)
    df.dropna(subset=['text_split'], inplace=True)
    # Cancel all text_split == 0
    df.drop(df[df.text_split.map(len) == 0].index, inplace=True)
    # Currently not splitting the cleaned sentences
    #df["text_clean_split"] = df["text_clean"].apply(splitter, n = LEN_SENTS)
    return df

In [78]:
oil = apply_split_text(oil)
gas = apply_split_text(gas)
coal = apply_split_text(coal)
df = apply_split_text(df)

Explode the sentences that we created previously

In [79]:
df = df.explode('text_split')
gas = gas.explode('text_split')
coal = coal.explode('text_split')
oil = oil.explode('text_split')

In [102]:
df.to_csv("~/dev/hist-aware/notebooks/sentiment/df.csv")
gas.to_csv("~/dev/hist-aware/notebooks/sentiment/gas.csv")
coal.to_csv("~/dev/hist-aware/notebooks/sentiment/coal.csv")
oil.to_csv("~/dev/hist-aware/notebooks/sentiment/oil.csv")