In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle
import glob
from sklearn.metrics import confusion_matrix, classification_report, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from text_manager import TextManager
from embedding import Word2VecEmbedding
from cnn_model import CNN_Model
from tensorflow.keras.utils import to_categorical
import numpy as np

In [2]:
# Scratch directory: data_prep writes its pickles here and train_model reads them back.
TEMP_DATA_DIR = '../../data/tmp'
# Word-vector file handed to Word2VecEmbedding.load_word2vec_data.
EMBEDDING_PATH = "../../data/320/combined-320.txt"
# Directory of labelled CSV files; data_prep joins its data_fp argument onto this.
LABELS_DIR = '../../data/labeled_all'
# Directory where train_model stores saved models and prediction CSVs.
OUTPUT_DIR = '../../data/output'
# Vocabulary cap passed to TextManager and Word2VecEmbedding.
MAX_NUM_WORDS = 20000
# Sequence length passed to TextManager, Word2VecEmbedding and CNN_Model.
# NOTE(review): some runs report a max text length of 11644 -- confirm that
# cutting at 11600 is intended and that both numbers use the same unit.
MAX_SEQUENCE_LENGTH = 11600

In [3]:
def load_dataset(data_fp):
    """Read every CSV file matching a glob pattern and return its texts and labels.

        Parameters
        ----------
        data_fp: str
                glob pattern (or plain path) of the CSV file(s) to read; every
                file must provide a 'text' and a 'labels' column

        Return
        ----------
        tuple of two numpy arrays: the 'text' column with missing values
        replaced by '' and the 'labels' column. Rows of all matched files are
        concatenated in sorted file-path order.

        Raises
        ----------
        ValueError
                if no file matches data_fp
    """
    # glob returns matches in arbitrary (filesystem) order; sorting keeps the
    # row order stable so a seeded split downstream yields the same partition.
    fps = sorted(glob.glob(data_fp))
    if not fps:
        raise ValueError('No files found matching {!r}'.format(data_fp))

    df = pd.concat([pd.read_csv(fp) for fp in fps], ignore_index=True)

    # make texts and labels; empty cells are read as NaN, so turn them into ''
    texts = df['text'].fillna('')
    labels = df["labels"]

    return texts.values, labels.values


def data_prep(data_fp,output_fp='hist_aware_pickle.pickle'):
    """Preprocess data including cleaning and tokenizing,
         save data, embedding layer, tokenizer and word-index in pickle files

        Two files are written under TEMP_DATA_DIR: `output_fp` holds
        (data, labels, embedding_layer) and 'tokenz_' + `output_fp` holds
        (tokenizer, word_index).

        Parameters
        ----------
        data_fp: str
                file name or glob pattern of the data, relative to LABELS_DIR
        output_fp: str
                file name of the output pickle, relative to TEMP_DATA_DIR
    """

    data_fp = os.path.join(LABELS_DIR, data_fp)
    pickle_fp = os.path.join(TEMP_DATA_DIR, output_fp)
    tokenz_pickle_fp = os.path.join(TEMP_DATA_DIR, 'tokenz_'+output_fp)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(os.path.dirname(pickle_fp), exist_ok=True)

    # load the texts and their corresponding labels from disk
    texts, lbls = load_dataset(data_fp)

    textManager = TextManager(
        max_num_words = MAX_NUM_WORDS,
        max_sequence_length = MAX_SEQUENCE_LENGTH
    )

    texts = textManager.clean_text(texts)
    # default=0 keeps this diagnostic from raising on an empty corpus
    print('max length of all texts', max((len(t) for t in texts), default=0))

    word_index, tokenizer = textManager.create_tokenizer(texts)
    data = textManager.sequence_maker(tokenizer, texts)

    embedding = Word2VecEmbedding(word_index, MAX_NUM_WORDS,
                                  MAX_SEQUENCE_LENGTH)
    embedding.load_word2vec_data(EMBEDDING_PATH)
    embedding_layer = embedding.build_embedding()

    # one-hot encode the labels (assumes integer class ids -- TODO confirm)
    labels = to_categorical(np.asarray(lbls))

    # Dataset, labels and embedding layer are stored to disk in pickle file.
    with open(pickle_fp, 'wb') as f:
        pickle.dump((data, labels, embedding_layer), f)

    # Tokenizer and word index go to a sibling pickle.
    with open(tokenz_pickle_fp, 'wb') as f:
        pickle.dump((tokenizer,word_index), f)


def train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims=50, model_fp_suff='all',pkl_fp='hist_aware_pickle.pickle'):
    """Train a cnn model, using the given parameters,
            save the model and make a prediction on validation dataset

        The pickle is split 67/33 into train/validation (stratified on the
        labels, random_state=0). Three artefacts are written, all sharing a
        file-name stem built from the hyper-parameters and `model_fp_suff`:
        the model under OUTPUT_DIR/cnn_model (.h5), a CSV with the true and
        predicted class names ('y_val', 'pred') and a CSV with the predicted
        class scores, both under OUTPUT_DIR.

        Parameters
        ----------
        dropout, optimizer, hidden_dims:
                forwarded unchanged to CNN_Model
        batch_size, epoch_no:
                forwarded unchanged to CNN_Model.train
        model_fp_suff: str
                suffix appended to every output file name
        pkl_fp: str
                pickle written by data_prep, relative to TEMP_DATA_DIR

        Return
        ----------
        the trained CNN_Model instance
    """
    pickle_fp = os.path.join(TEMP_DATA_DIR, pkl_fp)

    # Read dataset, labels and embedding layer from pickle file.
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at pickles produced by data_prep.
    with open(pickle_fp, 'rb') as f:
        data, labels, embedding_layer = pickle.load(f)

    # Split dataset to train and test
    x_train, x_val, y_train, y_val = train_test_split(data, labels,
                                                      test_size=0.33,
                                                      random_state=0,
                                                      stratify=labels)

    print("x_train shape:", x_train.shape, ", x_val shape:", x_val.shape)
    print("y_train shape:", y_train.shape, ", y_val shape:", y_val.shape)

    # Make a cnn model
    model = CNN_Model(
        dropout=dropout,
        optimizer=optimizer,
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        embedding_layer=embedding_layer,
        kernel_size=(3, 8),
        num_filters=128,
        hidden_dims=hidden_dims,
    )

    # Train model, save model, and make prediction
    model.train(x_train, y_train, x_val, y_val, batch_size, epoch_no)

    # Common stem of all output file names for this run.
    run_tag = 'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}'.format(
        dropout, optimizer, batch_size, epoch_no, hidden_dims)

    # Creating the model sub-directory also creates OUTPUT_DIR itself.
    model_dir = os.path.join(OUTPUT_DIR, 'cnn_model')
    os.makedirs(model_dir, exist_ok=True)
    model.save(os.path.join(model_dir, '{}_{}.h5'.format(run_tag, model_fp_suff)))

    pred = model.predict(x_val)

    # Save result in dataframe; indexing columns 0..2 assumes three classes.
    df_y = pd.DataFrame({'sent_0': y_val[:,0],'sent_1': y_val[:,1],'sent_2': y_val[:,2]})
    df_pred = pd.DataFrame({'sent_0': pred[:,0],'sent_1': pred[:,1],'sent_2': pred[:,2]})

    # idxmax over the columns turns one-hot rows / score rows into class names.
    result_df = pd.concat([df_y.idxmax(axis=1),df_pred.idxmax(axis=1)], axis=1)
    result_df.columns =['y_val','pred']

    # Save the result to a file
    export_path = os.path.join(OUTPUT_DIR, '{}_{}.csv'.format(run_tag, model_fp_suff))
    result_df.to_csv(export_path,index=False)
    df_pred.to_csv(os.path.join(OUTPUT_DIR, '{}_preds_prob_{}.csv'.format(run_tag, model_fp_suff)))

    return model

def get_classification_report(y_test, preds):
    """Return the classification report of `preds` against `y_test` as a DataFrame.

    The report dict is transposed so that each class / average becomes a row
    and each metric a column.
    """
    report_dict = classification_report(y_test, preds, output_dict=True)
    report_df = pd.DataFrame(report_dict)
    return report_df.T

## Data preparation

### The entire dataset

In [17]:
# Entire dataset: data_prep already joins data_fp onto LABELS_DIR, so only the
# glob pattern is given here (repeating the directory doubled the prefix).
data_fp = "*.csv"
output_fp='hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

../../data/tmp
max length of all texts 11644
Found 28422 unique tokens.
Shape of data tensor: (3095, 11600)
Shape of label tensor: (3095, 3)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (20000, 320)
../../data/tmp/hist_aware_pickle.pickle


### 1960-oil

In [17]:
# 1960s oil subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1960s_oil_merged.csv"
output_fp='1960_oil_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 6281
Found 12939 unique tokens.
Shape of data tensor: (397, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (12940, 320)


### 1960-coal

In [18]:
# 1960s coal subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1960s_coal_merged.csv"
output_fp='1960_coal_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5838
Found 8330 unique tokens.
Shape of data tensor: (258, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (8331, 320)


### 1960-gas

In [19]:
# 1960s gas subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1960s_gas_merged.csv"
output_fp='1960_gas_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5986
Found 11257 unique tokens.
Shape of data tensor: (433, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (11258, 320)


### 1970-oil

In [17]:
# 1970s oil subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1970s_oil_merged.csv"
output_fp='1970_oil_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

../../data/tmp
max length of all texts 11644
Found 14024 unique tokens.
Shape of data tensor: (500, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (14025, 320)


### 1970-coal

In [8]:
# 1970s coal subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1970s_coal_merged.csv"
output_fp='1970_coal_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5461
Found 7038 unique tokens.
Shape of data tensor: (344, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (7039, 320)


### 1970-gas

In [9]:
# 1970s gas subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1970s_gas_merged.csv"
output_fp='1970_gas_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5461
Found 6096 unique tokens.
Shape of data tensor: (179, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (6097, 320)


### 1980-oil

In [10]:
# 1980s oil subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1980s_oil_merged.csv"
output_fp='1980_oil_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 6402
Found 12258 unique tokens.
Shape of data tensor: (484, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (12259, 320)


### 1980-coal

In [11]:
# 1980s coal subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1980s_coal_merged.csv"
output_fp='1980_coal_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 6402
Found 7756 unique tokens.
Shape of data tensor: (325, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (7757, 320)


### 1980-gas

In [12]:
# 1980s gas subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1980s_gas_merged.csv"
output_fp='1980_gas_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5592
Found 10972 unique tokens.
Shape of data tensor: (511, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (10973, 320)


### 1990-oil

In [13]:
# 1990s oil subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1990s_oil_merged.csv"
output_fp='1990_oil_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5103
Found 5485 unique tokens.
Shape of data tensor: (193, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (5486, 320)


### 1990-coal

In [14]:
# 1990s coal subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1990s_coal_merged.csv"
output_fp='1990_coal_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5103
Found 7628 unique tokens.
Shape of data tensor: (337, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (7629, 320)


### 1990-gas

In [16]:
# 1990s gas subset: data_fp is resolved against LABELS_DIR, pickles land in TEMP_DATA_DIR.
data_fp = "1990s_gas_merged.csv"
output_fp='1990_gas_hist_aware_pickle.pickle'
data_prep(data_fp,output_fp)

max length of all texts 5103
Found 5597 unique tokens.
Shape of data tensor: (222, 11600)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (5598, 320)


## Train model on the entire dataset

In [4]:
## Trained on the entire dataset
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='all'
# Pass the suffix and pickle explicitly so the files written by train_model
# always match the name the confusion-matrix cell rebuilds from model_fp_suff.
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims, model_fp_suff=model_fp_suff, pkl_fp='hist_aware_pickle.pickle')

x_train shape: (2073, 11600) , x_val shape: (1022, 11600)
y_train shape: (2073, 3) , y_val shape: (1022, 3)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 11600, 320)   6400000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 11598, 128)   123008      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 11593, 128)   327808      embedding[0][0]     

### Confusion matrix

In [5]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[293,  38, 112],
       [ 48,  52,  60],
       [105,  39, 275]])

### Classification report

In [6]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.656951,0.6614,0.659168,443.0
sent_1,0.403101,0.325,0.359862,160.0
sent_2,0.615213,0.656325,0.635104,419.0
accuracy,0.606654,0.606654,0.606654,0.606654
macro avg,0.558421,0.547575,0.551378,1022.0
weighted avg,0.600097,0.606654,0.602444,1022.0


## Train a model per decade and per topic
### 1960-oil

In [4]:
# Train on the 1960s oil pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1960_oil'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1960_oil_hist_aware_pickle.pickle')

x_train shape: (265, 11600) , x_val shape: (132, 11600)
y_train shape: (265, 3) , y_val shape: (132, 3)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 11600, 320)   4140800     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 11598, 128)   123008      embedding_8[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 11593, 128)   327808      embedding_8[0][0]       

<cnn_model.CNN_Model at 0x7f225b1f28b0>

### Confusion matrix

In [5]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[11, 12, 11],
       [ 9, 13, 12],
       [12, 24, 28]])

### Classification report

In [6]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.34375,0.323529,0.333333,34.0
sent_1,0.265306,0.382353,0.313253,34.0
sent_2,0.54902,0.4375,0.486957,64.0
accuracy,0.393939,0.393939,0.393939,0.393939
macro avg,0.386025,0.381127,0.377848,132.0
weighted avg,0.423069,0.393939,0.402645,132.0


### 1960-coal

In [7]:
# Train on the 1960s coal pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1960_coal'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1960_coal_hist_aware_pickle.pickle')

x_train shape: (172, 11600) , x_val shape: (86, 11600)
y_train shape: (172, 3) , y_val shape: (86, 3)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 11600, 320)   2665920     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 11598, 128)   123008      embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 11593, 128)   327808      embedding_9[0][0]       

<cnn_model.CNN_Model at 0x7f2259487430>

### Confusion matrix

In [8]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[ 1,  6, 12],
       [ 3,  6, 19],
       [ 1,  8, 30]])

### Classification report

In [9]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.2,0.052632,0.083333,19.0
sent_1,0.3,0.214286,0.25,28.0
sent_2,0.491803,0.769231,0.6,39.0
accuracy,0.430233,0.430233,0.430233,0.430233
macro avg,0.330601,0.345383,0.311111,86.0
weighted avg,0.364888,0.430233,0.371899,86.0


### 1960-gas

In [10]:
# Train on the 1960s gas pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1960_gas'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1960_gas_hist_aware_pickle.pickle')

x_train shape: (290, 11600) , x_val shape: (143, 11600)
y_train shape: (290, 3) , y_val shape: (143, 3)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 11600, 320)   3602560     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 11598, 128)   123008      embedding_10[0][0]               
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 11593, 128)   327808      embedding_10[0][0]    

<cnn_model.CNN_Model at 0x7f22fdd01160>

### Confusion matrix

In [11]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[ 2,  5, 19],
       [ 0,  6, 38],
       [ 7,  4, 62]])

### Classification report

In [12]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.222222,0.076923,0.114286,26.0
sent_1,0.4,0.136364,0.20339,44.0
sent_2,0.521008,0.849315,0.645833,73.0
accuracy,0.48951,0.48951,0.48951,0.48951
macro avg,0.381077,0.354201,0.32117,143.0
weighted avg,0.42945,0.48951,0.413052,143.0


### 1970-oil

In [20]:
# Train on the 1970s oil pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1970_oil'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1970_oil_hist_aware_pickle.pickle')

x_train shape: (335, 11600) , x_val shape: (165, 11600)
y_train shape: (335, 3) , y_val shape: (165, 3)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 11600, 320)   4488000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 11598, 128)   123008      embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 11593, 128)   327808      embedding_5[0][0]       

<cnn_model.CNN_Model at 0x7fdc7e5bbfd0>

### Confusion matrix

In [21]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[65,  7, 14],
       [17, 11,  7],
       [21,  8, 15]])

### Classification report

In [22]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.631068,0.755814,0.687831,86.0
sent_1,0.423077,0.314286,0.360656,35.0
sent_2,0.416667,0.340909,0.375,44.0
accuracy,0.551515,0.551515,0.551515,0.551515
macro avg,0.490271,0.470336,0.474495,165.0
weighted avg,0.529775,0.551515,0.535008,165.0


### 1970-coal

In [23]:
# Train on the 1970s coal pickle; model_fp_suff tags every file train_model writes for this run.
# Unlike the other training cells, the returned model is kept in `model`.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1970_coal'
model = train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1970_coal_hist_aware_pickle.pickle')

x_train shape: (230, 11600) , x_val shape: (114, 11600)
y_train shape: (230, 3) , y_val shape: (114, 3)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 11600, 320)   2252480     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 11598, 128)   123008      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 11593, 128)   327808      embedding[0][0]       

### Confusion matrix

In [24]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[ 1,  0, 15],
       [ 0,  1,  5],
       [ 1,  0, 91]])

### Classification report

In [25]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.5,0.0625,0.111111,16.0
sent_1,1.0,0.166667,0.285714,6.0
sent_2,0.81982,0.98913,0.896552,92.0
accuracy,0.815789,0.815789,0.815789,0.815789
macro avg,0.773273,0.406099,0.431126,114.0
weighted avg,0.784416,0.815789,0.754165,114.0


### 1970-gas

In [26]:
# Train on the 1970s gas pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1970_gas'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1970_gas_hist_aware_pickle.pickle')

x_train shape: (119, 11600) , x_val shape: (60, 11600)
y_train shape: (119, 3) , y_val shape: (60, 3)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 11600, 320)   1951040     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 11598, 128)   123008      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 11593, 128)   327808      embedding_1[0][0]       

<cnn_model.CNN_Model at 0x7fdc70d426a0>

### Confusion matrix

In [27]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[ 8,  2,  8],
       [ 1,  1,  5],
       [ 9,  1, 25]])

### Classification report

In [28]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.444444,0.444444,0.444444,18.0
sent_1,0.25,0.142857,0.181818,7.0
sent_2,0.657895,0.714286,0.684932,35.0
accuracy,0.566667,0.566667,0.566667,0.566667
macro avg,0.45078,0.433862,0.437065,60.0
weighted avg,0.546272,0.566667,0.554089,60.0


### 1980-oil

In [29]:
# Train on the 1980s oil pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1980_oil'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1980_oil_hist_aware_pickle.pickle')

x_train shape: (324, 11600) , x_val shape: (160, 11600)
y_train shape: (324, 3) , y_val shape: (160, 3)
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 11600, 320)   3922880     input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 11598, 128)   123008      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 11593, 128)   327808      embedding_2[0][0]     

<cnn_model.CNN_Model at 0x7fdc713a3df0>

### Confusion matrix

In [30]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[60, 10,  9],
       [22, 10,  5],
       [27,  2, 15]])

### Classification report

In [31]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.550459,0.759494,0.638298,79.0
sent_1,0.454545,0.27027,0.338983,37.0
sent_2,0.517241,0.340909,0.410959,44.0
accuracy,0.53125,0.53125,0.53125,0.53125
macro avg,0.507415,0.456891,0.462747,160.0
weighted avg,0.519144,0.53125,0.506563,160.0


### 1980-coal

In [32]:
# Train on the 1980s coal pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1980_coal'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1980_coal_hist_aware_pickle.pickle')

x_train shape: (217, 11600) , x_val shape: (108, 11600)
y_train shape: (217, 3) , y_val shape: (108, 3)
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 11600, 320)   2482240     input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 11598, 128)   123008      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 11593, 128)   327808      embedding_3[0][0]     

<cnn_model.CNN_Model at 0x7fdc6f922850>

### Confusion matrix

In [33]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[11,  5, 13],
       [ 3,  3,  7],
       [ 9,  4, 53]])

### Classification report

In [34]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.478261,0.37931,0.423077,29.0
sent_1,0.25,0.230769,0.24,13.0
sent_2,0.726027,0.80303,0.76259,66.0
accuracy,0.62037,0.62037,0.62037,0.62037
macro avg,0.484763,0.471037,0.475222,108.0
weighted avg,0.602198,0.62037,0.60852,108.0


### 1980-gas

In [35]:
# Train on the 1980s gas pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1980_gas'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1980_gas_hist_aware_pickle.pickle')

x_train shape: (342, 11600) , x_val shape: (169, 11600)
y_train shape: (342, 3) , y_val shape: (169, 3)
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 11600, 320)   3511360     input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 11598, 128)   123008      embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 11593, 128)   327808      embedding_4[0][0]     

<cnn_model.CNN_Model at 0x7fdc6f922430>

### Confusion matrix

In [36]:
# The file name must mirror the results CSV name built inside train_model;
# it is formatted from the hyper-parameter globals currently set in the kernel.
export_path = os.path.join(OUTPUT_DIR,'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(dropout, optimizer,batch_size, epoch_no, hidden_dims,model_fp_suff))
results = pd.read_csv(export_path)

# Rows are the true classes (y_val), columns the predicted classes (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[23, 10, 29],
       [ 2,  7, 14],
       [ 6,  9, 69]])

### Classification report

In [37]:
# Per-class precision / recall / F1 for the `results` frame loaded above.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.741935,0.370968,0.494624,62.0
sent_1,0.269231,0.304348,0.285714,23.0
sent_2,0.616071,0.821429,0.704082,84.0
accuracy,0.585799,0.585799,0.585799,0.585799
macro avg,0.542413,0.498915,0.494807,169.0
weighted avg,0.615043,0.585799,0.570301,169.0


### 1990-oil

In [38]:
# Train on the 1990s oil pickle; model_fp_suff tags every file train_model writes for this run.
dropout = (0.5,0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims=50
model_fp_suff='1990_oil'
train_model(dropout, optimizer, batch_size, epoch_no, hidden_dims,model_fp_suff=model_fp_suff, pkl_fp='1990_oil_hist_aware_pickle.pickle')

x_train shape: (129, 11600) , x_val shape: (64, 11600)
y_train shape: (129, 3) , y_val shape: (64, 3)
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 11600, 320)   1755520     input_7[0][0]                    
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 11598, 128)   123008      embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_13 (Conv1D)              (None, 11593, 128)   327808      embedding_5[0][0]       

<cnn_model.CNN_Model at 0x7fdbe8046520>

### Confusion matrix

In [39]:
# Rebuild the path of the 1990-oil results CSV from the hyperparameter globals
# assigned in the training cell above (the file name encodes them).
export_path = os.path.join(
    OUTPUT_DIR,
    'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(
        dropout, optimizer, batch_size, epoch_no, hidden_dims, model_fp_suff
    ),
)
results = pd.read_csv(export_path)

# Rows are the true labels (y_val), columns are the predicted labels (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[38,  3,  1],
       [11,  3,  0],
       [ 7,  0,  1]])

### Classification report

In [40]:
# Tabulate per-class precision, recall, F1 and support (plus the accuracy and
# averaged rows) for the 1990-oil predictions held in `results`.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.678571,0.904762,0.77551,42.0
sent_1,0.5,0.214286,0.3,14.0
sent_2,0.5,0.125,0.2,8.0
accuracy,0.65625,0.65625,0.65625,0.65625
macro avg,0.559524,0.414683,0.42517,64.0
weighted avg,0.617188,0.65625,0.599554,64.0


### 1990-coal

In [41]:
# Hyperparameters for the 1990-coal run. They are kept as notebook globals
# because the results cell below reuses them to rebuild the output CSV name.
dropout = (0.5, 0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims = 50
model_fp_suff = '1990_coal'

# Train on the pre-processed 1990-coal pickle; the call is the last expression
# so the returned model object is shown as the cell output.
train_model(
    dropout,
    optimizer,
    batch_size,
    epoch_no,
    hidden_dims,
    model_fp_suff=model_fp_suff,
    pkl_fp='1990_coal_hist_aware_pickle.pickle',
)

x_train shape: (225, 11600) , x_val shape: (112, 11600)
y_train shape: (225, 3) , y_val shape: (112, 3)
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 11600, 320)   2441280     input_8[0][0]                    
__________________________________________________________________________________________________
conv1d_14 (Conv1D)              (None, 11598, 128)   123008      embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 11593, 128)   327808      embedding_6[0][0]     

<cnn_model.CNN_Model at 0x7fdbe066d3a0>

### Confusion matrix

In [42]:
# Rebuild the path of the 1990-coal results CSV from the hyperparameter globals
# assigned in the training cell above (the file name encodes them).
export_path = os.path.join(
    OUTPUT_DIR,
    'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(
        dropout, optimizer, batch_size, epoch_no, hidden_dims, model_fp_suff
    ),
)
results = pd.read_csv(export_path)

# Rows are the true labels (y_val), columns are the predicted labels (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[79,  0,  2],
       [10,  0,  2],
       [14,  0,  5]])

### Classification report

In [43]:
# Tabulate per-class precision, recall, F1 and support (plus the accuracy and
# averaged rows) for the 1990-coal predictions held in `results`.
# NOTE(review): in the recorded run the middle class is never predicted (its
# confusion-matrix column is all zeros), so its precision is reported as 0.0
# and a metrics warning is emitted -- presumably scikit-learn's
# undefined-metric warning; confirm before silencing it.
get_classification_report(results['y_val'], results['pred'])

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
sent_0,0.76699,0.975309,0.858696,81.0
sent_1,0.0,0.0,0.0,12.0
sent_2,0.555556,0.263158,0.357143,19.0
accuracy,0.75,0.75,0.75,0.75
macro avg,0.440849,0.412822,0.40528,112.0
weighted avg,0.648944,0.75,0.681608,112.0


### 1990-gas

In [44]:
# Hyperparameters for the 1990-gas run. They are kept as notebook globals
# because the results cell below reuses them to rebuild the output CSV name.
dropout = (0.5, 0.8)
optimizer = 'adam'
batch_size = 16
epoch_no = 100
hidden_dims = 50
model_fp_suff = '1990_gas'

# Train on the pre-processed 1990-gas pickle; the call is the last expression
# so the returned model object is shown as the cell output.
train_model(
    dropout,
    optimizer,
    batch_size,
    epoch_no,
    hidden_dims,
    model_fp_suff=model_fp_suff,
    pkl_fp='1990_gas_hist_aware_pickle.pickle',
)

x_train shape: (148, 11600) , x_val shape: (74, 11600)
y_train shape: (148, 3) , y_val shape: (74, 3)
Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 11600)]      0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 11600, 320)   1791360     input_9[0][0]                    
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 11598, 128)   123008      embedding_7[0][0]                
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 11593, 128)   327808      embedding_7[0][0]       

<cnn_model.CNN_Model at 0x7fdbe0462b50>

### Confusion matrix

In [45]:
# Rebuild the path of the 1990-gas results CSV from the hyperparameter globals
# assigned in the training cell above (the file name encodes them).
export_path = os.path.join(
    OUTPUT_DIR,
    'dropout{}_optimizer_{}_batch_size{}_epoch_no{}_hidden_dims{}_{}.csv'.format(
        dropout, optimizer, batch_size, epoch_no, hidden_dims, model_fp_suff
    ),
)
results = pd.read_csv(export_path)

# Rows are the true labels (y_val), columns are the predicted labels (pred).
confusion_matrix(results['y_val'], results['pred'])

array([[29,  1,  4],
       [ 8,  2,  3],
       [13,  1, 13]])

### Classification report

In [46]:
# Tabulate per-class precision, recall, F1 and support (plus the accuracy and
# averaged rows) for the 1990-gas predictions held in `results`.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.58,0.852941,0.690476,34.0
sent_1,0.5,0.153846,0.235294,13.0
sent_2,0.65,0.481481,0.553191,27.0
accuracy,0.594595,0.594595,0.594595,0.594595
macro avg,0.576667,0.49609,0.492987,74.0
weighted avg,0.591486,0.594595,0.560421,74.0
