In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle
import glob
from sklearn.metrics import confusion_matrix, classification_report, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from text_manager import TextManager
from embedding import Word2VecEmbedding
from cnn_model import CNN_Model
import numpy as np

In [2]:
# Directory holding the trained model files that predict() loads.
MODEL_DIR = '../../output/cnn_model'
# Root directory of the input CSV files read by data_prep().
TXT_DIR = '../../data/selected'
# Scratch directory: tokenizer pickles are read from here and the prepared
# sequences are written here by data_prep(), then read back by predict().
TEMP_DATA_DIR = '../../output/tmp'
# Destination directory for the prediction text files written by predict().
PRED_DIR = '../../output/predictions'
# Sequence length handed to TextManager in data_prep(); the recorded outputs
# show prepared tensors with exactly this many columns.
# NOTE(review): presumably this must match the length used when the models
# were trained -- confirm against the training notebook.
MAX_SEQUENCE_LENGTH = 11600

In [3]:
def data_prep(data_fp,tokenz_fp, output_fp='text_to_pred.pickle'):
    """Convert a CSV of texts into model-ready sequences and pickle the result.

    Parameters
    ----------
    data_fp : str
        CSV path relative to ``TXT_DIR``. The file must contain a
        ``text_clean`` column.
    tokenz_fp : str
        Pickle path relative to ``TEMP_DATA_DIR`` containing a
        ``(tokenizer, word_index)`` pair saved from the training data.
    output_fp : str, optional
        File name, relative to ``TEMP_DATA_DIR``, that the sequences are
        pickled to.

    Returns
    -------
    None
        The sequences are only written to disk; nothing is returned so that a
        notebook cell ending with this call does not echo a huge array.

    Raises
    ------
    FileNotFoundError
        If the CSV or the tokenizer pickle does not exist.
    KeyError
        If the CSV has no ``text_clean`` column.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(TEMP_DATA_DIR, exist_ok=True)
    output_path = os.path.join(TEMP_DATA_DIR, output_fp)

    df = pd.read_csv(os.path.join(TXT_DIR, data_fp))

    # Read the tokenizer/word-index saved from the training data.
    # SECURITY: pickle.load can execute arbitrary code -- only point this at
    # pickles produced by this project's own training pipeline.
    with open(os.path.join(TEMP_DATA_DIR, tokenz_fp), 'rb') as f:
        # The word index is stored alongside the tokenizer but is not needed here.
        tokenizer, _word_index = pickle.load(f)

    text_manager = TextManager(max_sequence_length=MAX_SEQUENCE_LENGTH)
    texts = text_manager.clean_text(df['text_clean'])

    # Diagnostic only; default=0 keeps an empty input from raising ValueError.
    # NOTE(review): this is len() of each cleaned text -- whether that is a
    # character or token count depends on what clean_text returns; confirm
    # before comparing it with MAX_SEQUENCE_LENGTH.
    print('max length of all texts', max(map(len, texts), default=0))
    data = text_manager.sequence_maker(tokenizer, texts)

    with open(output_path, 'wb') as f:
        # Pickle protocols below 4 cannot serialize objects larger than 4 GiB,
        # and the tensors produced here can be very large (e.g. 96189 x 11600
        # in the recorded runs), so request the highest available protocol
        # explicitly instead of relying on the interpreter's default.
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

def predict(model_fp,pickle_fp, model_fp_suff=None):
    """Run a saved CNN model over prepared features and save the predictions.

    Parameters
    ----------
    model_fp : str
        Model file path relative to ``MODEL_DIR``.
    pickle_fp : str
        Path, relative to ``TEMP_DATA_DIR``, of the pickled features written
        by ``data_prep``.
    model_fp_suff : str, optional
        Base name (without extension) of the output file; predictions are
        written to ``PRED_DIR/<model_fp_suff>.txt``. When omitted, the
        notebook-level variable ``model_fp_suff`` is used if it exists (the
        behaviour older cells rely on); otherwise the model file's stem is used.

    Returns
    -------
    None
        Predictions are only written to disk.
    """
    # Resolve the output name explicitly instead of silently depending on
    # hidden notebook state: a stale global would otherwise send one model's
    # predictions to another model's file.
    if model_fp_suff is None:
        model_fp_suff = globals().get('model_fp_suff')
    if model_fp_suff is None:
        model_fp_suff = os.path.splitext(os.path.basename(model_fp))[0]

    model_path = os.path.join(MODEL_DIR, model_fp)
    features_path = os.path.join(TEMP_DATA_DIR, pickle_fp)

    # SECURITY: pickle.load can execute arbitrary code -- only load feature
    # files produced by data_prep in this project.
    with open(features_path, 'rb') as f:
        features = pickle.load(f)

    preds = CNN_Model().predict_model(model_path, features)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(PRED_DIR, exist_ok=True)
    export_path = os.path.join(PRED_DIR, '{}.txt'.format(model_fp_suff))
    # fmt="%s" writes each prediction using its plain string form.
    np.savetxt(export_path, preds, fmt="%s")

## Data preparation

### 1960-oil

In [6]:
# 1960s oil: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
# NOTE(review): the recorded run reports a max text length of 11929, above
# MAX_SEQUENCE_LENGTH (11600); presumably longer texts are truncated -- confirm.
data_fp = "1960/1960s_oil_labeled_full_0.95.csv"
output_fp="1960_oil_text_to_pred.pickle"
tokenz_fp = "tokenz_1960_oil_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 11929
Shape of data tensor: (57196, 11600)


### 1960-coal

In [9]:
# 1960s coal: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1960/1960s_coal_labeled_full_0.95.csv"
output_fp="1960_coal_text_to_pred.pickle"
tokenz_fp = "tokenz_1960_coal_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 10579
Shape of data tensor: (4626, 11600)


### 1960-gas

In [11]:
# 1960s gas: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1960/1960s_gas_labeled_full_0.95.csv"
output_fp="1960_gas_text_to_pred.pickle"
tokenz_fp = "tokenz_1960_gas_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 9599
Shape of data tensor: (40816, 11600)


### 1970-oil

In [19]:
# 1970s oil: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1970/1970s_oil_labeled_full_0.95.csv"
output_fp="1970_oil_text_to_pred.pickle"
tokenz_fp = "tokenz_1970_oil_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 11252
Shape of data tensor: (96189, 11600)


### 1970_coal

In [4]:
# 1970s coal: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1970/1970s_coal_labeled_full_0.95.csv"
output_fp="1970_coal_text_to_pred.pickle"
tokenz_fp = "tokenz_1970_coal_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 7489
Shape of data tensor: (5388, 11600)


### 1970_gas

In [4]:
# 1970s gas: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
# NOTE(review): the recorded run reports a max text length of 11988, above
# MAX_SEQUENCE_LENGTH (11600); presumably longer texts are truncated -- confirm.
data_fp = "1970/1970s_gas_labeled_full_0.95.csv"
output_fp="1970_gas_text_to_pred.pickle"
tokenz_fp = "tokenz_1970_gas_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 11988
Shape of data tensor: (51678, 11600)


### 1980_oil

In [4]:
# 1980s oil: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1980/1980s_oil_labeled_full_0.95.csv"
output_fp="1980_oil_text_to_pred.pickle"
tokenz_fp = "tokenz_1980_oil_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 8916
Shape of data tensor: (1474, 11600)


### 1980_coal

In [5]:
# 1980s coal: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1980/1980s_coal_labeled_full_0.95.csv"
output_fp="1980_coal_text_to_pred.pickle"
tokenz_fp = "tokenz_1980_coal_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 6627
Shape of data tensor: (29289, 11600)


### 1980_gas

In [5]:
# 1980s gas: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1980/1980s_gas_labeled_full_0.95.csv"
output_fp="1980_gas_text_to_pred.pickle"
tokenz_fp = "tokenz_1980_gas_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 7632
Shape of data tensor: (94515, 11600)


### 1990_oil

In [None]:
# 1990s oil: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
# NOTE(review): the recorded output for this cell stops after the max-length
# line (15085) and never shows "Shape of data tensor", and the cell has no
# execution count -- the run may not have finished, so confirm that
# 1990_oil_text_to_pred.pickle exists before running the 1990_oil prediction.
data_fp = "1990/1990s_oil_labeled_full_0.95.csv"
output_fp="1990_oil_text_to_pred.pickle"
tokenz_fp = "tokenz_1990_oil_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 15085


### 1990_coal

In [7]:
# 1990s coal: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1990/1990s_coal_labeled_full_0.95.csv"
output_fp="1990_coal_text_to_pred.pickle"
tokenz_fp = "tokenz_1990_coal_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 6712
Shape of data tensor: (2259, 11600)


### 1990_gas

In [13]:
# 1990s gas: data_prep reads the CSV under TXT_DIR, loads the matching
# tokenizer pickle from TEMP_DATA_DIR and writes the sequences to output_fp there.
data_fp = "1990/1990s_gas_labeled_full_0.95.csv"
output_fp="1990_gas_text_to_pred.pickle"
tokenz_fp = "tokenz_1990_gas_hist_aware_pickle.pickle"
data_prep(data_fp,tokenz_fp, output_fp)

max length of all texts 8308
Shape of data tensor: (16127, 11600)


## Predictions

In [None]:
# 1960s oil predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1960_oil.txt.
model_fp_suff = '1960_oil'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1960_oil.h5"
pickle_fp="1960_oil_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [15]:
# 1960s coal predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1960_coal.txt.
model_fp_suff = '1960_coal'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1960_coal.h5"
pickle_fp="1960_coal_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [None]:
# 1960s gas predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1960_gas.txt.
model_fp_suff = '1960_gas'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1960_gas.h5"
pickle_fp="1960_gas_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [25]:
# 1970s oil predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1970_oil.txt.
# NOTE(review): unlike every other prediction cell, this model file name has
# no ".h5" extension -- confirm the file under MODEL_DIR is really named this
# way, otherwise the model load will fail.
model_fp_suff = '1970_oil'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1970_oil"
pickle_fp="1970_oil_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [4]:

# 1970s coal predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1970_coal.txt.
model_fp_suff = '1970_coal'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1970_coal.h5"
pickle_fp="1970_coal_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [9]:
# 1970s gas predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1970_gas.txt.
model_fp_suff = '1970_gas'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1970_gas.h5"
pickle_fp="1970_gas_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [10]:
# 1980s oil predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1980_oil.txt.
model_fp_suff = '1980_oil'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1980_oil.h5"
pickle_fp="1980_oil_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [11]:
# 1980s coal predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1980_coal.txt.
model_fp_suff = '1980_coal'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1980_coal.h5"
pickle_fp="1980_coal_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [12]:
# 1980s gas predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1980_gas.txt.
model_fp_suff = '1980_gas'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1980_gas.h5"
pickle_fp="1980_gas_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [None]:
# 1990s oil predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1990_oil.txt.
# NOTE(review): the 1990_oil data-preparation cell shows no completed output
# in this notebook -- confirm 1990_oil_text_to_pred.pickle exists first.
model_fp_suff = '1990_oil'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1990_oil.h5"
pickle_fp="1990_oil_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [None]:
# 1990s coal predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1990_coal.txt.
model_fp_suff = '1990_coal'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1990_coal.h5"
pickle_fp="1990_coal_text_to_pred.pickle"
predict(model_fp,pickle_fp)

In [None]:
# 1990s gas predictions. predict() reads the notebook-level model_fp_suff to
# name its output file, so it must be assigned before the call; the result is
# written to PRED_DIR/1990_gas.txt.
model_fp_suff = '1990_gas'
model_fp= "dropout(0.5, 0.8)_optimizer_adam_batch_size16_epoch_no100_hidden_dims50_1990_gas.h5"
pickle_fp="1990_gas_text_to_pred.pickle"
predict(model_fp,pickle_fp)