In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from text_manager import TextManager
from embedding import Word2VecEmbedding
from lstm_model import LSTM_Model
from cnn_model import CNN_Model

In [2]:
# Directory where data_prep writes, and train_model reads, the cached pickle file.
TEMP_DATA_DIR = '../../data/tmp'
# Word-vector file handed to Word2VecEmbedding.load_word2vec_data in data_prep.
EMBEDDING_PATH = "../../data/320/combined-320.txt"
# Directory where train_model writes its result CSV files.
OUTPUT_DIR = '../../data/output'
# Passed to both TextManager and Word2VecEmbedding.
# NOTE(review): presumably the vocabulary size cap -- confirm in text_manager.
MAX_NUM_WORDS = 20000
# Passed to TextManager, Word2VecEmbedding and the model constructors.
# NOTE(review): presumably the padded/truncated sequence length -- confirm in text_manager.
MAX_SEQUENCE_LENGTH = 10000

# Labeled CSV read by load_dataset; it must contain 'text' and 'labels' columns.
data_fp = "../../data/labeled/labeled_energy_1970_1990.csv"

In [3]:

def load_dataset(data_fp):
    """Read the labeled CSV file and return its texts and labels.

    Args:
        data_fp: Path of a CSV file with a 'text' and a 'labels' column.

    Returns:
        A ``(texts, labels)`` tuple of arrays taken from the two columns.
        Missing values in the 'text' column are replaced by empty strings.
    """
    frame = pd.read_csv(data_fp)

    # Missing texts become '' so that every entry is a string.
    text_values = frame['text'].fillna('').values
    label_values = frame['labels'].values

    return text_values, label_values


def data_prep(data_fp):
    """Prepare the dataset and the embedding layer and cache them on disk.

    Loads the texts and labels from ``data_fp``, cleans the texts and turns
    them into sequences with ``TextManager``, builds an embedding layer with
    ``Word2VecEmbedding`` from the vectors at ``EMBEDDING_PATH``, and pickles
    the tuple ``(data, labels, embedding_layer)`` to
    ``TEMP_DATA_DIR/hist_aware_pickle.pickle``. ``train_model`` reads that
    file back.

    Args:
        data_fp: Path of the labeled CSV file, forwarded to ``load_dataset``.

    Returns:
        None. The prepared objects are only written to the pickle file.
    """
    # exist_ok=True creates the directory when missing and is a no-op
    # otherwise, so no separate (and racy) os.path.exists check is needed.
    os.makedirs(TEMP_DATA_DIR, exist_ok=True)
    pickle_fp = os.path.join(TEMP_DATA_DIR, 'hist_aware_pickle.pickle')

    # Get the texts and their corresponding labels from disk.
    texts, lbls = load_dataset(data_fp)

    text_manager = TextManager(
        max_num_words=MAX_NUM_WORDS,
        max_sequence_length=MAX_SEQUENCE_LENGTH
    )

    texts = text_manager.clean_text(texts)
    # Length of the longest cleaned text, as measured by len() on each entry.
    print('max length of all texts', len(max(texts, key=len)))

    data, labels, word_index = text_manager.sequence_maker(texts, lbls)

    embedding = Word2VecEmbedding(word_index, MAX_NUM_WORDS,
                                  MAX_SEQUENCE_LENGTH)
    embedding.load_word2vec_data(EMBEDDING_PATH)
    embedding_layer = embedding.build_embedding()

    # Dataset, labels and embedding layer are stored together so that
    # train_model can be re-run without repeating the preparation.
    with open(pickle_fp, 'wb') as f:
        pickle.dump((data, labels, embedding_layer), f)

def train_model(model, dropout=0):
    """Train a model on the cached dataset and export its validation results.

    Reads ``(data, labels, embedding_layer)`` from the pickle file written by
    ``data_prep``, makes a stratified train/validation split (33% validation,
    ``random_state=0``), trains the requested model and writes two CSV files
    to ``OUTPUT_DIR``:

    * ``dropout<dropout>.csv`` -- columns ``y_val`` and ``pred`` holding, per
      validation row, the name (``sent_0``/``sent_1``/``sent_2``) of the
      column with the largest value in the true labels and in the
      predictions.
    * ``preds_prob.csv`` -- the values returned by ``predict`` for the
      validation set, written together with the DataFrame index.

    NOTE(review): neither file name contains the model name, so an 'lstm' and
    a 'cnn' run with the same dropout overwrite each other's results, and
    ``preds_prob.csv`` is overwritten on every call.

    Args:
        model: Which model to train, ``'lstm'`` or ``'cnn'``.
        dropout: Dropout value passed to the LSTM model. The CNN model does
            not receive it, but it is still used in the result file name.

    Raises:
        ValueError: If ``model`` is neither ``'lstm'`` nor ``'cnn'``.
    """
    # Validate first: an unknown name would otherwise only fail after the
    # data has been loaded and split, with an unrelated unbound-variable error.
    if model not in ('lstm', 'cnn'):
        raise ValueError(
            "model must be 'lstm' or 'cnn', got {!r}".format(model))

    # Read dataset, labels and embedding layer from the pickle file.
    pickle_fp = os.path.join(TEMP_DATA_DIR, 'hist_aware_pickle.pickle')

    # pickle.load can execute arbitrary code; only load files produced by
    # data_prep, never files from an untrusted source.
    with open(pickle_fp, 'rb') as f:
        data, labels, embedding_layer = pickle.load(f)

    # Split the dataset into a train and a validation part.
    x_train, x_val, y_train, y_val = train_test_split(data, labels,
                                                      test_size=0.33,
                                                      random_state=0,
                                                      stratify=labels)

    print("x_train shape:", x_train.shape, ", x_val shape:", x_val.shape)
    print("y_train shape:", y_train.shape, ", y_val shape:", y_val.shape)

    if model == 'lstm':
        deep_model = LSTM_Model
        args_model = {
            'backwards': True,
            'dropout': dropout,
            'optimizer': 'rmsprop',
            'max_sequence_length': MAX_SEQUENCE_LENGTH,
            'embedding_layer': embedding_layer
        }
    else:
        deep_model = CNN_Model
        args_model = {
            'optimizer': 'rmsprop',
            'max_sequence_length': MAX_SEQUENCE_LENGTH,
            'embedding_layer': embedding_layer
        }

    # Train the model and predict on the validation set. The instance gets
    # its own name so the `model` argument keeps meaning "the model kind".
    network = deep_model(**args_model)
    network.train(x_train, y_train, x_val, y_val)

    pred = network.predict(x_val)

    # Store true labels and predictions column-wise, one column per class.
    df_y = pd.DataFrame({'sent_0': y_val[:, 0],
                         'sent_1': y_val[:, 1],
                         'sent_2': y_val[:, 2]})
    df_pred = pd.DataFrame({'sent_0': pred[:, 0],
                            'sent_1': pred[:, 1],
                            'sent_2': pred[:, 2]})

    # idxmax(axis=1) reduces each row to the name of its largest column.
    result_df = pd.concat([df_y.idxmax(axis=1), df_pred.idxmax(axis=1)],
                          axis=1)
    result_df.columns = ['y_val', 'pred']

    # Save the results to disk.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    export_path = os.path.join(OUTPUT_DIR, 'dropout{}.csv'.format(dropout))
    result_df.to_csv(export_path, index=False)
    df_pred.to_csv(os.path.join(OUTPUT_DIR, 'preds_prob.csv'))



In [None]:
def grid_search():
    """Grid-search batch size and number of epochs for a Keras classifier.

    Wraps ``create_model`` in a ``KerasClassifier`` and fits a
    ``GridSearchCV`` over six batch sizes and three epoch counts, using
    3-fold cross-validation on ``X``/``Y`` and all cores (``n_jobs=-1``).
    Prints the best score with its parameters, followed by the mean and the
    standard deviation of the test score for every parameter combination.

    NOTE(review): ``KerasClassifier``, ``create_model``, ``X`` and ``Y`` are
    not defined or imported anywhere in the visible notebook; they must exist
    in the kernel before this function is called.
    """
    # Imported here because the notebook's import cell only pulls
    # train_test_split from sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # define the grid search parameters
    param_grid = dict(
        batch_size=[10, 20, 40, 60, 80, 100],
        epochs=[10, 50, 100],
    )
    grid = GridSearchCV(estimator=model, param_grid=param_grid,
                        n_jobs=-1, cv=3)
    grid_result = grid.fit(X, Y)

    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

In [4]:

# data_prep does not use `dropout`; the training cells below assign it again before use.
dropout = 0.2
# Build the sequences and the embedding layer and cache them in the pickle that train_model reads.
data_prep(data_fp)
# Training is run separately, in the next cell, via train_model('lstm', dropout).

max length of all texts 11644
Found 31517 unique tokens.
Shape of data tensor: (6214, 10000)
Shape of label tensor: (6214, 3)
Indexing word vectors.
Found 1442951 word vectors.
Shape of embedding matrix:  (20000, 320)


In [None]:
# Kept as a notebook-level variable: the evaluation cell rebuilds the 'dropout{}.csv' path from it.
dropout = 0.2
# Train the LSTM on the cached data; results are written to OUTPUT_DIR by train_model.
train_model('lstm',dropout)

x_train shape: (4163, 10000) , x_val shape: (2051, 10000)
y_train shape: (4163, 3) , y_val shape: (2051, 3)
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10000)]           0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10000, 320)        6400000   
_________________________________________________________________
lstm (LSTM)                  (None, 10)                13240     
_________________________________________________________________
dense (Dense)                (None, 128)               1408      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
Total params: 6,415,035
Trainable params: 15,035
Non-trainable params: 6,400,000
_______________________________________________________________

In [4]:
# train_model does not pass dropout to the CNN model; the value only determines the result file name.
dropout = 0.0
# Train the CNN on the cached data; results are written to OUTPUT_DIR by train_model.
train_model('cnn',dropout)

In [5]:
from sklearn.metrics import confusion_matrix

# Load the y_val/pred pairs that train_model exported for the current value of `dropout`.
# NOTE(review): the file name depends only on `dropout`, so it holds whichever model
# was trained last with that value -- confirm which run is being evaluated.
export_path = os.path.join(OUTPUT_DIR, 'dropout{}.csv'.format(dropout))
results = pd.read_csv(export_path)

# Called as confusion_matrix(y_true, y_pred): rows are true classes, columns predicted classes.
confusion_matrix(results['y_val'], results['pred'])

array([[490,  86,  41],
       [163, 789,  97],
       [102, 104, 179]])

In [8]:
from sklearn.metrics import confusion_matrix, classification_report, recall_score, accuracy_score

def get_classification_report(y_test, preds):
    """Return sklearn's classification report as a DataFrame.

    Args:
        y_test: True labels.
        preds: Predicted labels.

    Returns:
        A DataFrame built from the report dictionary and transposed, so that
        each report entry is a row and each metric is a column.
    """
    report = classification_report(y_test, preds, output_dict=True)
    report_frame = pd.DataFrame(report)
    return report_frame.T

In [9]:
# Per-class precision/recall/F1 for the results loaded in the confusion-matrix cell.
get_classification_report(results['y_val'], results['pred'])

Unnamed: 0,precision,recall,f1-score,support
sent_0,0.649007,0.794165,0.714286,617.0
sent_1,0.805924,0.752145,0.778107,1049.0
sent_2,0.564669,0.464935,0.509972,385.0
accuracy,0.710873,0.710873,0.710873,0.710873
macro avg,0.6732,0.670415,0.667455,2051.0
weighted avg,0.713432,0.710873,0.708575,2051.0


In [30]:
# Count how often each label occurs in the full labeled dataset (before the train/validation split).
data = pd.read_csv(data_fp)
data.labels.value_counts()

1    3179
0    1868
2    1167
Name: labels, dtype: int64

In [1]:
# Requires the import and configuration cells to have been run in this kernel
# (os, pd and OUTPUT_DIR come from there).
# The file name is fixed: it has no placeholder for the dropout value.
export_path = os.path.join(OUTPUT_DIR, 'preds_prob.csv')
# train_model writes this file together with the DataFrame index, so the first
# column is read back as the index instead of as an 'Unnamed: 0' data column.
probs = pd.read_csv(export_path, index_col=0)

# Show only the first rows instead of dumping the whole frame.
probs.head()


NameError: name 'os' is not defined

In [17]:
import numpy as np

# Sanity check of the idiom used in data_prep: max(..., key=len) returns the
# longest string of the array, so len() of it is the maximum text length.
a = np.array(['hello','world','!','Oooh gaaah booo gaah?'])
len(max(a, key=len))

21