In [1]:
#Skripte laden
%reload_ext autoreload
%autoreload 2
import os
# NOTE(review): hard-coded absolute path — the notebook only runs where the
# project is checked out at exactly this location. Presumably the directory
# change is needed so the project imports below and the relative file paths
# used later in the notebook resolve from the project root — confirm.
os.chdir('/home/safiul/email-classification')
import data
# Star import: names used later without a visible import (e.g. pd, np)
# presumably come from this module — confirm.
from Email_Classification.Functions.packages import *
import Email_Classification.Functions.Functions as fn
import Email_Classification.Functions.Helper as hp
import Email_Classification.Functions.parameters as par

In [3]:
# Parameter values: unpack the individual hyperparameter groups from the
# shared parameter dictionary, in a fixed order.
(
    param_process,
    param_nlp,
    param_embed,
    param_conv,
    param_optimizer,
    param_fit,
    prior_params,
) = (
    par.params[group]
    for group in (
        'param_process',
        'param_nlp',
        'param_embed',
        'param_conv',
        'param_optimizer',
        'param_fit',
        'prior_params',
    )
)

In [None]:
%%time
# Load the input files (headers, bodies and targets are stored as parquet).
headers = pd.read_parquet(data.HEADERS_FILE)
bodies = pd.read_parquet(data.BODIES_FILE)
# Presumably restricts the targets to the 100 most common categories
# (judging by the helper's name and arguments) — confirm.
targets = features.zedsets.prepare_target_most_common(
    pd.read_parquet(data.TARGETS_FILE),
    target_column='target_category',
    num_most_common_class=100,
)
# Combine the three inputs into one e-mail frame, sampled with a fixed seed.
emails = fn.createDataFrameEmail(
    headers=headers,
    bodies=bodies,
    targets=targets,
    sample_size=param_process['sample_size'],
    seed=125,
)
# The raw inputs are no longer needed once the combined frame exists.
del headers, bodies, targets

In [None]:
%%time
# Split the dataset into training and test sets.
# (The assignment used to be fused into the comment line above it, so the
# split result was discarded and X_train / X_test / y_train / y_test were
# never defined.)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    emails.emails,
    emails.target_category,
    test_size=param_process['test_size'],
    shuffle=True,
    random_state=123,
)
# Convert the categorical labels to integer codes; the encoder is fitted on
# all categories so both splits share the same mapping.
LE = preprocessing.LabelEncoder()
LE.fit(emails.target_category)
y_train = LE.transform(y_train)
y_test = LE.transform(y_test)
# Binarize the integer codes (one column per class). The binarizer is fitted
# on the training labels only, so it knows just the classes present there.
encoder = LabelBinarizer()
encoder_fit = encoder.fit(y_train)
y_train = encoder_fit.transform(y_train)
y_test = encoder_fit.transform(y_test)
# Free the full e-mail frame; only the split data is used from here on.
del emails

In [None]:
%%time
# Preprocess the raw texts.
TP = fn.Text_process(nchar=param_process['nchar'],
                     rmDigits=param_process['rmDigits'],
                     trans_lower=param_process['trans_lower'])
TP.fit(X_train)
# multiprocess_array returns a collection of partial results that are
# concatenated back into one Series per split.
X_train_TP = pd.concat(fn.multiprocess_array(TP, X_train))
X_test_TP = pd.concat(fn.multiprocess_array(TP, X_test))

# Boolean masks marking texts that still contain at least one token after
# preprocessing. Each mask is computed once and reused for texts and labels.
train_nonempty = X_train_TP.apply(lambda x: len(x.split())) != 0
test_nonempty = X_test_TP.apply(lambda x: len(x.split())) != 0

X_train_TP_ = X_train_TP[train_nonempty]
X_test_TP_ = X_test_TP[test_nonempty]
# The label arrays are NumPy arrays, so the masks are applied positionally;
# .to_numpy() makes that explicit instead of relying on implicit conversion.
# NOTE(review): this assumes the preprocessed texts keep the row order of
# y_train / y_test — confirm for fn.multiprocess_array.
y_train = y_train[train_nonempty.to_numpy()]
y_test = y_test[test_nonempty.to_numpy()]
del X_train, X_test, X_train_TP, X_test_TP, train_nonempty, test_nonempty

In [None]:
%%time
# Tokenize the preprocessed texts and pad the resulting sequences.
Embeddings = fn.Embedding_custom(
    max_feature=param_embed['input_dim'],
    X_train=X_train_TP_,
    X_test=X_test_TP_,
    quantile=param_embed['quantile'],
)
MAXLEN, tokenizer, X_train_vec, X_test_vec = Embeddings.padding()

# Build the embedding matrix from the pre-trained fastText model (cc.de.300).
fb_model = load_facebook_model("fasttext/cc.de.300.bin")
embedding_matrix = Embeddings.embedding_matrix(fb_model)
# Alternative kept for reference: load a previously stored embedding matrix.
#with open('Email_Classification/Models/Sequential Models/rnn/lstm/Models/\
#2020-10-05_1932_acc_0.6889/feature/embedding_matrix.npy', 'rb') as f:
#    embedding_matrix = np.load(f)
# Release the fastText model once the matrix has been built.
del fb_model

In [5]:
%%time
# Initialize the CNN model.
CNN = fn.CNN_custom(
    param_embed=param_embed,
    param_conv=param_conv,
    prior_params=prior_params,
    param_optimizer=param_optimizer,
    param_fit=param_fit,
    run_vi=True,
    embedding_matrix=embedding_matrix,
    training=False,
    # Use the sequence length computed by Embeddings.padding() instead of a
    # hard-coded 396, so the model input always matches the padded vectors.
    MAXLEN=MAXLEN,
    train_size=len(X_train_vec),
)

In [7]:
# Create the checkpoint directory for this run; it is named after the
# current timestamp (minute resolution) below the configured base path.
now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M")
wkd = param_fit['wkd']
PATH_CHECKPOINTS = wkd + now
# makedirs also creates missing parent directories and does not fail when the
# directory already exists (e.g. when the cell is re-run within the same minute).
os.makedirs(PATH_CHECKPOINTS, exist_ok=True)

# Create callbacks: early stopping plus checkpointing, both driven by the
# same monitored metric and mode.
callbacks = [
    EarlyStopping(
        monitor=param_fit['monitor'],
        patience=param_fit['patience'],
        min_delta=param_fit['min_delta'],
        mode=param_fit['mode'],
    ),
    ModelCheckpoint(
        # see https://machinelearningmastery.com/check-point-deep-learning-models-keras/
        os.path.join(PATH_CHECKPOINTS, "weights.best.hdf5"),
        monitor=param_fit['monitor'],
        mode=param_fit['mode'],
        save_best_only=param_fit['save_best_only'],
        verbose=param_fit['verbose'],
    ),
]

In [None]:
%%time
# Fit the model.
# Number of complete batches contained in the training set.
steps_per_epoch = int(len(X_train_vec) // param_fit['batch_size'])
print(
    f"Model Params.\nbatch_size: {param_fit['batch_size']}\nEpochs: {param_fit['epochs']}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
)

# The test split doubles as the validation set that the callbacks monitor.
story = CNN.fit(
    X_train_vec,
    y_train,
    validation_data=(X_test_vec, y_test),
    epochs=param_fit['epochs'],
    batch_size=param_fit['batch_size'],
    steps_per_epoch=steps_per_epoch,
    callbacks=callbacks,
)

In [None]:
# Update the model with the best weights found during training.
# Load the checkpoint written by this run's ModelCheckpoint callback instead
# of a hard-coded, timestamped path from an earlier (LSTM) run, so the
# evaluation always refers to the model that was just trained.
CNN.load_weights(os.path.join(PATH_CHECKPOINTS, "weights.best.hdf5"))
score = CNN.evaluate(X_test_vec,
                     y_test,
                     verbose=1)
# score[0] is reported as the loss and score[1] as the accuracy.
print('Test loss:    ', score[0])
print('Test accuracy:', score[1])

In [None]:
# Save the model.
# The target directory name presumably encodes the prior hyperparameters of
# this variational-inference run (phi, sig1, sig2) — confirm against prior_params.
path = 'Email_Classification/Models/Sequential Models/cnn/model/uncertainty/VI/phi_0.5_sig1_1.5_sig2_0.5'
CNN.save(path)

In [15]:
# Save the training and test sets and the hyperparameter values next to the
# stored model.
model_dir = 'Email_Classification/Models/Sequential Models/cnn/model/uncertainty/VI/phi_0.5_sig1_1.5_sig2_0.5'
feature_dir = os.path.join(model_dir, 'feature')
# Make sure the feature sub-directory exists before writing into it.
os.makedirs(feature_dir, exist_ok=True)

arrays_to_save = {
    'X_train_vec': X_train_vec,
    'X_test_vec': X_test_vec,
    'y_train': y_train,
    'y_test': y_test,
}
for array_name, array in arrays_to_save.items():
    with open(os.path.join(feature_dir, array_name + '.npy'), 'wb') as f:
        np.save(f, array)

# pickle.dump takes the object first and the file second; the parameter
# dictionary lives in par.params (a bare `params` name is not defined here).
with open(os.path.join(model_dir, 'params.pkl'), 'wb') as f:
    pickle.dump(par.params, f)