In [5]:
# COMP90051 Project 1 source code
# For Team 192

import numpy as np
import pandas as pd

from scipy import stats
from sklearn import preprocessing

# self-made external scripts
import ext_scipy
import ext_preprocess

import pickle

import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [6]:
# from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

def load_dataset(col_name):
    """Load the cleaned train/test CSVs and split off a validation set.

    Reads ``cleaned_train.csv`` and ``cleaned_test.csv`` from the working
    directory. The split size and seed come from the notebook-level ``config``
    dict (``VALIDATION_SIZE`` and ``RAND_STATE_NUM``), which must be defined
    before this function is called.

    Parameters
    ----------
    col_name : str
        Name of the text column to use as the feature.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validation, y_validation, X_test, all_labels)``.
        ``all_labels`` is the sorted array of unique labels found in the full
        training file; the other items are columns of the respective frames.
    """
    df_X_train = pd.read_csv('cleaned_train.csv')
    df_X_test = pd.read_csv('cleaned_test.csv')

    # Collected before the split so labels that end up only in the
    # validation part are still included.
    all_labels = np.sort(df_X_train['label'].unique())

    # Rows with a missing feature are kept (as empty text) rather than dropped.
    df_X_train[col_name] = df_X_train[col_name].fillna('')
    df_X_train, df_X_validation = train_test_split(df_X_train,
                                                   test_size=config['VALIDATION_SIZE'],
                                                   random_state=config['RAND_STATE_NUM'])

    X_train, y_train = df_X_train[col_name], df_X_train['label']
    X_validation, y_validation = df_X_validation[col_name], df_X_validation['label']

    # Test rows with a missing feature fall back to the 'tweet' column.
    # A single .loc assignment is used because chained indexing
    # (df[col][mask] = ...) may write to a temporary copy instead of the frame.
    na_spots = df_X_test[col_name].isna()
    df_X_test.loc[na_spots, col_name] = df_X_test.loc[na_spots, 'tweet']
    X_test = df_X_test[col_name]

    return X_train, y_train, X_validation, y_validation, X_test, all_labels

In [7]:
# Notebook-wide settings:
#   RAND_STATE_NUM  - passed as random_state to the train/validation split
#   VERSION         - counter appended to saved model file names
#   VALIDATION_SIZE - passed as test_size to the train/validation split
config = dict(
    RAND_STATE_NUM=5354,
    VERSION=0,
    VALIDATION_SIZE=0.0345,
)


# ngram

In [None]:
# Build the train/validation/test splits from the 'tweet' column; these feed
# the character n-gram features computed in the following cells.
X_train, y_train, X_validation, y_validation, X_test, all_labels = load_dataset('tweet')

In [9]:
# Character n-grams
from sklearn.feature_extraction.text import CountVectorizer

# Single n-gram length: the same value is used as both ends of ngram_range,
# so only n-grams of exactly this size are extracted.
N_GRAM_LEN = 3
# lowercase=False keeps the original casing of the text.
cv = CountVectorizer(ngram_range=(N_GRAM_LEN, N_GRAM_LEN), lowercase=False, analyzer='char_wb')

In [None]:
# Learn the n-gram vocabulary from the training split only.
cv.fit(X_train)

In [11]:
# +1 because map_ints shifts every vocabulary index up by one, so index 0 is
# never produced for a real n-gram. This is the input size of the Embedding layer.
vocab_size_ngram = len(cv.vocabulary_) + 1
print(vocab_size_ngram)

246688


In [12]:
def map_ints(tweet):
    """Encode a tweet as the 1-based vocabulary ids of its character n-grams.

    Uses the notebook-level fitted ``cv`` (CountVectorizer): the tweet is split
    with ``cv.build_analyzer()`` and every n-gram is looked up in
    ``cv.vocabulary_``. N-grams missing from the vocabulary are dropped. Ids are
    shifted by one so that 0 is never produced for a real n-gram.

    Parameters
    ----------
    tweet : str
        Text to encode.

    Returns
    -------
    numpy.ndarray
        1-D integer array of shifted ids; empty when no n-gram is known.
    """
    ngrams = cv.build_analyzer()(tweet)
    known_ids = [idx for idx in map(cv.vocabulary_.get, ngrams) if idx is not None]
    # Force an integer dtype: np.array([]) defaults to float64, which would
    # turn the ids of a tweet without known n-grams into a float array.
    return np.array(known_ids, dtype=int) + 1

# Encode every tweet of each split as an array of shifted n-gram ids.
X_train_ngram = X_train.apply(map_ints)
# Progress marker printed once the training split has been encoded.
print("Done with train_convolve")
X_validation_ngram = X_validation.apply(map_ints)
X_test_ngram = X_test.apply(map_ints)

Done with train_convolve


## Keras

In [None]:
import keras

from keras.models import Sequential, Model
from keras import layers
from keras.layers.merge import concatenate
from keras.wrappers.scikit_learn import KerasClassifier

from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer


def pad_set(train, vald, test):
    """Post-pad three sequence collections to one shared length.

    The shared length is the longest sequence found in ``train`` plus 5.

    Returns the padded train, validation and test collections (in that order)
    followed by the length that was used.
    """
    longest_train = max(map(len, train))
    maxlen = longest_train + 5

    padded_train = pad_sequences(train, padding='post', maxlen=maxlen)
    padded_vald = pad_sequences(vald, padding='post', maxlen=maxlen)
    padded_test = pad_sequences(test, padding='post', maxlen=maxlen)

    return padded_train, padded_vald, padded_test, maxlen

In [14]:
# Pad all three n-gram splits to one common length.
# NOTE: this rebinds the *_ngram names to the padded arrays, so re-running the
# cell pads already-padded data and raises maxlen_ngram by another 5.
X_train_ngram, X_validation_ngram, X_test_ngram, maxlen_ngram = pad_set(X_train_ngram, X_validation_ngram, X_test_ngram)

In [15]:
# Fit on every label of the full training file so that the train and
# validation targets share one class index.
encoder = LabelEncoder().fit(all_labels)

# One-hot encode both label splits with the same number of classes.
y_train_enc, y_validation_enc = (
    np_utils.to_categorical(encoder.transform(labels), num_classes=len(encoder.classes_))
    for labels in (y_train, y_validation)
)

### Optional: Prepare embedding for cleaned_v3

In [None]:
# Re-split using the 'cleaned_v3' column. This rebinds X_train, y_train,
# X_validation, y_validation and X_test from the earlier 'tweet' split; the
# returned label list is discarded (`_`).
X_train, y_train, X_validation, y_validation, X_test, _ = load_dataset('cleaned_v3')

# Word-level tokenizer, fitted on the training texts only.
tokenizer = Tokenizer() # num_words=config['TOKEN_WORDS'])
tokenizer.fit_on_texts(X_train)


In [17]:
# Convert the texts of every split to sequences of word indices with the
# fitted tokenizer.
X_train_all, X_validation_all, X_test_all = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_validation, X_test)
)

# One extra slot because index 0 is reserved.
vocab_size_all = 1 + len(tokenizer.word_index)

In [18]:
# Pad all three word-index splits to one common length.
# NOTE: this rebinds the *_all names to the padded arrays, so re-running the
# cell pads already-padded data and raises maxlen_all by another 5.
X_train_all, X_validation_all, X_test_all, maxlen_all = pad_set(X_train_all, X_validation_all, X_test_all)

In [42]:
# Sanity check: the padded width of the n-gram matrix should equal maxlen_ngram.
X_train_ngram.shape, maxlen_ngram

((317583, 42), 42)

In [19]:
# Sanity check: the padded width of the word-index matrix should equal maxlen_all.
X_train_all.shape, maxlen_all

((316681, 72), 72)

# Choose your model: run only one of the two model-definition cells below (each one assigns `model`)

In [None]:
# Model option 1: a single convolutional branch over the n-gram embedding.
embedding_dim = 200

# The input is one padded sequence of n-gram ids of length maxlen_ngram.
input1 = layers.Input(shape=(maxlen_ngram,))
em_layer = layers.Embedding(vocab_size_ngram, embedding_dim)(input1)

# Each entry is [number of filters, kernel width, pooling size].
CNN_SETTINGS = [[384, 3, 3]]

# NOTE(review): every iteration starts again from em_layer and overwrites f, so
# with more than one entry in CNN_SETTINGS only the last entry's branch would
# reach the output. Harmless with the single entry above; confirm the intent
# (stacked vs. parallel blocks) before adding more.
for n_filters, kernel_width, pool_size in CNN_SETTINGS:
    f = layers.Conv1D(n_filters, kernel_width, activation='relu')(em_layer)
    f = layers.BatchNormalization()(f)
    f = layers.SpatialDropout1D(0.15)(f)
    f = layers.AveragePooling1D(pool_size)(f)
   # f = layers.MaxPooling1D(pool_size)(f)

#f = layers.AveragePooling1D(pool_size)(f)
f = layers.GlobalMaxPooling1D()(f)
#f = layers.Dense(128, activation='relu')(f)
# Output layer: one softmax unit per label in all_labels.
f = layers.Dense(all_labels.shape[0], activation='softmax')(f)

model = Model(inputs=input1, outputs=f)
opt = keras.optimizers.Nadam()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [33]:
# Model option 2: parallel convolution branches with different kernel widths
# over one shared n-gram embedding, concatenated before the softmax output.
embedding_dim = 120

# The input is one padded sequence of n-gram ids of length maxlen_ngram.
input1 = layers.Input(shape=(maxlen_ngram,))
em_layer = layers.Embedding(vocab_size_ngram, embedding_dim)(input1)

N_FILTERS = 192
KERNEL_WIDTHS = (2, 3, 4)

# Build the identical Conv1D -> MaxPooling1D -> GlobalMaxPooling1D branch once
# per kernel width instead of repeating the same three lines by hand.
branches = []
for kernel_width in KERNEL_WIDTHS:
    branch = layers.Conv1D(N_FILTERS, kernel_width, activation='relu')(em_layer)
    branch = layers.MaxPooling1D()(branch)
    branch = layers.GlobalMaxPooling1D()(branch)
    branches.append(branch)

f = layers.Concatenate(axis=1)(branches)
# Output layer: one softmax unit per label in all_labels.
f = layers.Dense(all_labels.shape[0], activation='softmax')(f)

model = Model(inputs=input1, outputs=f)
opt = keras.optimizers.Nadam()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training call that passes two inputs (word-index and n-gram arrays).
# NOTE(review): both model definitions visible in this notebook take a single
# input (input1), and the recorded outputs show different row counts for
# X_train_ngram (317583) and X_train_all (316681). This cell presumably belongs
# to an earlier two-input model -- confirm before running.
history2 = model.fit([X_train_all, X_train_ngram], y_train_enc,
                    epochs=4, batch_size=200,
                    validation_data=([X_validation_all, X_validation_ngram], y_validation_enc))

In [None]:
# Train on the padded n-gram ids for 4 epochs, passing the validation split as
# validation_data. The returned History object is kept for saving later.
history = model.fit(X_train_ngram, y_train_enc,
                    epochs=4, batch_size=300,
                    validation_data=(X_validation_ngram, y_validation_enc))

In [15]:
from keras.utils import plot_model
import json

# Render the model architecture to an image file.
plot_model(model, to_file='./keras_out/model.png')
# Dump the training history; the context manager guarantees the file is
# flushed and closed even if serialization fails.
with open('./keras_out/history.json', 'w') as history_file:
    json.dump(history.history, history_file)

In [None]:
import json
def save_keras_results(name):
    """Persist the current model, its training history and the config.

    Writes three files derived from ``name``: ``<name>.h5`` (the model),
    ``<name> [HISTORY].json`` and ``<name> [CONFIG].json``. Reads the
    notebook-level ``model``, ``history`` and ``config`` objects.

    Parameters
    ----------
    name : str
        Path prefix (without extension) for the files to write.
    """
    model.save(name + ".h5")
    # Context managers make sure both JSON files are flushed and closed.
    with open(name + ' [HISTORY].json', 'w') as history_file:
        json.dump(history.history, history_file)
    with open(name + ' [CONFIG].json', 'w') as config_file:
        json.dump(config, config_file)
    
# Save under "<MODEL_NAME>_v<VERSION>", then bump VERSION so the next save in
# this session gets a new suffix.
# NOTE(review): 'MODEL_NAME' is not among the keys of the config dict defined
# in this notebook, so this raises KeyError unless it is set beforehand.
save_keras_results(config['MODEL_NAME'] + "_v%d" % (config['VERSION']))
config['VERSION'] += 1

In [35]:
# Reset the Keras backend session before building a fresh model
# (presumably `model` must be re-created afterwards -- TODO confirm).
keras.backend.clear_session() # to start again

In [None]:
# Random subset of training rows for the optional train-set evaluation below.
# The sample size is capped at the number of available rows so that
# replace=False cannot fail on a smaller training set.
sample = np.random.choice(X_train_ngram.shape[0],
                          min(100000, X_train_ngram.shape[0]),
                          replace=False)

# Evaluate on the same n-gram features the model is trained and used for
# prediction with (the *_embed arrays are not defined in this notebook).
#model.evaluate(x=X_train_ngram[sample], y=y_train_enc[sample])
model.evaluate(x=X_validation_ngram, y=y_validation_enc)

In [17]:
# Class index with the highest predicted probability for every test row.
predictions_pre = np.argmax(model.predict(X_test_ngram), axis=-1)

# Map the class indices straight back to the original labels. The previous
# one-hot round trip (argmax of to_categorical) returned the same indices while
# allocating an (n_rows x n_classes) matrix, so it is dropped.
predictions = encoder.inverse_transform(predictions_pre)

In [18]:
# Write the predicted labels using the team's own helper module (ext_scipy);
# the output location and format are defined there.
ext_scipy.save_predictions(predictions)