In [1]:
import fasttext
import math
import re
import pandas as pd
import random as rd
import nltk
import pickle
import itertools
import numpy as np
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from collections import Counter
import matplotlib.pyplot as plt
import os

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Word2vec
import gensim

# Keras
#from keras.models import Sequential
#from keras.layers import Dense, Conv1D, Flatten, Activation, MaxPooling1D, Dropout, LSTM

from __future__ import print_function
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, BatchNormalization, Activation
from tensorflow.keras.layers import AveragePooling2D, Input, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar100
from tensorflow.keras.models import model_from_json

In [2]:
# nltk.download('stopwords')
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8
# TEXT CLEANING
# Raw string so that `\S` reaches the regex engine untouched instead of relying
# on Python passing an invalid string escape through (which emits a warning on
# newer interpreters). The pattern itself is unchanged: @mentions, URLs and any
# run of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10
# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024
# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)
# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [3]:
# Load the tweet CSV. The file has no header row, so column names are supplied
# explicitly, and it is decoded with DATASET_ENCODING.
df = pd.read_csv(
    r"../../../Datasets/sentiment140_tweet.csv",
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS,
)
print("Dataset size:", len(df))
# Only the last expression of a cell is rendered, so the preview goes last
# (mid-cell it was evaluated and silently discarded).
df.head()

Dataset size: 1600000


In [4]:
stop_words = stopwords.words("english")
# Frozen copy used for O(1) membership tests inside `preprocess`; the original
# `stop_words` list is kept for any other consumer. Later edits to that list
# are NOT reflected here.
_stop_word_set = frozenset(stop_words)
stemmer = SnowballStemmer("english")


def preprocess(text, stem=False):
    """Clean one tweet and return its remaining tokens joined by single spaces.

    The input is coerced to ``str`` and lower-cased, every match of
    ``TEXT_CLEANING_RE`` (mentions, links, non-alphanumeric runs) is replaced
    by a space, and English stop words are dropped.

    Args:
        text: Raw tweet text (any object; ``str()`` is applied).
        stem: When true, each kept token is passed through the Snowball
            stemmer before joining.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    # Remove link, user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = (token for token in text.split() if token not in _stop_word_set)
    if stem:
        tokens = (stemmer.stem(token) for token in tokens)
    return " ".join(tokens)


df.text = df.text.apply(preprocess)

In [5]:
# Hold out the complement of TRAIN_SIZE as the test partition; the fixed
# random_state keeps the shuffle reproducible between runs.
df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

TRAIN size: 1280000
TEST size: 320000


In [6]:
# Candidate locations of the pre-trained fastText binaries. Only
# `local_model_file` is used below; the other two are alternative mounts.
local_model_file = "../../models/"
external_hdd = "/media/omer/Seagate Backup Plus Drive/OMER/FastText Models/"
metal_flash = "/media/omer/UBUNTU 20_0/NOVA/models/"

# Number of leading rows taken from each partition.
train_size = 30000
test_size = 10000

# Rebind the full frame to an empty one (presumably to let the original data
# be reclaimed once nothing else references it).
df = pd.DataFrame()

# Copy the first `train_size` texts/labels into plain lists, then drop the frame.
texts_train = df_train["text"].iloc[:train_size].tolist()
ydata_train = df_train["target"].iloc[:train_size].tolist()
df_train = pd.DataFrame()

# Same for the test partition.
texts_test = df_test["text"].iloc[:test_size].tolist()
ydata_test = df_test["target"].iloc[:test_size].tolist()
df_test = pd.DataFrame()


# fastText model used to embed each text as a sentence vector.
model = fasttext.load_model(local_model_file + "inpsmt_e15_d200t200.bin")

def one_hot_encode(alist, num_classes=5):
    """One-hot encode a sequence of integer class labels.

    Args:
        alist: Iterable of integer labels, each in ``range(num_classes)``.
        num_classes: Width of every output vector. Defaults to 5, the width
            this function previously hard-coded.

    Returns:
        A list with one ``num_classes``-long list per label, holding a single
        1 at the label's index and 0 elsewhere. Empty input gives ``[]``.

    Raises:
        IndexError: If a label is negative or ``>= num_classes``. Negative
            labels are rejected explicitly because plain list indexing would
            silently wrap them around to the end of the vector.
    """
    ret = []
    for value in alist:
        if not 0 <= value < num_classes:
            raise IndexError(
                "label %r is outside the range [0, %d)" % (value, num_classes)
            )
        vec = [0] * num_classes
        vec[value] = 1
        ret.append(vec)
    return ret

# Embed every text with the fastText model's sentence-vector method.
xdata_train = [model.get_sentence_vector(text) for text in texts_train]
xdata_test = [model.get_sentence_vector(text) for text in texts_test]

# Features: one row per text.
x_train = np.array(xdata_train)
x_test = np.array(xdata_test)

# Targets: one-hot rows built from the integer labels.
y_train = np.array(one_hot_encode(ydata_train))
y_test = np.array(one_hot_encode(ydata_test))

# Empty the staging lists in place now that the arrays hold the data.
xdata_train.clear()
xdata_test.clear()
ydata_train.clear()
ydata_test.clear()



In [7]:
# Append a trailing axis of length 1 to each feature matrix; the row count is
# inferred (-1) and the second dimension is kept as-is.
x_train = x_train.reshape(-1, x_train.shape[1], 1)
x_test = x_test.reshape(-1, x_test.shape[1], 1)

In [8]:
# Training parameters
batch_size = 128  # orig paper trained all networks with batch_size=128
epochs = 10
data_augmentation = False # data_augmentation=True is not tested for this notebook.
num_classes = 5

In [9]:
# Flag kept from the original configuration; it is not read by any cell
# visible in this notebook.
subtract_pixel_mean = True
# Number of residual blocks per stack; determines the network depth below.
n = 3
# Model version
# Orig paper: version = 1 (ResNet v1), Improved ResNet: version = 2 (ResNet v2)
version = 1
# Computed depth from supplied model parameter n
if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2
else:
    # Without this branch an unsupported version left `depth` undefined and
    # surfaced as a NameError on the next line.
    raise ValueError('version must be 1 or 2, got %r' % (version,))
# Model name, depth and version
model_type = 'ResNet%dv%d' % (depth, version)


In [10]:
def lr_schedule(epoch):
    """Return the learning rate to use for the given epoch.

    The base rate is 1e-3. Once ``epoch`` exceeds 80, 120, 160 and 180 the base
    rate is multiplied by 1e-1, 1e-2, 1e-3 and 0.5e-3 respectively; only the
    highest milestone passed applies.

    Args:
        epoch: Epoch index to look up.

    Returns:
        The learning rate as a float.
    """
    base_lr = 1e-3
    # (milestone, multiplier) pairs, checked from the latest milestone down so
    # the first match is the one that applies.
    decay_steps = (
        (180, 0.5e-3),
        (160, 1e-3),
        (120, 1e-2),
        (80, 1e-1),
    )
    for milestone, factor in decay_steps:
        if epoch > milestone:
            return base_lr * factor
    return base_lr


def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
    """Apply one Conv2D / BatchNormalization / Activation unit to a tensor.

    Args:
        inputs: Tensor to transform.
        num_filters: Number of Conv2D filters.
        kernel_size: Conv2D kernel size.
        strides: Conv2D strides.
        activation: Activation name, or None to skip the activation layer.
        batch_normalization: Whether to include a BatchNormalization layer.
        conv_first: If true the order is conv -> BN -> activation; otherwise
            BN -> activation -> conv.

    Returns:
        The output tensor of the unit.
    """
    convolution = Conv2D(num_filters,
                         kernel_size=kernel_size,
                         strides=strides,
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(1e-4))

    def _normalize_and_activate(tensor):
        # Optional BN followed by optional activation, shared by both orderings.
        if batch_normalization:
            tensor = BatchNormalization()(tensor)
        if activation is not None:
            tensor = Activation(activation)(tensor)
        return tensor

    if conv_first:
        return _normalize_and_activate(convolution(inputs))
    return convolution(_normalize_and_activate(inputs))


def resnet_v1(input_shape, depth, num_classes=5):
    """Build a ResNet v1 style classifier as a Keras ``Model``.

    The network is one initial ``resnet_layer`` followed by three stacks of
    ``(depth - 2) / 6`` residual blocks. Filters start at 16 and double per
    stack. The first block of the second and third stacks uses stride 2 and a
    1x1 projection on the shortcut so both branches have matching shapes.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        depth: Network depth; must satisfy ``depth = 6n + 2``.
        num_classes: Size of the softmax output layer.

    Returns:
        The uncompiled model.

    Raises:
        ValueError: If ``depth - 2`` is not a multiple of 6.
    """
    blocks_per_stack, leftover = divmod(depth - 2, 6)
    if leftover != 0:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')
    blocks_per_stack = int(blocks_per_stack)

    inputs = Input(shape=input_shape)
    x = resnet_layer(inputs=inputs)

    # Three stacks of residual units with 16, 32 and 64 filters.
    for stack in range(3):
        num_filters = 16 * 2 ** stack
        for res_block in range(blocks_per_stack):
            # Downsample on the first block of every stack except the first.
            downsample = stack > 0 and res_block == 0
            strides = 2 if downsample else 1

            # Residual branch: two conv units, the second without activation.
            y = resnet_layer(inputs=x,
                             num_filters=num_filters,
                             strides=strides)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters,
                             activation=None)

            if downsample:
                # Linear 1x1 projection on the shortcut to match the changed
                # dimensions of the residual branch.
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)

            merged = keras.layers.add([x, y])
            x = Activation('relu')(merged)

    # Classifier head. v1 does not use BN after the last shortcut
    # connection-ReLU.
    pooled = AveragePooling2D(pool_size=1)(x)
    flattened = Flatten()(pooled)
    outputs = Dense(num_classes,
                    activation='softmax',
                    kernel_initializer='he_normal')(flattened)

    return Model(inputs=inputs, outputs=outputs)

In [11]:
# Per-sample shape handed to the network: the feature dimension of x_train
# followed by two axes of length 1.
input_shape = (x_train.shape[1],) + (1, 1)
print(input_shape)

(200, 1, 1)


In [12]:
if version == 2:
    # No resnet_v2 builder is defined here. Fail loudly instead of falling
    # through and compiling whatever object `model` is currently bound to.
    raise NotImplementedError('ResNet v2 is not implemented in this notebook')
else:
    model = resnet_v1(input_shape=input_shape, depth=depth)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=lr_schedule(0)),
              metrics=['accuracy'])
print(model_type)

# Prepare model saving directory.
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'cifar100_%s_model.{epoch:03d}.h5' % model_type
os.makedirs(save_dir, exist_ok=True)
filepath = os.path.join(save_dir, model_name)

# Prepare callbacks for model saving and for learning rate adjustment.
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'; monitoring 'val_acc' would never find a value.
# NOTE: these callbacks only take effect when passed to model.fit(callbacks=...),
# and the validation-based ones also need validation data supplied there.
checkpoint = ModelCheckpoint(filepath=filepath,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True)

lr_scheduler = LearningRateScheduler(lr_schedule)

lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                               cooldown=0,
                               patience=5,
                               min_lr=0.5e-6)

callbacks = [checkpoint, lr_reducer, lr_scheduler]

ResNet20v1


In [13]:
def get_losses(x_in, y_in, model):
    """Evaluate ``model`` on every sample individually and collect the losses.

    Each sample is passed to ``model.evaluate`` as a length-1 slice, and the
    first element of the returned result is kept.

    Args:
        x_in: Sliceable collection of inputs.
        y_in: Sliceable collection of targets, aligned with ``x_in``.
        model: Object exposing a Keras-style ``evaluate`` method.

    Returns:
        A list with one entry per sample in ``x_in``.
    """
    return [
        model.evaluate(x_in[idx:idx + 1],
                       y_in[idx:idx + 1],
                       batch_size=None,
                       verbose=0,
                       steps=1)[0]
        for idx in range(len(x_in))
    ]

In [14]:
# Train on the embedded texts. The epoch count comes from the `epochs` training
# parameter rather than a repeated literal, so changing that setting takes effect.
# shuffle=False keeps the sample order fixed across epochs.
model.fit(x_train, y_train, epochs=epochs, shuffle=False, batch_size=batch_size, verbose=2)

Epoch 1/10
235/235 - 53s - loss: 0.9363 - accuracy: 0.5943
Epoch 2/10
235/235 - 53s - loss: 0.7119 - accuracy: 0.6547
Epoch 3/10
235/235 - 53s - loss: 0.6576 - accuracy: 0.6871
Epoch 4/10
235/235 - 53s - loss: 0.6159 - accuracy: 0.7180
Epoch 5/10
235/235 - 53s - loss: 0.5933 - accuracy: 0.7337
Epoch 6/10
235/235 - 54s - loss: 0.5703 - accuracy: 0.7498
Epoch 7/10
235/235 - 55s - loss: 0.5641 - accuracy: 0.7543
Epoch 8/10
235/235 - 55s - loss: 0.5489 - accuracy: 0.7658
Epoch 9/10
235/235 - 55s - loss: 0.5422 - accuracy: 0.7717
Epoch 10/10
235/235 - 55s - loss: 0.5498 - accuracy: 0.7718


<tensorflow.python.keras.callbacks.History at 0x7f5f18e6b100>

In [15]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# single metric ('accuracy') the model was compiled with.
inst_acc = model.evaluate(x=x_test, y=y_test)[1]
print("Acc: {}".format(inst_acc))

Acc: 0.5756999850273132
