# Build CNN to do training and testing

In [24]:
from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from cnn import mini_XCEPTION
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import re

In [61]:
from keras.models import load_model

In [25]:
#from dataset import read_data

In [26]:
data_path = '../../dataset/Tweets-airline-sentiment.csv'

In [27]:
def read_data(data_path, feature = 'Unigram', max_feature_num = 500):
#feature: the text feature, could be 'Unigram, Bigram, Trigram or Mixing Unigram with Bigram'
	data = pd.read_csv(data_path)
	text = data['text']
	label = data['airline_sentiment']
	label_tags = label.unique()
	#replace text label with one-hot-labels
	new_label= []
	for l in label:
		if l == label_tags[0]:
			new_label.append(np.array([0,0,1]))
		elif l == label_tags[1]:
			new_label.append(np.array([0,1,0]))
		else:
			new_label.append(np.array([1,0,0]))
	#get rid of '@airline_company_name
	new_text = []
	for t in text:
		new_text.append(re.sub('^@\\w+ *','', t))
	if feature == 'Unigram':
		Vec = CountVectorizer(max_features = max_feature_num, ngram_range=(1,1))
		out = Vec.fit_transform(new_text)
	elif feature == 'Bigram':
		Vec = CountVectorizer(max_features = max_feature_num, ngram_range=(2,2))
		out = Vec.fit_transform(new_text)
	elif feature == 'Trigram':
		Vec = CountVectorizer(max_features= max_feature_num, ngram_range=(3,3))
		out = Vec.fit_transform(new_text)
	else:
	# mix bigram and unigram
		Vec = CountVectorizer(max_features = max_feature_num, ngram_range = (1,2))
		out = Vec.fit_transform(new_text)
	new_label = np.asarray(new_label)
	return out, new_label

In [28]:
text, label = read_data(data_path = data_path, feature='Unigram', max_feature_num=500)
#text, label = np.asarray(text), np.asarray(label)

In [29]:
text = np.asarray(text.todense())

In [39]:
newt = []
height, width = 10,50

In [40]:
for t in text:
    newt.append(t.reshape((height, width)))

In [41]:
text = np.asarray(newt)

In [45]:
text.shape[1:]

(10, 50)

In [46]:
batch_size = 32
num_epochs = 1000
input_shape = (text.shape[1],text.shape[2],1)
verbose = 1
number_classes = 3
patience = 50
# data generator
data_generator = ImageDataGenerator(
                        featurewise_center=False,
                        featurewise_std_normalization=False,
                        rotation_range=10,
                        width_shift_range=0.1,
                        height_shift_range=0.1,
                        zoom_range=.1,
                        horizontal_flip=True,
                        )

In [47]:
model = mini_XCEPTION(input_shape=input_shape, num_classes=3)

In [48]:
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 10, 50, 1)    0                                            
__________________________________________________________________________________________________
conv2d_9 (Conv2D)               (None, 8, 48, 8)     72          input_4[0][0]                    
__________________________________________________________________________________________________
batch_normalization_15 (BatchNo (None, 8, 48, 8)     32          conv2d_9[0][0]                   
__________________________________________________________________________________________________
activation_7 (Activation)       (None, 8, 48, 8)     0           batch_normalization_15[0][0]     
__________________________________________________________________________________________________
conv2d_10 

In [49]:
# callbacks
dataset_name = 'USAirline'
log_file_path = dataset_name + '_sentiment_training.log'
csv_logger = CSVLogger(log_file_path, append=False)
early_stop = EarlyStopping('val_loss', patience=patience)
reduce_lr = ReduceLROnPlateau('val_loss', factor=0.1,patience=int(patience/4), verbose=1)
trained_models_path = dataset_name + '_mini_XCEPTION'
model_names = trained_models_path + '.{epoch:02d}-{val_acc:.2f}.hdf5'
model_checkpoint = ModelCheckpoint(model_names, 'val_loss', verbose=1,save_best_only=True)
callbacks = [model_checkpoint, csv_logger, early_stop, reduce_lr]

In [50]:
def split_data(text, label, train_test_split = 0.3,validation_train_split = 0.2):
    #number of each dataset
    arr_num = label.shape[0]
    test_num = int(arr_num*train_test_split)
    train_num = int((arr_num-test_num)*(1-validation_train_split))
    val_num = arr_num - test_num-train_num
    #new permulated dataset
    permutation_arr = np.random.permutation(arr_num)
    #new_text = np.expand_dims(text,-1)
    new_text, new_label = text[permutation_arr], label[permutation_arr]
    X_train, X_val, X_test = new_text[:train_num], new_text[train_num: (train_num+val_num)], new_text[(train_num+val_num):]
    y_train, y_val, y_test = new_label[:train_num], new_label[train_num: (train_num+val_num)], new_label[(train_num+val_num):]
    return X_train, X_val, X_test, y_train, y_val, y_test

In [51]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(text, label)

In [52]:
X_train, X_val, X_test = np.expand_dims(X_train,-1),np.expand_dims(X_val,-1),np.expand_dims(X_test,-1)

In [53]:
X_train.shape

(8198, 10, 50, 1)

In [54]:
train_data = (X_train,y_train)
val_data = (X_val, y_val)
test_data = (X_test, y_test)

In [55]:
X_train.shape

(8198, 10, 50, 1)

In [56]:
data_generator.flow(X_train,y_train,batch_size = 32)

<keras.preprocessing.image.NumpyArrayIterator at 0x7f8dbaf5f5f8>

In [57]:
#traing
model.fit_generator(data_generator.flow(X_train,y_train,batch_size = 32),
                        steps_per_epoch=len(X_train) / batch_size,
                        epochs=num_epochs, verbose=1, callbacks=callbacks,
                        validation_data=val_data)

Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.99137, saving model to USAirline_mini_XCEPTION.01-0.63.hdf5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.99137 to 0.95593, saving model to USAirline_mini_XCEPTION.02-0.64.hdf5
Epoch 3/1000

Epoch 00003: val_loss improved from 0.95593 to 0.87193, saving model to USAirline_mini_XCEPTION.03-0.65.hdf5
Epoch 4/1000

Epoch 00004: val_loss did not improve from 0.87193
Epoch 5/1000

Epoch 00005: val_loss did not improve from 0.87193
Epoch 6/1000

Epoch 00006: val_loss did not improve from 0.87193
Epoch 7/1000

Epoch 00007: val_loss improved from 0.87193 to 0.85593, saving model to USAirline_mini_XCEPTION.07-0.65.hdf5
Epoch 8/1000

Epoch 00008: val_loss improved from 0.85593 to 0.82530, saving model to USAirline_mini_XCEPTION.08-0.65.hdf5
Epoch 9/1000

Epoch 00009: val_loss did not improve from 0.82530
Epoch 10/1000

Epoch 00010: val_loss did not improve from 0.82530
Epoch 11/1000

Epoch 00011: val_loss improved from 0.82530 to 0


Epoch 00041: val_loss did not improve from 0.80865
Epoch 42/1000

Epoch 00042: val_loss improved from 0.80865 to 0.80132, saving model to USAirline_mini_XCEPTION.42-0.66.hdf5
Epoch 43/1000

Epoch 00043: val_loss did not improve from 0.80132
Epoch 44/1000

Epoch 00044: val_loss did not improve from 0.80132
Epoch 45/1000

Epoch 00045: val_loss did not improve from 0.80132
Epoch 46/1000

Epoch 00046: val_loss did not improve from 0.80132
Epoch 47/1000

Epoch 00047: val_loss did not improve from 0.80132
Epoch 48/1000

Epoch 00048: val_loss did not improve from 0.80132
Epoch 49/1000

Epoch 00049: val_loss did not improve from 0.80132
Epoch 50/1000

Epoch 00050: val_loss did not improve from 0.80132
Epoch 51/1000

Epoch 00051: val_loss did not improve from 0.80132
Epoch 52/1000

Epoch 00052: val_loss did not improve from 0.80132
Epoch 53/1000

Epoch 00053: val_loss did not improve from 0.80132
Epoch 54/1000

Epoch 00054: val_loss did not improve from 0.80132

Epoch 00054: ReduceLROnPlateau 


Epoch 00082: val_loss did not improve from 0.80132
Epoch 83/1000

Epoch 00083: val_loss did not improve from 0.80132
Epoch 84/1000

Epoch 00084: val_loss did not improve from 0.80132
Epoch 85/1000

Epoch 00085: val_loss did not improve from 0.80132
Epoch 86/1000

Epoch 00086: val_loss did not improve from 0.80132
Epoch 87/1000

Epoch 00087: val_loss did not improve from 0.80132
Epoch 88/1000

Epoch 00088: val_loss did not improve from 0.80132
Epoch 89/1000

Epoch 00089: val_loss did not improve from 0.80132
Epoch 90/1000

Epoch 00090: val_loss did not improve from 0.80132

Epoch 00090: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 91/1000

Epoch 00091: val_loss did not improve from 0.80132
Epoch 92/1000

Epoch 00092: val_loss did not improve from 0.80132


<keras.callbacks.History at 0x7f8dbaf3e438>

In [77]:
pred_label =[]

In [74]:
model_path = './USAirline_mini_XCEPTION.31-0.67.hdf5'
classifier = load_model(emotion_model_path, compile=False)
# getting input model shapes for inference
target_size = classifier.input_shape[1:3]


In [78]:
for gray_image in X_test:
    #gray_image = np.squeeze(gray_image)
    #gray_image = preprocess_input(gray_image, True)
    gray_image = np.expand_dims(gray_image, 0)
    #gray_image = np.expand_dims(gray_image, -1)
    label = np.argmax(classifier.predict(gray_image))
    pred_label.append(label)

In [80]:
#pred_label

In [82]:
np.argmax(y_test, axis=1)

array([0, 0, 2, ..., 1, 0, 1])

In [83]:
acc = 0
for pred,true_label in zip(pred_label, np.argmax(y_test,axis=1)):
    if pred == true_label:
        acc += 1 
    else:
        continue
print(acc/len(y_test))
        
                           
    

0.6568761384335154
