In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Dense, Dropout, Flatten
from keras import backend as K

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight
from sklearn.utils import resample

Using TensorFlow backend.


In [2]:
from google.colab import files
uploaded = files.upload()

Saving data_classification_85.csv to data_classification_85.csv


In [0]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [0]:
data = pd.read_csv('data_classification_85.csv')

In [4]:
data['age_range'].value_counts()

1    67648
2    61529
3    27304
4     9490
5     6866
Name: age_range, dtype: int64

In [0]:
X_dws = data[data.columns.difference(['age','ifa','age_range'])]
Y_dws = data.loc[:, ('age_range')]

In [7]:
uploaded = files.upload()

Saving features_classification.csv to features_classification.csv


In [0]:
features =pd.read_csv('features_classification.csv')

In [0]:
X_dws_train, X_dws_test, Y_dws_train, Y_dws_test = train_test_split(X_dws[features['column']], Y_dws,
                                                          stratify=Y_dws, test_size=0.1)

In [10]:
input_len = len(list(X_dws_train.columns))
print(input_len)

65


In [0]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y_dws)
y_train = encoder.transform(Y_dws_train)
y_test = encoder.transform(Y_dws_test)

# convert integers to dummy variables (i.e. one hot encoded)
encoded_y_train = np_utils.to_categorical(y_train)
encoded_y_test = np_utils.to_categorical(y_test)

In [0]:
y_all = encoder.transform(Y_dws)
encoded_y_all = np_utils.to_categorical(y_all)

In [13]:
np.unique(Y_dws_train)

array([1, 2, 3, 4, 5])

In [14]:
output_len = encoded_y_train.shape[1]
print(output_len)

5


In [15]:
class_weight_values = class_weight.compute_class_weight('balanced',
                                                 np.unique(Y_dws_train),
                                                 Y_dws_train)
print(class_weight_values)

[0.51098993 0.56180656 1.26599658 3.64250088 5.03489238]


In [0]:
def precision(y_true, y_pred):
    # Calculates the precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    # Calculates the recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def fbeta_score(y_true, y_pred, beta=1):
    # Calculates the F score, the weighted harmonic mean of precision and recall.

    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
        
    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score

def fmeasure(y_true, y_pred):
    # Calculates the f-measure, the harmonic mean of precision and recall.
    return fbeta_score(y_true, y_pred, beta=1)

In [17]:
batch_size = 32
epochs = 100

model = Sequential()
# input/char embedding layer

model = Sequential()
model.add(Dense(80, input_dim=input_len, activation='relu'))
model.add(Dense(60, activation='relu'))
model.add(Dense(50, activation='tanh'))
model.add(Dense(40, activation='tanh'))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(output_len, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',fmeasure,recall,precision])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 80)                5280      
_________________________________________________________________
dense_2 (Dense)              (None, 60)                4860      
_________________________________________________________________
dense_3 (Dense)              (None, 50)                3050      
_________________________________________________________________
dense_4 (Dense)              (None, 40)                2040      
_________________________________________________________________
dense_5 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 105       
Total params: 16,155
Trainable params: 16,155
Non-trainable params: 0
_________________________________________________________________


In [None]:
early_stopping = EarlyStopping(patience=20, verbose=1)
checkpointer = ModelCheckpoint(filepath=r'age_classifier_Unbalanced.hdf5', 
                               verbose=1, 
                               save_best_only=True)
model.fit(X_dws_train, encoded_y_train, 
          batch_size=batch_size, 
          epochs = epochs, 
          verbose=1,
          class_weight = class_weight_values,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])

In [0]:
from google.colab import files
files.download(r'age_classifier_Unbalanced.hdf5')

In [0]:
model.load_weights(r'age_classifier_Unbalanced.hdf5')

In [0]:
y_pred = model.predict(X_dws_test)

In [22]:
score = model.evaluate(X_dws_test, encoded_y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 1.2890656060075572
Test accuracy: 0.42796806294839157
