In [0]:
# Initialize drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Move to drive
%cd 'drive/My Drive/Thesis/Data Experimentation'

In [2]:
# Do imports
! pip install keras-metrics
from keras_metrics import categorical_precision, categorical_recall, categorical_f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout
import re
import numpy as np
import pandas as pd
import keras.backend as K
from keras import regularizers
from keras.callbacks import EarlyStopping
from matplotlib.pyplot import hist
import pickle

Collecting keras-metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


Using TensorFlow backend.


In [0]:
# Get relevant data for this task
def get_data(y_labels, plot_hist=True, raw=False, categorical=True):
  df = pd.read_csv('./data/income_data.csv', index_col=0)
  df = df.dropna()  # Drop rows with missing data
  X = df['str']
  Y = df[y_labels]
  
  if categorical:
    Y_list = []
    for i, labels in Y.iterrows():
      y_l = list(labels)
      if raw:
        Y_list.append(y_l)
      else:
        Y_list.append(y_l.index(max(y_l)))
  else:
    Y_list = Y

  # Get histogram of Y_list
  if plot_hist:
    hist(Y_list)

  return X, Y_list

In [0]:
# Preprocess strings
np.random.seed(31415)

# Example is for politics

# Read data
X, Y = get_data(['no_political', 'conservative', 'independent', 'liberal'],
                    categorical=False, plot_hist=False)

In [0]:
# Transform to one-hot representation
Y_l = []
for _, d in Y.iterrows():
  d = list(d)
  pol = sum(d[1:-1])
  if pol >= d[0]:
    Y_l.append([0, 1])
  else:
    Y_l.append([1, 0])
Y_l = np.asarray(Y_l)
print(sum(Y_l))  # Gets the number in each class

# Make X TF-IDF
vec = TfidfVectorizer(stop_words='english', binary=True)
vec.fit(X)
X_tf = vec.transform(X).toarray()

In [0]:
# Get train/test split, train on 90%
x_train, x_test, y_train, y_test = train_test_split(X_tf, Y_l, test_size=0.1, 
                                                    shuffle=True)

In [0]:
# Build simple feed-forward neural network
size = len(x_train[0])

DR_LIST = [0.0, 0.2, 0.3, 0.4, 0.5]
B_S_LIST = [32, 64, 128]
UNITS = [64, 128, 256, 512]
weights = {0:1, 1:1}
classes = [0, 1]

def get_hp(DR_LIST, B_S_LIST, UNITS, e=EPOCHS):
  for d in DR_LIST:
    for b in B_S_LIST:
      for u in UNITS:
        for deep in [True, False]:
          model = Sequential()
          model.add(Dense(input_dim=size, units=u, activation='relu', 
                          # kernel_regularizer=regularizers.l1(l)
                          ))
          model.add(Dropout(rate=d))
          if deep:
            model.add(Dense(units=u, activation='relu',
                            # kernel_regularizer=regularizers.l1(l)
                            ))
            model.add(Dropout(rate=d))
          model.add(Dense(units=len(classes), activation='softmax',
                          # kernel_regularizer=regularizers.l1(l)
                          ))
          model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', categorical_f1_score()])

          res = model.fit(x_train, y_train, batch_size=b, epochs=e, 
                          validation_split=0.1,
                          class_weight=weights, verbose=0
                          )
          print("Deep: %d, Dropout: %f, Batch Size: %d, Units: %d, Val Accuracy: %f Val F1 %f" % 
                (deep, d, b, u, res.history['val_acc'][-1], res.history['val_f1_score'][-1]))
      

def train_and_save_final(d, b, u, filename, deep=True, save=True):
        model = Sequential()
        model.add(Dense(input_dim=size, units=u, activation='relu'))
        model.add(Dropout(rate=d))
        if deep:
          model.add(Dense(units=u, activation='relu',
                          ))
          model.add(Dropout(rate=d))
        model.add(Dense(units=len(classes), activation='softmax'))
        # Cannot save model with the keras-metrics metrics active
        if not save:
          model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'
                                                                                    ,
                                                                                    categorical_precision(), 
                                                                                    categorical_recall(), 
                                                                                    categorical_f1_score()
                                                                                   ])
        else:
          model.compile(loss='categorical_crossentropy', optimizer='adam', 
                        metrics=['accuracy'])

        res = model.fit(x_train, y_train, batch_size=b, epochs=EPOCHS, 
                        class_weight=weights, verbose=1
                        )
        score = model.evaluate(x_test, y_test, batch_size=b)
        print(score)
        if save:
          model.save(filename)
        return model

def validate(d, b, u, e=EPOCHS, deep=False, l=0.0001):
        model = Sequential()
        model.add(Dense(input_dim=size, units=u, activation='relu', 
                        # kernel_regularizer=regularizers.l1(l)
                        ))
        model.add(Dropout(rate=d))
        if deep:
          model.add(Dense(units=u, activation='relu',
                          # kernel_regularizer=regularizers.l1(l)
                          ))
          model.add(Dropout(rate=d))
        model.add(Dense(units=len(classes), activation='softmax',
                        # kernel_regularizer=regularizers.l1(l)
                        ))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 
                                                                                  categorical_precision(), 
                                                                                  categorical_recall(), 
                                                                                  categorical_f1_score()
                                                                                  ])

        res = model.fit(x_train, y_train, batch_size=b, epochs=e, 
                        validation_split=0.1,
                        class_weight=weights, verbose=1
                        )
        return model

In [0]:
get_hp(DR_LIST, B_S_LIST, UNITS)
model = train_and_save_final(d=0.3, b=32, u=128, deep=True, filename='./models/pol.h5', save=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[2.79722146537722, 0.5260115607510643]


In [0]:
from keras.models import load_model
model = load_model('./models/pol.h5')  # Make sure we can load the model
print(model.evaluate(x_test, y_test))

# Ensure that it is not always predicting one class
pred = model.predict(x_test)
p = []
for labels in pred:
  y_l = list(labels)
  p.append(y_l.index(max(y_l)))

r = [0, 0]
for t in p:
  r[t] += 1

print(r)  # Number of predictions for each class
print(sum(y_test))  # Number of each class in test set 

[2.79722146537722, 0.5260115607510643]
[155, 364]
[247 272]


In [0]:
# An abandoned attempt to use focal loss to compensate for class imbalance

# # Implementation of focal loss - implemented in https://github.com/umbertogriffo/focal-loss-keras/blob/master/losses.py
# def focal_loss(gamma=2, alpha=.25):
#   def focal_loss_fixed(y_true, y_pred):

#     # Scale predictions so that the class probas of each sample sum to 1
#     y_pred /= K.sum(y_pred, axis=-1, keepdims=True)

#     # Clip the prediction value to prevent NaN's and Inf's
#     epsilon = K.epsilon()
#     y_pred = K.clip(y_pred, epsilon, 1. - epsilon)

#     # Calculate Cross Entropy
#     cross_entropy = -y_true * K.log(y_pred)

#     # Calculate Focal Loss
#     loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy

#     # Sum the losses in mini_batch
#     return K.sum(loss, axis=1)

#   return focal_loss_fixed

In [0]:
# # Functions used to generate ./data/income_data.csv
# # Read dictionary from file `filename` and return `index_to_word` and
# # `word_to_index` dictionaries.
# # Default `filename` is for the income dictionary.
# def get_dict(filename='./income_dataset/dictionary.txt'):
#   index_to_word = {}
#   word_to_index = {}
  
#   with open(filename, 'r') as file:
#     for row in file:
#       index, word = tuple(row.split(' '))
#       index = int(index)
#       word = word.strip()
#       index_to_word[index] = word
#       word_to_index[word] = index
      
#   # Add UNK for unknown values
#   index_to_word[0] = 'UNK'
#   word_to_index['UNK'] = 0
      
#   return index_to_word, word_to_index

# # Convert unigram frequency count for one user provided by `data` to a vector.
# # Reserves location 0 for out of vocabulary words.
# # If `output_type` is `raw`, the raw unigram frequencies are returned.
# # If `output_type` is `binary`, 1 is returned if the unigram appears and 0 otherwise.
# # `vec_size` is the size of the dictionary + 1. By default it is 71556, the size of
# # `dictionary.txt` for the income data.
# def get_vec(data, output_type='raw', vec_size=71556):
#   # Check that output_type is valid
#   if output_type not in ['raw', 'binary']:
#     raise Exception('Invalid vector type supplied. Choose raw or binary.')
      
#   # Get scores
#   scores = np.zeros(vec_size, dtype=int)
#   data_list = data.split(' ')
  
#   for row in data_list:
#     index, count = row.split(':')
#     if output_type == 'raw':
#       scores[int(index)] = int(count)
#     elif output_type == 'binary':
#       scores[int(index)] = 1
   
#   return scores

# # Read from file `filename` the user_id and unigram data. Return a list of
# # tuples, where each tuple has the user_id and a unigram vector.
# # Default `filename` is the file location for the income data.
# def get_id_uni(filename='./income_dataset/jobs-unigrams.txt', output_type='raw', 
#                vec_size=71556):
  
#   id_list = []
#   with open(filename, 'r') as file:
#     for row in file:
#       # Get user_id
#       user_id = re.findall('^\d+ ', row)
#       if len(user_id) == 1:      
#         # Get vector
#         user_id = int(user_id[0].strip())
#         data = re.sub('^\d+ ', '', row)
#         vec = get_vec(data, output_type=output_type, vec_size=vec_size)
#         id_list.append((user_id, vec))
      
#       # Two appear to be missing this data
#       elif len(user_id) == 0:
#         continue
#       # If we can't parse properly, throw an error
#       else:
#         raise Exception("Unable to read user_id")
  
#   return id_list


# # Associate each unigram in the list with the label specified by `label`. Return
# # a list with all of the unigrams and a list with all of the labels.
# # Automatically reads data from ./income_labels.csv.
# def match_label(data, label, labels_file='./income_labels.csv'):
#   labels = pd.read_csv('./income_labels.csv', index_col=0)
#   X = []
#   Y = []
  
#   for row in data:
#     u_id, uni = row
#     X.append(uni)
#     Y.append(labels[label].loc[u_id])
    
#   return X, Y


# # Convert from counts to a string
# def to_string(x, index_to_word):
#   sen = ""
#   for ind, val in enumerate(x.tolist()):
#     if val > 0:
#       word = i_2_w[ind]
#       sen += ' '.join([word]*val) + ' '
#   return sen.strip()

# # Add strings representations to ./income_labels.csv
# def add_strings(raw, binary, i_2_w):
#   l = pd.read_csv('./income_labels.csv')
#   l.rename({'Unnamed: 0': "u_id"})
#   raw_counts = [None]*len(l)
#   string = [None]*len(l)
#   binary_counts =[None]*len(l)
#   for row in raw:
#     u_id, uni = row
#     ind = l.index[l['u_id'] == u_id][0]
#     raw_counts[ind] = uni
#     string[ind] = to_string(uni, i_2_w)
#   for row in binary:
#     u_id, uni = row
#     ind = l.index[l['u_id'] == u_id][0]
#     binary_counts[ind] = uni

#   l['raw'] = raw_counts
#   l['str'] = string
#   l['bin'] = binary_counts

  
#   l.to_csv('./income_data.csv')