In [1]:
### This notebook builds a text classifier that takes in task titles
### and predicts the category that they belong to (Work, Home, Shopping).


## Hosted at https://gitlab.com/Task_Management_CompSci/task_classifier

In [21]:
# Import dependencies
!pip install -q pandas tensorflow numpy tensorflow_hub sklearn # quietly install anything we need.

import pandas as pd # use pandas package for file processing. (Reading from todos.csv)
import tensorflow as tf # use tensorflow to create the model itself.
import numpy as np # use numpy for calculations and array handling 
import tensorflow_hub as hub # use tensorflow hub to get hold of pretrained model
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.logging.set_verbosity(tf.logging.ERROR)

In [22]:
# Name of the CSV file that holds the dataset (task titles and their categories).
data_file = "todos.csv"

In [23]:
# Load the dataset into a DataFrame.
df = pd.read_csv(data_file)

# Preview the first five rows to check that the file was parsed properly.
df.head()

Unnamed: 0,task,category
0,Do the shopping,S
1,Go to work,W
2,Do the washing,H
3,Get a haircut,S
4,Print agendas,W


In [24]:
# Train/test split with a 4:1 ratio: the first 80% of the rows are used for
# training and the remaining 20% are held out for testing.
train_size = int(len(df) * .8)

# Task titles, forced to strings.
train_descriptions, test_descriptions = (
    df["task"].iloc[:train_size].astype("str"),
    df["task"].iloc[train_size:].astype("str"),
)

# Matching category labels for the same rows.
train_categories, test_categories = (
    df["category"].iloc[:train_size],
    df["category"].iloc[train_size:],
)

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
# import sklearn preprocessing to create one-hot vectors for training.

In [26]:
# Create one-hot vectors from the training and test labels.
# Therefore, instead of a title being classified as 'S' (for shopping), it is classified as [0, 1, 0].
#
# MultiLabelBinarizer expects every sample to be a *collection* of labels. A bare
# string would be iterated character by character (which only works by accident
# while every category code is a single letter), so each label is wrapped in a
# one-element list.
encoder = MultiLabelBinarizer()
train_encoded = encoder.fit_transform([[label] for label in train_categories])
test_encoded = encoder.transform([[label] for label in test_categories])
num_classes = len(encoder.classes_)

# Check that the encoding worked: the generated vectors should have three values,
# one per category.
num_classes


3

In [27]:
# Build the estimator "head" that is later passed as head= to the DNNEstimator.
# It is configured for num_classes outputs (one per category) and reduces the
# loss with SUM_OVER_BATCH_SIZE.
# NOTE(review): a multi-label head presumably scores each class independently,
# while every task here has exactly one category -- confirm that a multi-class
# head would not be a better fit.
multi_label_head = tf.contrib.estimator.multi_label_head(
    num_classes,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE
)


In [28]:
# Get hold of a pretrained text-embedding module from TensorFlow Hub, which is to
# be transferred onto the problem at hand. It reads the "task" feature and is kept
# frozen (trainable=False), so only the layers placed on top of it are trained.
# The module is fetched over HTTPS rather than plain HTTP.
headline_embeddings = hub.text_embedding_column(
    "task",
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False,
)


In [29]:
# The only feature of each datapoint is its title, stored under the "task" key
# that the text embedding column reads from.
features = {
    # Builtin str is used instead of np.str: the alias is deprecated and has been
    # removed from newer NumPy releases, and both mean the same dtype.
    "task": np.array(train_descriptions).astype(str)
}
labels = np.array(train_encoded).astype(np.int32) # store labels as a numpy array.

# Input function that feeds the training data to the estimator: batch size of 1,
# 25 epochs, shuffled so that each epoch is seen in a different order.
train_input_fn = tf.estimator.inputs.numpy_input_fn(features, labels, shuffle=True, batch_size=1, num_epochs=25)

# Implement the model as a dense network on top of the text embeddings, with two
# hidden layers (64 and 10 units). Model files are written to models/.
estimator = tf.contrib.estimator.DNNEstimator(
    head=multi_label_head,
    hidden_units=[64,10],
    feature_columns=[headline_embeddings], model_dir='models/')



In [30]:
estimator.train(input_fn=train_input_fn) # train the classifier on the training data only (train_input_fn); the test split is held out for evaluation.

<tensorflow.contrib.estimator.python.estimator.dnn.DNNEstimator at 0x1c3f108860>

In [31]:
# Evaluate the model on the held-out test data (no shuffling needed here).
# Builtin str replaces the deprecated np.str alias; the resulting dtype is the same.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"task": np.array(test_descriptions).astype(str)},
    test_encoded.astype(np.int32),
    shuffle=False,
)
estimator.evaluate(input_fn=eval_input_fn)

{'auc': 0.9999999,
 'auc_precision_recall': 0.9999998,
 'average_loss': 0.021712666,
 'loss': 0.021712666,
 'global_step': 1425}

In [32]:
# Feed in the example 'Computer Science homework'; expect it to be classified as work.
data = ["Computer Science homework"]
# Builtin str replaces the deprecated np.str alias; the resulting dtype is the same.
predict_input_fn = tf.estimator.inputs.numpy_input_fn({"task": np.array(data).astype(str)}, shuffle=False)

# Show the label order used by the encoder: the position of each class here is
# the position of that class in the encoded label vectors.
encoder.classes_

array(['H', 'S', 'W'], dtype=object)

In [33]:
# Run the trained model on the example above.
# NOTE(review): presumably `results` is a lazy iterable that can only be consumed once -- confirm before iterating it in more than one cell.
results = estimator.predict(predict_input_fn) # predict the example above.

In [34]:
# Human-readable name for each single-letter category code.
category_names = {"H": "home", "S": "shopping", "W": "work"}

# Names listed in the same order as encoder.classes_, so that the index of the
# highest predicted probability maps to the right name even if the class order
# changes. Unknown codes fall back to the raw code.
category = [category_names.get(code, code) for code in encoder.classes_]

In [35]:
# Show each prediction as text: either work, shopping or home.
for prediction in results:
    best_index = np.argmax(prediction["probabilities"])  # position of the most probable class
    print(category[best_index])

work


In [None]:
# Expected output: 'work' -- the example 'Computer Science homework' should be categorised as work.