## 1. Neural Network Classifier with scikit-learn

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from reader import PickledReviewsReader
from transformer import TextNormalizer, KeyphraseExtractor

import time
import numpy as np
from functools import wraps

from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

In [None]:
# Corpus location handed to train_model as its `path` argument.
cpath = "categorized-comments.jsonl"
# Destination file for the fitted model (train_model's `saveto` argument).
mpath = "ann_cls.pkl"

In [None]:
def timeit(func):
    """
    Decorator that times each call of the wrapped function.

    The decorated function returns a ``(result, elapsed)`` tuple instead of
    the bare result, where ``elapsed`` is the difference in seconds between
    two ``time.time()`` readings taken around the call.
    :param func: callable to wrap
    :return: wrapped callable carrying func's metadata
    """
    @wraps(func)
    def timed(*args, **kwargs):
        began = time.time()
        outcome = func(*args, **kwargs)
        elapsed = time.time() - began
        return outcome, elapsed
    return timed

def documents(corpus):
    """
    Materialize everything yielded by ``corpus.reviews()`` into a list.
    :param corpus: reader object exposing a ``reviews()`` iterable
    :return: list of reviews, in iteration order
    """
    reviews = corpus.reviews()
    return [*reviews]

def continuous(corpus):
    """
    Materialize everything yielded by ``corpus.scores()`` into a list.
    :param corpus: reader object exposing a ``scores()`` iterable
    :return: list of scores, in iteration order
    """
    scores = corpus.scores()
    return [*scores]

def make_categorical(corpus):
    """
    Bin the continuous corpus scores into integer class labels.

    np.digitize is used with its default right=False, so every bin includes
    its lower edge and excludes its upper edge:
    terrible (1) : 0.0 <= y < 3.0
    okay     (2) : 3.0 <= y < 5.0
    great    (3) : 5.0 <= y < 7.0
    amazing  (4) : 7.0 <= y < 10.1
    Scores below 0.0 map to 0 and scores of 10.1 or more map to 5.
    :param corpus: reader object accepted by continuous()
    :return: numpy array of bin indices, one per score
    """
    bin_edges = [0.0, 3.0, 5.0, 7.0, 10.1]
    scores = continuous(corpus)
    return np.digitize(scores, bin_edges)

@timeit
def train_model(path, model, continuous=True, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data and
    writing it to disk at the saveto path if specified. Returns the scores.

    :param path: corpus location handed to PickledReviewsReader
    :param model: estimator or pipeline to cross-validate and fit
    :param continuous: if True, use the raw scores as targets and score
        with R^2; otherwise use the binned labels from make_categorical
        and score with weighted F1
    :param saveto: optional path; the fitted model is dumped there with joblib
    :param cv: cross-validation setting forwarded to cross_val_score
    :return: the cross-validation scores; because of the timeit decorator
        callers receive a (scores, elapsed_seconds) tuple
    """
    # Load the corpus data and labels for classification
    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    if continuous:
        # The boolean ``continuous`` parameter shadows the module-level
        # continuous() helper inside this function, so calling it here would
        # raise "TypeError: 'bool' object is not callable". Read the scores
        # straight from the corpus instead.
        y = list(corpus.scores())
        # 'r2' is the registered scorer name ('r2_score' is the metric
        # function and is rejected by cross_val_score).
        scoring = 'r2'
    else:
        y = make_categorical(corpus)
        # The binned labels span more than two classes, so an averaged F1
        # scorer is required; 'f1_score' is not a registered scorer name.
        scoring = 'f1_weighted'

    # Compute cross validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    # Return scores as well as training time via decorator
    return scores

In [None]:
# Three-step pipeline: the project's TextNormalizer, a TF-IDF vectorizer and
# a multilayer perceptron with hidden layers of 500 and 150 units.
pipeline = Pipeline(steps=[
    ('norm', TextNormalizer()),  # can use KeyphraseExtractor() instead
    ('tfidf', TfidfVectorizer()),
    ('ann', MLPClassifier(hidden_layer_sizes=[500,150], verbose=True)),
])

In [None]:
 scores, delta = train_model(cpath, pipeline, continuous=False, saveto=mpath)

In [None]:
# Report one line per cross-validation fold, then the overall timing and the
# location of the saved model.
# NOTE: train_model was called with continuous=False, so each value is the
# F1-based cross-validation score for that fold.
for idx, score in enumerate(scores, start=1):
    print("Accuracy on slice #{}: {}.".format(idx, score))
print("Total fit time: {:0.2f} seconds".format(delta))
print("Model saved to {}.".format(mpath))

## 2. Neural Network Classifier with Keras

In [None]:
import os
import time
import numpy as np

from functools import wraps

from sklearn.externals import joblib
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score

from keras.layers.embeddings import Embedding
from keras.models import load_model, Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout, Activation, LSTM

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from reader import PickledReviewsReader
from am_reader import PickledAmazonReviewsReader
from transformer import TextNormalizer, GensimDoc2Vectorizer
from transformer import KeyphraseExtractor, GensimTfidfVectorizer


In [None]:
# Feature count: caps the TF-IDF vocabulary (max_features) and fixes the
# input width of the dense network / vocabulary size of the embedding.
N_FEATURES = 10000
# Sequence length declared as the embedding layer's input_length.
DOC_LEN = 60
# Number of units in the output layer of both networks.
N_CLASSES = 2

In [None]:
def timeit(func):
    """
    Wrap func so that every call also reports how long it took.

    The wrapper returns ``(value, seconds)``: the original return value and
    the difference between two ``time.time()`` readings taken before and
    after the call.
    """
    def wrapper(*args, **kwargs):
        t0 = time.time()
        value = func(*args, **kwargs)
        t1 = time.time()
        return value, t1 - t0
    # Copy func's name/docstring onto the wrapper.
    return wraps(func)(wrapper)

def documents(corpus):
    """
    Collect the items produced by ``corpus.reviews()`` into a list.
    :param corpus: reader object exposing a ``reviews()`` iterable
    :return: list of reviews, in iteration order
    """
    collected = []
    collected.extend(corpus.reviews())
    return collected

def continuous(corpus):
    """
    Collect the items produced by ``corpus.scores()`` into a list.
    :param corpus: reader object exposing a ``scores()`` iterable
    :return: list of scores, in iteration order
    """
    collected = []
    collected.extend(corpus.scores())
    return collected

def make_categorical(corpus):
    """
    Bin the continuous corpus scores into integer class labels.

    Uses np.digitize with its default right=False, so each bin includes its
    lower edge and excludes its upper edge:
    1 : 0.0 <= y < 3.0
    2 : 3.0 <= y < 5.0
    3 : 5.0 <= y < 7.0
    4 : 7.0 <= y < 10.1
    :param corpus: reader object accepted by continuous()
    :return: numpy array of bin indices, one per score
    """
    edges = [0.0, 3.0, 5.0, 7.0, 10.1]
    values = continuous(corpus)
    return np.digitize(values, edges)

def binarize(corpus):
    """
    Bin the continuous corpus scores into two integer class labels.

    Uses np.digitize with its default right=False, so each bin includes its
    lower edge and excludes its upper edge:
    1 : 0.0 <= y < 3.0
    2 : 3.0 <= y < 5.1
    :param corpus: reader object accepted by continuous()
    :return: numpy array of bin indices, one per score
    """
    edges = [0.0, 3.0, 5.1]
    values = continuous(corpus)
    return np.digitize(values, edges)

def build_nn():
    """
    Build and compile a fully connected network: two ReLU hidden layers
    (500 and 150 units) over an N_FEATURES-wide input, followed by a softmax
    output layer with N_CLASSES units.
    :return: compiled Keras neural network model
    """
    layers = [
        Dense(500, activation='relu', input_shape=(N_FEATURES,)),
        Dense(150, activation='relu'),
        Dense(N_CLASSES, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return network

def build_lstm():
    """
    Build and compile an LSTM classifier: an embedding over N_FEATURES+1
    token ids (128 dimensions, sequences of DOC_LEN), dropout, a 200-unit
    LSTM, dropout, and a softmax output layer with N_CLASSES units.
    :return: compiled Keras neural network model
    """
    lstm = Sequential()
    lstm.add(Embedding(N_FEATURES+1, 128, input_length=DOC_LEN))
    lstm.add(Dropout(0.4))
    lstm.add(LSTM(units=200, recurrent_dropout=0.2, dropout=0.2))
    lstm.add(Dropout(0.2))
    # The classes are mutually exclusive and the loss is categorical
    # cross-entropy, so the output must be a probability distribution over
    # the classes: softmax rather than independent sigmoids (this also
    # matches build_nn).
    lstm.add(Dense(N_CLASSES, activation='softmax'))
    lstm.compile(
        # NOTE(review): assumes the targets reach the network one-hot encoded
        # over N_CLASSES columns - confirm against the wrapper in use.
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return lstm

@timeit
def train_model(path, model, reader, saveto=None, cv=12, **kwargs):
    """
    Trains model from corpus at specified path; computing cross-validation
    accuracy using the cv parameter, then fitting the model on the full data
    and writing it to disk at the saveto paths if specified.
    Returns the scores.

    :param path: corpus location handed to PickledAmazonReviewsReader
    :param model: pipeline whose final step exposes the fitted Keras model
        as ``.model`` (required only when saveto is given)
    :param reader: currently unused; the corpus is always read with
        PickledAmazonReviewsReader
    :param saveto: optional dict with the keys 'keras_model' and
        'sklearn_pipe' giving the two output paths
    :param cv: cross-validation setting forwarded to cross_val_score
    :param kwargs: accepted but unused
    :return: the cross-validation scores; because of the timeit decorator
        callers receive a (scores, elapsed_seconds) tuple
    """
    # Load the corpus data and labels for classification
    # corpus = PickledReviewsReader(path) # for Pitchfork
    corpus = PickledAmazonReviewsReader(path)
    X = documents(corpus)
    # y = make_categorical(corpus) # for Pitchfork
    y = binarize(corpus)

    # Compute cross validation scores
    # mp note: http://scikit-learn.org/stable/faq.html#why-do-i-sometime-get-a-crash-freeze-with-n-jobs-1-under-osx-or-linux
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        keras_path = saveto['keras_model']
        pipe_path = saveto['sklearn_pipe']

        # Make sure the target directories exist so that a missing folder
        # does not throw away the training that has just finished.
        for target in (keras_path, pipe_path):
            parent = os.path.dirname(target)
            if parent:
                os.makedirs(parent, exist_ok=True)

        # have to save the keras part using keras' save method
        model.steps[-1][1].model.save(keras_path)
        # ... and use joblib to save the rest of the pipeline. The final step
        # is detached only while dumping and then put back, so the caller's
        # pipeline is still complete after this function returns.
        keras_step = model.steps.pop(-1)
        try:
            joblib.dump(model, pipe_path)
        finally:
            model.steps.append(keras_step)

    # Return scores as well as training time via decorator
    return scores

In [None]:
 cpath = 'categorized-comments.jsonl'
    mpath = {
        'keras_model'  : 'ktf/keras_nn.h5',
        'sklearn_pipe' : 'ktf/pipeline.pkl'
    }

In [None]:
# Three-step pipeline: the project's TextNormalizer, a TF-IDF vectorizer
# limited to N_FEATURES columns, and the Keras network from build_nn wrapped
# as a scikit-learn classifier.
pipeline = Pipeline(steps=[
    ('norm', TextNormalizer()),
    # need to control feature count
    ('vect', TfidfVectorizer(max_features=N_FEATURES)),
    # ('vect', GensimDoc2Vectorizer(size=N_FEATURES)), # need to control feature count
    ('nn', KerasClassifier(
        build_fn=build_nn,  # pass but don't call the function!
        epochs=200,
        batch_size=128,
    )),
])

## 3. Classifying Images

In [2]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
# `K` is the Keras backend module; it has to be imported before it is used.
from keras import backend as K
# Use the (channels, height, width) layout for image tensors.
K.set_image_data_format("channels_first")

In [None]:
# Seed NumPy's global random number generator for repeatable runs.
# NOTE(review): this does not by itself seed the Keras backend - confirm
# whether fully reproducible training is required.
np.random.seed(0)

In [None]:
# Image dimensions, used to reshape the data and to declare the input shape
# of the first network layer.
channels = 1
height = 28
width = 28

In [None]:
# Load MNIST as (data, target) pairs for the training and the test split.
(data_train, target_train), (data_test, target_test) = mnist.load_data()

In [None]:
# Reshape BOTH splits to (samples, channels, height, width). The test images
# are later passed to the network as validation data, so they need the same
# layout as the training images.
data_train = data_train.reshape(data_train.shape[0], channels, height, width)
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

In [None]:
# Divide every value by 255.
# NOTE(review): presumably rescales 8-bit pixel intensities to [0, 1] -
# confirm the value range of the loaded arrays.
features_train = data_train / 255
features_test = data_test / 255

In [None]:
# Convert the integer class labels of both splits into one-hot matrices.
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
# One column per class, so the column count is the number of classes.
number_of_classes = target_test.shape[1]

In [None]:
# Start the network with a convolutional layer: 64 filters, 5x5 window, ReLU.
network = Sequential()
# input_shape follows the (channels, height, width) order used when the data
# is reshaped; the previous (channels, width, height) order only worked
# because the images are square.
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, height, width),
                   activation='relu'))

In [None]:
# Add a max-pooling layer with a 2x2 window.
network.add(MaxPooling2D(pool_size=(2, 2)))

In [None]:
# Add a dropout layer with rate 0.5 after the pooling layer.
network.add(Dropout(0.5))

In [None]:
network.add(Flatten())

In [None]:
# Add a fully connected layer with 128 units and ReLU activation.
network.add(Dense(128, activation="relu"))

In [None]:
# Add a dropout layer with rate 0.5 after the fully connected layer.
network.add(Dropout(0.5))

In [None]:
# Output layer: one unit per class with a softmax activation.
network.add(Dense(number_of_classes, activation="softmax"))


In [None]:
# Compile the network: categorical cross-entropy loss (targets are one-hot),
# RMSprop optimizer, and accuracy reported as the metric.
network.compile(loss="categorical_crossentropy", 
                optimizer="rmsprop", 
                metrics=["accuracy"])

In [None]:
# Train the network; the test split is supplied as validation data.
network.fit(features_train, # Features
            target_train, # Target
            epochs=2, # Number of passes over the training data
            verbose=0, # No progress output during training
            batch_size=1000, # Samples per gradient update
            validation_data=(features_test, target_test)) # Data used for validation

## Conclusion

#### Summary
1. Used scikit-learn to fit a neural network classifier.
2. Used Keras to fit a neural network classifier.
3. Used a convolutional neural network to classify images.