# Compare NLP Techniques: Build Models

### Read In Cleaned Text

In [None]:
# Load the cleaned training and test sets
import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four pre-split CSV files in a fixed order and unpack them into the
# feature/label frames used by every model in this notebook
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    ('data/X_train.csv', 'data/X_test.csv', 'data/y_train.csv', 'data/y_test.csv'),
)

# Preview the first rows of the training features
X_train.head()

### Create TF-IDF Vectors

In [None]:
# Learn the TF-IDF vocabulary and weights from the training messages only
# (fit() returns the fitted vectorizer), then encode both splits with that
# same vectorizer so the test set never influences what was learned
tfidf_vect = TfidfVectorizer().fit(X_train['clean_text'])
X_train_vect, X_test_vect = (
    tfidf_vect.transform(split['clean_text']) for split in (X_train, X_test)
)

In [None]:
# What words did the vectorizer learn?
# vocabulary_ maps every learned token to its column index; it holds one entry
# per distinct token, so report its size and show only a short preview
# instead of dumping the whole dict into the cell output.
print('Vocabulary size: {}'.format(len(tfidf_vect.vocabulary_)))
dict(list(tfidf_vect.vocabulary_.items())[:20])

### Fit RandomForestClassifier On Top Of TF-IDF Vectors

In [None]:
# Fit a basic Random Forest model on the TF-IDF vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so
# pin random_state: otherwise every run reports slightly different metrics
# and the model comparison at the end of the notebook is not repeatable.
rf = RandomForestClassifier(random_state=42)
# y_train is a DataFrame (loaded with pd.read_csv); .values.ravel() flattens
# it to the 1-D label array passed to fit(). fit() returns the estimator, so
# rf_model and rf refer to the same fitted object.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [None]:
# Use the trained model to make predictions on the test data.
# At this point X_test_vect holds the TF-IDF encoding of the test messages
# (the same name is reassigned further down for the word2vec features).
y_pred_tfidf = rf_model.predict(X_test_vect)

In [None]:
# Score the TF-IDF model's predictions against the held-out test labels
from sklearn.metrics import precision_score, recall_score

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
# Accuracy: share of test predictions that equal the true label
accuracy_tfidf = (y_pred_tfidf==y_test['label']).sum()/len(y_pred_tfidf)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    *(round(score, 3) for score in (precision_tfidf, recall_tfidf, accuracy_tfidf))))

### Create word2vec Vectors

In [None]:
# Train a basic word2vec model
import gensim

# Word2Vec expects an iterable of token lists. Iterating the X_train
# DataFrame itself yields its column labels, so passing it directly would
# train on the characters of the header string instead of on the messages.
# 'clean_text' is read from CSV, so each token list arrives as a string;
# parse it back into a real list before training (values that are already
# sequences are passed through unchanged).
train_tokens = [ast.literal_eval(text) if isinstance(text, str) else list(text)
                for text in X_train['clean_text']]

w2v_model = gensim.models.Word2Vec(train_tokens,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [None]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)


def message_word_vectors(texts):
    """Return a list with one array of word vectors per message in ``texts``.

    Each message is parsed into its token list first: the 'clean_text' values
    come from CSV as strings, and iterating a string would look up single
    characters rather than words. Tokens the word2vec model did not learn are
    skipped, so a message with no known tokens yields an empty array.

    The result is a plain Python list rather than a NumPy array because the
    per-message arrays have different lengths, and building a ragged array
    without dtype=object raises ValueError on NumPy >= 1.24.
    """
    vectors = []
    for text in texts:
        tokens = ast.literal_eval(text) if isinstance(text, str) else text
        vectors.append(np.array([w2v_model.wv[tok] for tok in tokens if tok in words]))
    return vectors


X_train_vect = message_word_vectors(X_train['clean_text'])
X_test_vect = message_word_vectors(X_test['clean_text'])

In [None]:
# Collapse each message's word vectors into a single vector by averaging them
# (a message with no learned words has an empty array and gets a vector of
# 100 zeros instead)
X_train_vect_avg = [
    vecs.mean(axis=0) if vecs.size else np.zeros(100, dtype=float)
    for vecs in X_train_vect
]

X_test_vect_avg = [
    vecs.mean(axis=0) if vecs.size else np.zeros(100, dtype=float)
    for vecs in X_test_vect
]

In [None]:
# What does the unaveraged version look like?
# Shows the first message only: one vector per word of that message that the
# word2vec model learned.
X_train_vect[:1]

In [None]:
# What does the averaged version look like?
# Shows the first message only: a single vector (the mean of its word
# vectors, or zeros if none of its words were learned).
X_train_vect_avg[:1]

### Fit RandomForestClassifier On Top Of Word Vectors

In [None]:
# Instantiate and fit a basic Random Forest model on top of the averaged word vectors
from sklearn.ensemble import RandomForestClassifier

# Pin random_state so the forest's random bootstrap/feature sampling is
# repeatable; without it the word2vec metrics change on every run and cannot
# be compared fairly with the other techniques.
rf = RandomForestClassifier(random_state=42)
# .values.ravel() flattens the y_train DataFrame to the 1-D label array passed to fit()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [None]:
# Use the trained model to make predictions on the test data.
# rf_model is whichever forest was fitted most recently; run top to bottom,
# that is the one trained on the averaged word vectors.
y_pred_word2vec = rf_model.predict(X_test_vect_avg)

In [None]:
# Score the word2vec model's predictions against the held-out test labels
precision_word2vec = precision_score(y_test, y_pred_word2vec)
recall_word2vec = recall_score(y_test, y_pred_word2vec)
# Accuracy: share of test predictions that equal the true label
accuracy_word2vec = (y_pred_word2vec==y_test['label']).sum()/len(y_pred_word2vec)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    *(round(score, 3) for score in (precision_word2vec, recall_word2vec, accuracy_word2vec))))

### Create doc2vec Vectors

In [None]:
# Create a TaggedDocument for each text message in the training and test sets,
# tagging every message with its positional index within its own split.
# Note: `words` receives the 'clean_text' value exactly as stored in the frame.
tagged_docs_train = [
    gensim.models.doc2vec.TaggedDocument(words=text, tags=[idx])
    for idx, text in enumerate(X_train['clean_text'])
]
tagged_docs_test = [
    gensim.models.doc2vec.TaggedDocument(words=text, tags=[idx])
    for idx, text in enumerate(X_test['clean_text'])
]

In [None]:
# What do these TaggedDocument objects look like?
# Shows the first two training documents, each as a (words, tags) pair.
tagged_docs_train[:2]

In [None]:
# Train a basic doc2vec model
# The tagged documents carry each message's token list as a string (it was
# read from CSV). Iterating a string yields single characters, so training on
# them directly would build a character vocabulary, while infer_vector is
# later called with real word tokens. Parse `words` back into a token list
# for training; tags are kept unchanged, and documents whose words are
# already a sequence are used as they are.
parsed_docs_train = [
    gensim.models.doc2vec.TaggedDocument(
        ast.literal_eval(doc.words) if isinstance(doc.words, str) else doc.words,
        doc.tags)
    for doc in tagged_docs_train
]

d2v_model = gensim.models.Doc2Vec(parsed_docs_train,
                                  vector_size=100,
                                  window=5,
                                  min_count=2)

In [None]:
# Infer the vectors to be used in training and testing.
# `words` on each tagged document is a token list stored as a string; parse it
# with ast.literal_eval, which only accepts Python literals, rather than
# eval(), which would execute any code embedded in the data file.
train_vectors = [d2v_model.infer_vector(ast.literal_eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(ast.literal_eval(v.words)) for v in tagged_docs_test]

### Fit RandomForestClassifier On Top Of Document Vectors

In [None]:
# Fit a basic model, make predictions on the holdout test set, and then generate the evaluation metrics
# Pin random_state so the forest's random sampling is repeatable and the
# doc2vec metrics can be compared fairly with the other techniques.
rf = RandomForestClassifier(random_state=42)
# .values.ravel() flattens the y_train DataFrame to the 1-D label array passed to fit()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

y_pred_doc2vec = rf_model.predict(test_vectors)

precision_doc2vec = precision_score(y_test, y_pred_doc2vec)
recall_doc2vec = recall_score(y_test, y_pred_doc2vec)
# Accuracy: share of test predictions that equal the true label
accuracy_doc2vec = (y_pred_doc2vec==y_test['label']).sum()/len(y_pred_doc2vec)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision_doc2vec, 3), round(recall_doc2vec, 3), round(accuracy_doc2vec, 3)))

### Prep Data for RNN

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the training messages only, then use that fitted
# tokenizer to turn every message in both splits into a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['clean_text'])
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split['clean_text']) for split in (X_train, X_test)
)

In [None]:
# Bring every sequence to the same length (50) so the splits can be fed to
# the network as fixed-width inputs
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=50)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=50)

### Build And Evaluate RNN

In [None]:
# Import the tools needed from keras and define functions to calculate recall and precision
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
    """Recall metric: true positives divided by actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as either 0 or 1. ``K.epsilon()`` is added to the denominator.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric: true positives divided by predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as either 0 or 1. ``K.epsilon()`` is added to the denominator.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

In [None]:
# Assemble the basic RNN as one layer stack:
# embedding -> LSTM -> dense (ReLU) -> single sigmoid output
model = Sequential([
    # Embedding table sized to the tokenizer's word index plus one extra row
    # (presumably so index 0, used for padding, is valid -- confirm)
    Embedding(len(tokenizer.index_word)+1, 32),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# plus the custom precision/recall functions as tracked metrics
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', precision_m, recall_m],
)

In [None]:
# Fit the RNN for 10 epochs in batches of 32, tracking metrics on the test
# split after every epoch.
# The validation labels are passed as the 'label' column, matching how the
# training labels are passed, rather than as the whole y_test DataFrame.
history = model.fit(X_train_seq_padded, y_train['label'],
                    batch_size=32, epochs=10,
                    validation_data=(X_test_seq_padded, y_test['label']))

In [None]:
# Plot basic evaluation metrics across epochs
import matplotlib.pyplot as plt
%matplotlib inline

# Draw one figure per tracked metric, comparing its training and validation
# values across epochs. The legend and y-axis name the metric actually being
# plotted (the legend previously said "Accuracy" on every figure, including
# the precision and recall ones).
for metric in ['accuracy', 'precision_m', 'recall_m']:
    train_values = history.history[metric]
    val_values = history.history['val_{}'.format(metric)]
    epochs = range(1, len(train_values) + 1)

    plt.figure()
    plt.plot(epochs, train_values, label='Training {}'.format(metric))
    plt.plot(epochs, val_values, label='Validation {}'.format(metric))
    plt.title('Results for {}'.format(metric))
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.legend()
    plt.show()

# Per-epoch validation metrics, kept for the final model comparison
precision_rnn = history.history['val_precision_m']
recall_rnn = history.history['val_recall_m']
accuracy_rnn = history.history['val_accuracy']

### Compare Models

In [None]:
# One row per technique: (output template, precision, recall, accuracy).
# The random-forest accuracies are recomputed from the stored predictions;
# the RNN row uses the validation metrics from its last training epoch.
comparison = [
    ('TFIDF -- Precision: {} / Recall: {} / Accuracy: {}',
     precision_tfidf, recall_tfidf,
     (y_pred_tfidf==y_test['label']).sum()/len(y_pred_tfidf)),
    ('W2V -- Precision: {} / Recall: {} / Accuracy: {}',
     precision_word2vec, recall_word2vec,
     (y_pred_word2vec==y_test['label']).sum()/len(y_pred_word2vec)),
    ('D2V -- Precision: {} / Recall: {} / Accuracy: {}',
     precision_doc2vec, recall_doc2vec,
     (y_pred_doc2vec==y_test['label']).sum()/len(y_pred_doc2vec)),
    ('RNN -- Precision: {} / Recall: {} / Accuracy: {}',
     precision_rnn[-1], recall_rnn[-1], accuracy_rnn[-1]),
]

for template, precision, recall, accuracy in comparison:
    print(template.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))