In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
import pandas as pd
import numpy as np
np.random.seed(0)
from nltk import word_tokenize
from gensim.models import word2vec

In [2]:
# df1 = pd.read_csv('model_data.csv')

In [19]:
df = pd.read_csv('CLEAN_EDIT_model_data.csv')

In [20]:
df.drop(columns = ['Unnamed: 0'], inplace= True)

In [24]:
df.tweet = df.tweet.astype(str)

In [25]:
target = df.signal_x

In [26]:
data = df['tweet'].map(word_tokenize).values

In [37]:
data

array([list(['assuming', 'acceleration', 'of', 'to', 'but', 'in', 'a', 'comfortable', 'direction', 'will', 'feel', 'like', 'a', 'mild', 'to']),
       list(['is', 'capable', 'of', 'transporting', 'satellite', 'to', 'orbit', 'crew', 'and', 'cargo', 'to', 'the', 'and', 'mission', 'to', 'the', 'moon', 'an']),
       list(['yup']), ...,
       list(['these', 'article', 'in', 'space', 'news', 'describe', 'why', 'v', 't', 'and', 't', 'w']),
       list(['wa', 'by', 'a', 'saying', 'rocket', 'ha', 'no', 'chance', 'just', 'said', 'the', 'franco', 'german', 'ha', 'no', 'chance', 'so', 'go', 'with']),
       list(['just', 'returned', 'from', 'a', 'trip', 'to', 'and', 'where', 'i', 'met', 'with', 'many', 'interesting', 'people', 'i', 'really', 'like'])],
      dtype=object)

In [28]:
type(data)

numpy.ndarray

In [29]:
y = pd.get_dummies(target).values

In [30]:
tokenizer = text.Tokenizer(num_words=15000)
tokenizer.fit_on_texts(list(df.tweet))
list_tokenized_tweets = tokenizer.texts_to_sequences(df.tweet)
X_t = sequence.pad_sequences(list_tokenized_tweets, maxlen=100)

In [31]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(10000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 41 different possible classes, so we use 41 neurons in our output layer
x = Dense(3, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [32]:
# embedding_size = 128
# input_ = Input(shape=(100,))
# x = Embedding(10000, embedding_size)(input_)
# x = LSTM(25, return_sequences=True)(x)
# x = GlobalMaxPool1D()(x)
# x = Dropout(0.5)(x)
# x = Dense(50, activation='relu')(x)
# x = Dropout(0.5)(x)
# # There are 41 different possible classes, so we use 41 neurons in our output layer
# x = Dense(3, activation='relu')(x)

# model = Model(inputs=input_, outputs=x)

In [33]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          1280000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 25)           15400     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 25)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                1300      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
__________

In [35]:
model.fit(X_t, y, epochs=20, batch_size=32, validation_split=0.1)

Train on 1981 samples, validate on 221 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a3f6c2748>

---

### New model experiments

#### GloVe

In [40]:
total_vocabulary = set(word for tweet in data for word in tweet)
len(total_vocabulary)

3235

In [41]:
glove = {}
with open('glove.6B/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [42]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [45]:
rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression())])

In [46]:
models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

In [47]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [48]:
scores

[('Random Forest', 0.3592187757795743),
 ('Support Vector Machine', 0.40009074410163337),
 ('Logistic Regression', 0.36284606500577465)]