# Cleaning Tweets - NLP

### Importing Libraries

In [40]:
import numpy as np
import random
import _pickle as pickle
import pandas as pd
from nltk import word_tokenize
from gensim.models import word2vec

### Importing DataFrames

In [178]:
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
# First city's tweets
with open("city1_df.pkl", "rb") as city1_file:
    city_1 = pickle.load(city1_file)

# Second city's tweets
with open("city2_df.pkl", "rb") as city2_file:
    city_2 = pickle.load(city2_file)

### Checking the DataFrames

In [35]:
#city_1

Unnamed: 0,cashtags,conversation_id,created_at,date,day,geo,hashtags,hour,id,link,...,search,source,timezone,tweet,user_id,user_id_str,user_rt,user_rt_id,username,City
0,[],1179398288778366977,1570028726000,2019-10-02 11:05:26,3,,"[#pelosi, #schiff, #trumpimpeachmentinquiry]",14,1179412079830327296,https://twitter.com/cynthia_kirk03/status/1179...,...,Trump,,EDT,SCANDAL RANGERS TO THE RESCUE! I'm proud to be...,836002063137951744,836002063137951744,,,cynthia_kirk03,Seattle
1,[],1179138038464540674,1570028726000,2019-10-02 11:05:26,3,,[],14,1179412079066992644,https://twitter.com/k8briz/status/117941207906...,...,Trump,,EDT,Well-what is it?,755855317246672896,755855317246672896,,,k8briz,Seattle
2,[],1179411261337067520,1570028697000,2019-10-02 11:04:57,3,,[],06,1179411954416439296,https://twitter.com/mfow020/status/11794119544...,...,Trump,,EDT,Enemy of the people!!!,987789810570969088,987789810570969088,,,mfow020,Seattle
3,[],1179411261337067520,1570028692000,2019-10-02 11:04:52,3,,[],05,1179411936582209536,https://twitter.com/dougalpollux/status/117941...,...,Trump,,EDT,Top of the morning 😃 Mr President,226099935,226099935,,,dougalpollux,Seattle
4,[],1179196668023054336,1570028671000,2019-10-02 11:04:31,2,,[],23,1179411847642062851,https://twitter.com/Craterdome/status/11794118...,...,Trump,,EDT,Is there any line Trump can cross that would m...,15481999,15481999,,,Craterdome,Seattle
5,[],1179407402933870592,1570028665000,2019-10-02 11:04:25,2,,[],21,1179411820974628871,https://twitter.com/TamaraZRoberson/status/117...,...,Trump,,EDT,So Trump and his supporters threatening to kil...,1028106497237049344,1028106497237049344,,,TamaraZRoberson,Seattle
6,[],1179411261337067520,1570028661000,2019-10-02 11:04:21,2,,[],20,1179411803417264136,https://twitter.com/mfow020/status/11794118034...,...,Trump,,EDT,America needs someone who is that tough!!! Res...,987789810570969088,987789810570969088,,,mfow020,Seattle
7,[],1179411764573872129,1570028651000,2019-10-02 11:04:11,2,,[],17,1179411764573872129,https://twitter.com/ColinDavies1967/status/117...,...,Trump,,EDT,When Trump and his allies finally hit rock bot...,824303270407512064,824303270407512064,,,ColinDavies1967,Seattle
8,[],1179411764074762240,1570028651000,2019-10-02 11:04:11,2,,[#dontheconjr],17,1179411764074762240,https://twitter.com/TheMindOfColi/status/11794...,...,Trump,,EDT,"seriously, i “thought” he was supposed to be t...",975209966,975209966,,,TheMindOfColi,Seattle
9,[],1179411261337067520,1570028644000,2019-10-02 11:04:04,2,,[],15,1179411736170029056,https://twitter.com/Fragola_Girl/status/117941...,...,Trump,,EDT,Nope. Moot. pic.twitter.com/hwbHpQ6OQ0,1284836701,1284836701,,,Fragola_Girl,Seattle


In [179]:
#city_2

## Cleaning and NLP

### Function for balancing the class values

In [180]:
def _drop_random_rows(df, n_drop, rng):
    """Returns df without n_drop randomly chosen rows, selected by position."""
    # Sampling positions (not index labels) guarantees exactly n_drop rows are
    # removed even when the index contains repeated labels.
    dropped = set(rng.sample(range(len(df)), n_drop))
    kept = [pos for pos in range(len(df)) if pos not in dropped]
    return df.iloc[kept]


def city_balance(city_1, city_2, seed=None):
    """
    Balances the number of unique tweets from each city
    Removes brief tweets

    Parameters
    ----------
    city_1, city_2 : DataFrames with a `tweet` column and a `user_id` column.
    seed : optional seed for the random down-sampling. When None (default) the
        global `random` module state is used, as before.

    Returns
    -------
    (city_1, city_2) : the filtered frames, both with the same number of rows.
    """
    # Keeping only tweets with more than 10 characters
    city_1 = city_1[city_1.tweet.map(len) > 10]
    city_2 = city_2[city_2.tweet.map(len) > 10]

    # Keeping a single tweet per user
    city_1 = city_1.drop_duplicates('user_id')
    city_2 = city_2.drop_duplicates('user_id')

    # A dedicated generator keeps a seeded call reproducible without touching
    # the global random state
    rng = random if seed is None else random.Random(seed)

    # Checking for class balance
    dif = abs(len(city_1) - len(city_2))

    # Randomly dropping rows from the DF with a greater number of rows
    if len(city_1) > len(city_2):
        city_1 = _drop_random_rows(city_1, dif, rng)
    elif len(city_2) > len(city_1):
        city_2 = _drop_random_rows(city_2, dif, rng)
    else:
        print("DFs are balanced")

    print(f"DF Lengths:\nCity 1 = {len(city_1)}\nCity 2 = {len(city_2)}")

    return city_1, city_2

In [181]:
# Rebinds city_1/city_2 to the filtered, equal-length frames returned by city_balance
city_1, city_2 = city_balance(city_1, city_2)

DF Lengths:
City 1 = 934
City 2 = 934


In [182]:
# Combining both dataframes
main_df = city_1.append(city_2)
main_df

Unnamed: 0,cashtags,conversation_id,created_at,date,day,geo,hashtags,hour,id,link,...,search,source,timezone,tweet,user_id,user_id_str,user_rt,user_rt_id,username,City
0,[],1179398288778366977,1570028726000,2019-10-02 11:05:26,3,,"[#pelosi, #schiff, #trumpimpeachmentinquiry]",14,1179412079830327296,https://twitter.com/cynthia_kirk03/status/1179...,...,Trump,,EDT,SCANDAL RANGERS TO THE RESCUE! I'm proud to be...,836002063137951744,836002063137951744,,,cynthia_kirk03,Seattle
1,[],1179138038464540674,1570028726000,2019-10-02 11:05:26,3,,[],14,1179412079066992644,https://twitter.com/k8briz/status/117941207906...,...,Trump,,EDT,Well-what is it?,755855317246672896,755855317246672896,,,k8briz,Seattle
2,[],1179411261337067520,1570028697000,2019-10-02 11:04:57,3,,[],06,1179411954416439296,https://twitter.com/mfow020/status/11794119544...,...,Trump,,EDT,Enemy of the people!!!,987789810570969088,987789810570969088,,,mfow020,Seattle
3,[],1179411261337067520,1570028692000,2019-10-02 11:04:52,3,,[],05,1179411936582209536,https://twitter.com/dougalpollux/status/117941...,...,Trump,,EDT,Top of the morning 😃 Mr President,226099935,226099935,,,dougalpollux,Seattle
4,[],1179196668023054336,1570028671000,2019-10-02 11:04:31,2,,[],23,1179411847642062851,https://twitter.com/Craterdome/status/11794118...,...,Trump,,EDT,Is there any line Trump can cross that would m...,15481999,15481999,,,Craterdome,Seattle
5,[],1179407402933870592,1570028665000,2019-10-02 11:04:25,2,,[],21,1179411820974628871,https://twitter.com/TamaraZRoberson/status/117...,...,Trump,,EDT,So Trump and his supporters threatening to kil...,1028106497237049344,1028106497237049344,,,TamaraZRoberson,Seattle
9,[],1179411261337067520,1570028644000,2019-10-02 11:04:04,2,,[],15,1179411736170029056,https://twitter.com/Fragola_Girl/status/117941...,...,Trump,,EDT,Nope. Moot. pic.twitter.com/hwbHpQ6OQ0,1284836701,1284836701,,,Fragola_Girl,Seattle
10,[],1179411261337067520,1570028642000,2019-10-02 11:04:02,2,,[],15,1179411725378043907,https://twitter.com/Surf_City_Gal/status/11794...,...,Trump,,EDT,"... too late, Dotard... gotcha at “moot” pic.t...",81982013,81982013,,,Surf_City_Gal,Seattle
12,[],1179193225325826050,1570028639000,2019-10-02 11:03:59,2,,[],14,1179411714913243137,https://twitter.com/Oldschol69/status/11794117...,...,Trump,,EDT,Trump continues to violate the guidelines of n...,256108334,256108334,,,Oldschol69,Seattle
16,[],1179411530590371841,1570028595000,2019-10-02 11:03:15,2,,[],02,1179411530590371841,https://twitter.com/CindiLewis9/status/1179411...,...,Trump,,EDT,So how will trump harangue Bernie about his he...,292719867,292719867,,,CindiLewis9,Seattle


## Word Embeddings

### Tokenizing

In [59]:
# Labels are the City column; features are the tweets split into tokens with NLTK
target = main_df["City"]
data = main_df["tweet"].map(word_tokenize).to_numpy()

In [60]:
# Every distinct token that appears in any tokenized tweet
tot_vocab = {token for tokens in data for token in tokens}
print(f"Total unique words in DF: {len(tot_vocab)}")

Total unique words in DF: 7994


In [67]:
# Build a word -> vector lookup restricted to words that occur in our tweets.
# The file is read as bytes and only the word itself is decoded.
glove = {}
with open('glove.twitter.27B.200d.txt', 'rb') as glove_file:
    for raw_line in glove_file:
        parts = raw_line.split()
        word = parts[0].decode('utf-8')
        if word not in tot_vocab:
            continue
        glove[word] = np.array(parts[1:], dtype=np.float32)

### Mean Word Embeddings

In [68]:
class W2vVectorizer(object):
    """
    Represents each tokenized document as the mean of its word vectors.

    Parameters
    ----------
    w2v : mapping of word -> 1-D vector. All vectors are expected to share
        the same length; that length is stored in `self.dimensions`
        (0 when the mapping is empty).
    """

    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            # Take the vector size from the mapping that was passed in rather
            # than from a notebook-level global, so the class works on its own
            self.dimensions = len(next(iter(w2v.values())))

    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline.
    def fit(self, X, y=None):
        """No-op; returns self. `y` is optional so fit(X) also works."""
        return self

    def transform(self, X):
        """
        Returns an array with one row per document in X: the element-wise
        mean of the vectors of the document's known words, or a zero vector
        of length `self.dimensions` when none of its words are known.
        """
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                    or [np.zeros(self.dimensions)], axis=0) for words in X])

## Classifiers
* Dummy Classifier - Baseline Model
* Random Forest
* Logistic Regression

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Each pipeline first averages the GloVe vectors of a tweet, then feeds that vector to a classifier.
# random_state is fixed on the stochastic estimators so cross-validation scores are repeatable.
# The dummy strategy is pinned to "stratified" (the default in older scikit-learn releases; newer
# releases default to "prior") so the baseline does not depend on the installed version.
rf = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
               ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True, random_state=42))])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
               ('Logistic Regression', LogisticRegression())])
dum = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ("Dummy Classifier", DummyClassifier(strategy="stratified", random_state=42))])

In [70]:
# (display name, pipeline) pairs, in the order they are evaluated
models = [
    ("Dummy Classifier", dum),
    ("Random Forest", rf),
    ("Logistic Regression", lr),
]

In [71]:
# Mean 2-fold cross-validation score for every (name, pipeline) pair
scores = [
    (label, cross_val_score(pipeline, data, target, cv=2).mean())
    for label, pipeline in models
]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [72]:
# Display the (model name, mean cross-validation score) pairs
scores

[('Dummy Classifier', 0.5225840336134454),
 ('Random Forest', 0.5472689075630253),
 ('Logistic Regression', 0.5283613445378151)]

## Deep Learning and Keras

In [112]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

In [113]:
# One indicator column per city, as a plain NumPy array for Keras
y = pd.get_dummies(target).to_numpy()

In [161]:
# Fit a Keras tokenizer on the raw tweets, convert each tweet to a sequence of word ids,
# then bring every sequence to a fixed length of 100 for the network input
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(main_df.tweet.tolist())
list_tokenized_headlines = tokenizer.texts_to_sequences(main_df.tweet)
X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [162]:
# Network: embedding -> LSTM over the sequence -> max over time -> dense head.
# Input length (100) and embedding vocabulary size (20000) mirror the maxlen and
# num_words values used when the tweets were tokenized and padded.
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
# return_sequences=True keeps one LSTM output per time step so the pooling layer can reduce over them
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(25, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 2 different possible classes, so we use 2 neurons in our output layer
x = Dense(2, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [168]:
# The output layer is a 2-unit softmax trained against one-hot targets, so the matching loss is
# categorical cross-entropy. For two classes the loss value is the same as before, but this
# pairing stays correct if more classes are added and ties 'accuracy' to categorical accuracy.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [169]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "model_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 100, 128)          2560000   
_________________________________________________________________
lstm_16 (LSTM)               (None, 100, 25)           15400     
_________________________________________________________________
global_max_pooling1d_16 (Glo (None, 25)                0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 25)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 25)                650       
_________________________________________________________________
dropout_32 (Dropout)         (None, 25)                0  

In [171]:
# validation_split takes the validation samples from the END of the arrays, before any shuffling.
# The rows are ordered city_1 first, then city_2, so without shuffling the validation set would
# contain tweets from a single city only. Shuffle the rows (with a fixed seed) before fitting.
shuffled_idx = np.random.RandomState(42).permutation(len(X_t))
model.fit(X_t[shuffled_idx], y[shuffled_idx], epochs=10, batch_size=25, validation_split=0.3)

Train on 1332 samples, validate on 572 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a5e139588>