In [6]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import numpy as np

import xgboost as xgb
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.metrics import accuracy_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, BatchNormalization, Embedding, Flatten, LSTM

In [7]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 50

In [8]:
data = pd.read_csv('../../data/tweet-data.csv', encoding='latin-1', header=None)

In [9]:
data.columns = ['sentiment', 'id', 'date', 'q', 'user', 'text']

In [10]:
data.shape

(1600000, 6)

In [11]:
data.head()

Unnamed: 0,sentiment,id,date,q,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [12]:
data = data.drop(['id', 'date', 'q', 'user'], axis=1)

In [13]:
data = data.sample(400000).reset_index(drop=True)

In [14]:
data.sentiment.value_counts()

0    200087
4    199913
Name: sentiment, dtype: int64

In [15]:
data.text[0]

'@sasu167 Th? thï¿½ anh ph?i ng? s?m ?i thï¿½i! ï¿½, nghe nh?c hay ??c sï¿½ch ï¿½! Mï¿½ nghe nh?c thï¿½i, ??c sï¿½ch l?i thï¿½m ?au ??u '

In [16]:
data['word_count'] = data['text'].apply(lambda x: len(str(x).split(" ")))

In [17]:
data.loc[data.sentiment == 0, 'word_count'].mean()

14.758265154657725

In [18]:
data.loc[data.sentiment == 4, 'word_count'].mean()

13.980201387603607

In [19]:
data.text = data.text.str.lower()

In [20]:
def clean_str(string):
    
    string = re.sub(r"https?\://\S+", '', string)
    string = re.sub(r"@\w*\s", '', string)
    string = re.sub(r'\<a href', ' ', string)
    string = re.sub(r'&amp;', '', string) 
    string = re.sub(r'<br />', ' ', string)
    string = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', string)
    string = re.sub('\d','', string)
    return string

In [21]:
data.text = data.text.apply(lambda x: clean_str(str(x)))

In [22]:
pd.Series(' '.join(data['text']).split()).value_counts().head(10)

i      192079
to     141368
the    130698
a       95279
my      79011
and     75568
you     68724
is      59387
it      59007
in      54257
dtype: int64

In [23]:
stop_words = stopwords.words('english')
stop_words = set(stop_words)
remove_stop_words = lambda r: [[word for word in word_tokenize(sente) if word not in stop_words] for sente in sent_tokenize(r)]
data['text'] = data['text'].apply(remove_stop_words)

In [24]:
def combine_text(text):    
    try:
        return ' '.join(text[0])
    except:
        return np.nan

In [25]:
data.text = data.text.apply(lambda x: combine_text(x))

In [26]:
data = data.dropna(how='any')

In [27]:
pd.Series(' '.join(data['text']).split()).value_counts().head(10)

's      44902
n't     43788
'm      33044
good    22963
day     22265
get     20565
like    19671
go      18353
quot    17808
got     17641
dtype: int64

In [28]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(data['text']))
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 127452 unique tokens.


In [29]:
tweets = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [30]:
def load_embedding(filename, word_index , num_words, embedding_dim):
    embeddings_index = {}
    file = open(filename, encoding="utf-8")
    for line in file:
        values = line.split()
        word = values[0]
        coef = np.asarray(values[1:])
        embeddings_index[word] = coef
    file.close()
    
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, pos in word_index.items():
        if pos >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[pos] = embedding_vector
    return embedding_matrix

In [31]:
embedding_matrix = load_embedding('../../embedding/glove.twitter.27B.50d.txt', word_index, len(word_index), EMBEDDING_DIM)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(tweets, pd.get_dummies(data.sentiment), test_size=0.2, random_state=9)

In [36]:
embedding_layer = Embedding(len(word_index),
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False)
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2))
model.add(Dense(2, activation='softmax'))


model.compile(loss='binary_crossentropy', optimizer='sgd', metrics = ['acc'])

In [37]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            6367450   
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 6,428,052
Trainable params: 60,602
Non-trainable params: 6,367,450
_________________________________________________________________


In [38]:
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs=10, batch_size=256)

Train on 319382 samples, validate on 79846 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f445a1fb00>

In [39]:
preds = model.predict(X_test)

In [40]:
accuracy_score(np.argmax(preds, 1), np.argmax(y_test.values, 1))

0.6978683966635774

In [86]:
y_actual = pd.Series(np.argmax(y_test.values, axis=1), name='Actual')
y_pred = pd.Series(np.argmax(preds, axis=1), name='Predicted')
pd.crosstab(y_actual, y_pred, margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,25187,14776,39963
1,9348,30535,39883
All,34535,45311,79846


In [85]:
review_num = 32
print("Tweet: \n"+tokenizer.sequences_to_texts([X_test[review_num]])[0])
sentiment = "Positive" if np.argmax(preds[review_num]) else "Negative"
print("\nPredicted sentiment = "+ sentiment)
sentiment = "Positive" if np.argmax(y_test.values[review_num]) else "Negative"
print("\nActual sentiment = "+ sentiment)

Tweet: 
google actually didnt solve problem

Predicted sentiment = Negative

Actual sentiment = Negative
