In [1]:
#Utility
import re
import time

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

#Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#W2V
import gensim
from gensim.models import KeyedVectors

#Keras
from tensorflow import keras 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


In [2]:
# Column names are supplied via `names=`, so pandas reads the first row of the
# CSV as data rather than as a header.
columns = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    "../input/sentiment140/training.1600000.processed.noemoticon.csv",
    names=columns,
    encoding="ISO-8859-1",
)

In [3]:
#01. Pre-Processing:
#01.00.
positive = "positive"
negative = "negative"
neutral = "neutral"

# Numeric target code found in the raw CSV -> sentiment name.
decode_map = {0: "negative", 2: "neutral", 4: "positive"}

def decode_sentiment(label):
    """Translate a raw target code into its sentiment name.

    Args:
        label: A value convertible with ``int()`` to one of the keys of
            ``decode_map`` (0, 2 or 4), or a sentiment name that has already
            been decoded.

    Returns:
        The sentiment name as a string.

    Raises:
        KeyError: ``label`` converts to an int that is not in ``decode_map``.
        ValueError: ``label`` cannot be converted to an int and is not an
            already-decoded name.
    """
    # The target column is overwritten in place with the decoded names, so a
    # second run of the cell feeds those names back in here. Passing them
    # through unchanged keeps the cell re-runnable instead of failing on
    # int("negative").
    if isinstance(label, str) and label in decode_map.values():
        return label
    return decode_map[int(label)]
# Replace the numeric target codes with sentiment names.
df['target'] = df.target.apply(decode_sentiment)

#01.01.
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
# Raw string: "\S" is not a valid Python string escape, and a non-raw literal
# containing it raises a SyntaxWarning on Python 3.12+. The pattern text itself
# is unchanged. It matches @mentions, http(s) links and any run of
# non-alphanumeric characters.
# NOTE(review): the third alternative ("http?:\S") only adds "htt:" + one
# character on top of what "https?:\S+" already matches - confirm the intent.
txt_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def preprocess(text, stem=False):
    """Clean a single piece of text for tokenisation.

    The input is converted with ``str()``, lower-cased, every match of
    ``txt_re`` is replaced by a space, and the result is split on whitespace.
    Tokens present in ``stop_words`` are dropped.

    Args:
        text: The value to clean; non-strings are stringified first.
        stem: When true, each surviving token is passed through
            ``stemmer.stem``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = re.sub(txt_re, ' ', str(text).lower()).strip()
    # NOTE(review): stop_words is built as a list elsewhere in this notebook,
    # so each membership check is a linear scan - a set would be cheaper.
    kept = [word for word in cleaned.split() if word not in stop_words]
    if stem:
        kept = [stemmer.stem(word) for word in kept]
    return " ".join(kept)
# Clean every tweet (stemming stays off: preprocess is called with its default).
df['text'] = df['text'].apply(preprocess)

#01.02.
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

In [4]:
#02. Processing:

#02.01.
# One whitespace-split token list per training tweet.
documents = [tweet.split() for tweet in df_train.text]

w2v_model = gensim.models.word2vec.Word2Vec(
    vector_size=300,
    window=7,
    min_count=10,
    workers=8,
)
w2v_model.build_vocab(documents)

# Size of the Word2Vec vocabulary. `vocab_size` is reassigned from the Keras
# tokenizer in step 02.02.
words = list(w2v_model.wv.index_to_key)
vocab_size = len(words)

w2v_model.train(documents, epochs=32, total_examples=len(documents))

#02.02.
# The tokenizer is fitted on the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)
vocab_size = 1 + len(tokenizer.word_index)

#02.03.
def _to_padded(texts):
    """Map texts to tokenizer index sequences padded to a length of 300."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=300)

x_train = _to_padded(df_train.text)
x_test = _to_padded(df_test.text)

#02.04
# Distinct target names seen in training, plus the neutral label.
labels = [*df_train.target.unique().tolist(), neutral]

# Encoder is fitted on the training targets and reused for the test targets.
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

# Encode to integers and reshape into a single column.
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

In [5]:
#Test on example (love)
# Print the nearest Word2Vec neighbours for a few probe words.
for probe in ("love", "hate", "cute", "fun"):
    print("associated with " + probe, w2v_model.wv.most_similar(probe), "\n")

In [10]:
#03. Model:
#03.01.
# One 300-wide row per tokenizer index; rows for words missing from the
# Word2Vec model stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token in w2v_model.wv:
        embedding_matrix[row] = w2v_model.wv[token]

# Embedding layer initialised from the Word2Vec vectors and kept frozen.
embedding_layer = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

#03.02.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

#03.03. 
# NOTE(review): both callbacks use patience=5 while fit() below runs for only
# 3 epochs, so neither can fire with these settings - confirm that is intended.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
]

#03.03.
# 10% of the training data is held back by Keras for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=1024,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)
#03.04.
score = model.evaluate(x_test, y_test, batch_size=1024)

#03.05.
SENTIMENT_THRESHOLDS = (0.4, 0.7)
def decode_sentiment(score, include_neutral=True):
    """Turn a model score into a sentiment name.

    NOTE: this rebinds ``decode_sentiment``, which the pre-processing cell
    also defines (there it maps raw target codes to names).

    Args:
        score: The model output to classify.
        include_neutral: When true, scores at or below the first entry of
            ``SENTIMENT_THRESHOLDS`` are negative, scores at or above the
            second entry are positive, and anything in between is neutral.
            When false, the cut is at 0.5: below is negative, otherwise
            positive.

    Returns:
        One of the ``negative`` / ``neutral`` / ``positive`` label values.
    """
    if not include_neutral:
        return negative if score < 0.5 else positive

    if score <= SENTIMENT_THRESHOLDS[0]:
        return negative
    if score >= SENTIMENT_THRESHOLDS[1]:
        return positive
    return neutral

def predict(text, include_neutral=True):
    """Score one piece of text with the trained model.

    The text is run through the fitted ``tokenizer``, padded to length 300 and
    passed to ``model.predict``; the resulting score is turned into a label by
    ``decode_sentiment``.

    Args:
        text: The text to classify.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        A dict with ``"label"`` (the decoded sentiment), ``"score"`` (the
        model output as a Python float) and ``"elapsed_time"`` (seconds spent
        inside this call).
    """
    started_at = time.perf_counter()
    # Named `padded` rather than `x_test` so it cannot be confused with the
    # notebook-level test matrix.
    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # The model ends in Dense(1) and gets a single row, so the prediction holds
    # one value. Extract it as a scalar: calling float() on a 1-element array
    # is deprecated in NumPy >= 1.25, and decode_sentiment should get a number,
    # not an array.
    score = float(np.asarray(model.predict(padded)).ravel()[0])
    label = decode_sentiment(score, include_neutral=include_neutral)
    return {
        "label": label,
        "score": score,
        "elapsed_time": time.perf_counter() - started_at,
    }

In [11]:
#04.Classification report on a confusion matrix.

# True label names for the held-out rows.
y_test_1d = list(df_test.target)

# Model scores for the test matrix, cut at 0.5 into negative / positive.
scores = model.predict(x_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(prob, include_neutral=False) for prob in scores]

print(classification_report(y_test_1d, y_pred_1d))

In [13]:
import time

# Sample review used to try the model on unseen text.
twt = (
    "Hands down the best chicken wings I have ever eaten. "
    "My partner and I were visiting some friends in the area and these "
    "heavenly wings (and a couple of sides of rice) were the only thing requested."
)

predict(twt, include_neutral=True)