The cells below import the required libraries and define the constants used throughout this notebook.

In [1]:
import pandas as pd

# NLTK
import nltk
from nltk.corpus import stopwords # a,an,the,etc.
from  nltk.stem import SnowballStemmer

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Word2vec
import gensim

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


# UTILITY
import re
import numpy as np
import logging
import os

Using TensorFlow backend.


In [2]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8  # fraction of rows kept for training

# TEXT CLEANING
# Raw string: "\S" is not a valid Python string escape, so the non-raw form
# only worked because Python passes unknown escapes through (with a warning).
# The pattern value itself is unchanged. It matches @mentions, http/https
# links, and any run of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300      # embedding dimensionality
W2V_WINDOW = 7      # context window
W2V_EPOCH = 32      # training epochs
W2V_MIN_COUNT = 10  # minimum word frequency

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# NOTE(review): presumably (lower, upper) score cut-offs for NEUTRAL; confirm at the point of use.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [3]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shubhammankar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Open the input file.
# os.listdir() returns entries in arbitrary order and includes hidden files
# (e.g. .DS_Store), so taking element [0] blindly may select the wrong file.
# Pick the CSV explicitly and deterministically instead.
csvFiles = sorted(
    entry for entry in os.listdir("./input") if entry.lower().endswith(".csv")
)
if not csvFiles:
    raise FileNotFoundError("No .csv file found in ./input")
fileName = csvFiles[0]
datasetPath = os.path.join(".", "input", fileName)
print("Open file:", datasetPath)
# The CSV has no header row; column names come from DATASET_COLUMNS.
df = pd.read_csv(datasetPath, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

Open file: ./input/training.1600000.processed.noemoticon.csv


In [None]:
# Number of rows loaded from the dataset.
print(df.shape[0])

In [5]:
# Numeric label code -> sentiment name.
decode_map = {
    0: "NEGATIVE",
    2: "NEUTRAL",
    4: "POSITIVE",
}


def decode_sentiment(label):
    """Return the sentiment name for a numeric label.

    `label` is coerced with int() before the lookup, so numeric strings and
    floats are accepted. Raises KeyError for a code missing from `decode_map`.
    """
    code = int(label)
    return decode_map[code]

In [6]:
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))

CPU times: user 750 ms, sys: 14.7 ms, total: 764 ms
Wall time: 785 ms


In [7]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,NEGATIVE,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,NEGATIVE,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,NEGATIVE,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,NEGATIVE,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,NEGATIVE,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Read the corpus through the nltk package rather than through the imported
# `stopwords` name: this cell rebinds `stopwords` to the word list, so the
# previous `stopwords.words(...)` form raised AttributeError ('list' object has
# no attribute 'words') whenever the cell was run a second time.
stopwords = nltk.corpus.stopwords.words("english")
stemmer = SnowballStemmer("english")

In [9]:
def preprocess(text, stem=False):
    """Clean a piece of text and return it as a space-joined token string.

    The input is converted with str(), lower-cased, and every match of
    TEXT_CLEANING_RE is replaced by a space. Tokens found in `stopwords` are
    dropped; when `stem` is true the remaining tokens are passed through
    `stemmer.stem`.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = [word for word in cleaned.split() if word not in stopwords]
    if stem:
        kept = [stemmer.stem(word) for word in kept]
    return " ".join(kept)

In [10]:
# Clean every tweet in place (stemming left at its default, off).
df.text = df.text.apply(preprocess)

In [11]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; the fixed random_state
# keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=1-TRAIN_SIZE,
    random_state=42,
)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

TRAIN size: 1280000
TEST size: 320000


In [12]:
# Whitespace-tokenize each training text into a list of words for Word2Vec.
documents = [
    tweet.split()
    for tweet in df_train.text
]

In [13]:
# Sanity check: one token list per training row.
len(documents)

1280000

In [14]:
# Create an untrained Word2Vec model configured from the W2V_* constants.
model = gensim.models.word2vec.Word2Vec(
    size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=8,
)

In [15]:
# Build the model vocabulary from the tokenized training documents.
model.build_vocab(documents)

In [16]:
# Words known to the Word2Vec model, and how many there are.
vocab_size = len(model.wv.vocab)
words = model.wv.vocab.keys()
print(vocab_size)

30369


In [17]:
# Train the embeddings on the tokenized documents for W2V_EPOCH passes.
model.train(
    documents,
    total_examples=len(documents),
    epochs=W2V_EPOCH,
)

(263132120, 295270528)

In [18]:
# Qualitative check of the embeddings: nearest neighbours of "bad".
model.wv.most_similar('bad')

[('good', 0.5673845410346985),
 ('horrible', 0.5622835159301758),
 ('terrible', 0.5547279119491577),
 ('awful', 0.5265817642211914),
 ('shitty', 0.509876012802124),
 ('crappy', 0.4618406593799591),
 ('like', 0.4380093216896057),
 ('worse', 0.4246855080127716),
 ('worst', 0.41736918687820435),
 ('sad', 0.4143737256526947)]

In [24]:
# NOTE(review): looks like a leftover debug cell (its execution count is out of
# sequence with the cells above) — consider removing it.
print("hey")

hey
