In [14]:
%%time
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Configure the root logger: INFO level, each record rendered as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

CPU times: user 1.72 ms, sys: 0 ns, total: 1.72 ms
Wall time: 1.73 ms


# Sentiment analysis: data preparation, Word2Vec embeddings and LSTM model

In [15]:
%%time
# Fetch the NLTK "stopwords" corpus; stopwords.words("english") is read from it later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
CPU times: user 35 µs, sys: 1.02 ms, total: 1.06 ms
Wall time: 1.07 ms


True

In [18]:
%%time
# DATASET
# Column names handed to pd.read_csv(names=...) when the CSV is loaded.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# Fraction of rows kept for training; the remainder becomes the test split.
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string so that "\S" reaches the regex engine untouched instead of relying
# on Python passing an unrecognised string escape through (which warns).
# Alternatives, in order: @mentions | http(s) links | "htt:"/"http:" followed by
# a single character | any run of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300       # vector size; also the output dimension of the Embedding layer
W2V_WINDOW = 7       # passed as Word2Vec(window=...)
W2V_EPOCH = 32       # passed as epochs=... when training Word2Vec
W2V_MIN_COUNT = 10   # passed as Word2Vec(min_count=...)

# KERAS
SEQUENCE_LENGTH = 300  # maxlen used when padding token sequences
# NOTE(review): the visible model.fit call hard-codes epochs=8 and does not read EPOCHS.
EPOCHS = 10
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper): scores <= lower decode to NEGATIVE, scores >= upper to
# POSITIVE, anything in between to NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11 µs


In [19]:
%%time
# Load the tweets CSV. Column names come from DATASET_COLUMNS because `names=`
# is given explicitly.
df = pd.read_csv(
    "Atraining.1600000.processed.noemoticon.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

ParserError: ignored

In [5]:
%%time
# Report how many rows were loaded into df.
print("Dataset size:", len(df))

NameError: ignored

In [6]:
%%time
# Maps numeric target codes to sentiment names.
decode_map = {
    0: "NEGATIVE",
    2: "NEUTRAL",
    4: "POSITIVE",
}


def decode_sentiment(label):
    """Translate a numeric target code into its sentiment name.

    Args:
        label: Anything ``int()`` accepts (e.g. 0, "4", 2.0).

    Returns:
        The matching entry of ``decode_map``.

    Raises:
        KeyError: If the integer value is not a key of ``decode_map``.
    """
    code = int(label)
    return decode_map[code]

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.63 µs


In [7]:
%%time
# Replace the numeric target codes with sentiment names. This relies on the
# one-argument decode_sentiment(label) defined just above; a later cell rebinds
# that name, so run this cell before that one.
df["target"] = df["target"].apply(decode_sentiment)

NameError: ignored

In [None]:
%%time
# Hold the English stop words in a set: preprocess() tests every token of every
# tweet for membership, and a set lookup does not scan the whole collection the
# way a list does.
stop_words = set(stopwords.words("english"))
# Stemmer applied by preprocess() only when it is called with stem=True.
stemmer = SnowballStemmer("english")

In [8]:
%%time
def preprocess(text, stem=False):
    """Normalise one tweet into a space-separated string of kept tokens.

    The input is converted to ``str`` and lower-cased, every match of
    TEXT_CLEANING_RE is replaced by a single space, and the result is stripped.
    Tokens found in ``stop_words`` are dropped; the survivors are optionally
    stemmed with ``stemmer``.

    Args:
        text: Value to clean (coerced with ``str()``).
        stem: When True, each kept token is passed through ``stemmer.stem``.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10.5 µs


In [9]:
%%time
# Clean every tweet with preprocess() using its default (no stemming).
df["text"] = df["text"].apply(preprocess)

NameError: ignored

In [10]:
%%time
# Split into train/test frames; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(
    df,
    random_state=42,
    test_size=1-TRAIN_SIZE,
)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

NameError: ignored

In [None]:
%%time
# One whitespace-split token list per training tweet (the Word2Vec input).
documents = [tweet.split() for tweet in df_train["text"]]

CPU times: user 3.09 s, sys: 403 ms, total: 3.5 s
Wall time: 3.49 s


In [11]:
%%time
# Create an untrained Word2Vec model; the vocabulary and the training data are
# supplied by later cells (build_vocab / train).
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the gensim version this notebook is pinned to.
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

CPU times: user 784 µs, sys: 0 ns, total: 784 µs
Wall time: 794 µs


In [12]:
%%time
# Build the Word2Vec vocabulary from the tokenised training tweets.
w2v_model.build_vocab(documents)

NameError: ignored

In [None]:
%%time
# Report the size of the Word2Vec vocabulary.
# NOTE: vocab_size is rebound later to the tokenizer's word count, so this value
# is only valid until that cell runs.
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 30369
CPU times: user 0 ns, sys: 995 µs, total: 995 µs
Wall time: 900 µs


In [None]:
%%time
from gensim.models import Word2Vec
# Train Word2Vec on the tokenised tweets for W2V_EPOCH epochs, then persist it.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)
w2v_model.save("word2vec.model")
# To reuse a previously saved model instead of retraining, skip the two lines
# above and load it from disk:
#w2v_model = Word2Vec.load("word2vec.model")

2021-12-09 15:17:54,342 : INFO : loading Word2Vec object from word2vec.model
2021-12-09 15:17:54,950 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2021-12-09 15:17:54,952 : INFO : setting ignored attribute vectors_norm to None
2021-12-09 15:17:54,953 : INFO : loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
2021-12-09 15:17:54,957 : INFO : loading trainables recursively from word2vec.model.trainables.* with mmap=None
2021-12-09 15:17:54,960 : INFO : setting ignored attribute cum_table to None
2021-12-09 15:17:54,966 : INFO : loaded word2vec.model


CPU times: user 575 ms, sys: 100 ms, total: 675 ms
Wall time: 686 ms


In [None]:
%%time
# Fit the Keras tokenizer on the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

# +1 so that the largest word index is still a valid row when this size is used
# for embedding_matrix / the Embedding layer (assumes word_index starts at 1 —
# confirm against the Keras Tokenizer docs).
# NOTE: this rebinds vocab_size, which earlier held the Word2Vec vocabulary size.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 290419
CPU times: user 19.3 s, sys: 117 ms, total: 19.4 s
Wall time: 19.4 s


In [None]:
%%time
# Convert each tweet to a sequence of word indices and pad/truncate it to
# SEQUENCE_LENGTH, giving fixed-width model inputs for both splits.
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.text), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.text), maxlen=SEQUENCE_LENGTH)

CPU times: user 29.4 s, sys: 1.44 s, total: 30.8 s
Wall time: 30.1 s


In [None]:
%%time
# Distinct target values found in the training split, with NEUTRAL added at the end.
labels = df_train.target.unique().tolist() + [NEUTRAL]
labels

CPU times: user 60.2 ms, sys: 967 µs, total: 61.2 ms
Wall time: 60.7 ms


In [None]:
%%time
# Fit the label encoder on the training targets only.
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

# Encode both splits and turn each result into a single-column 2-D array.
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

print("y_train", y_train.shape)
print("y_test", y_test.shape)

y_train (1280000, 1)
y_test (320000, 1)
CPU times: user 1.32 s, sys: 31 ms, total: 1.35 s
Wall time: 1.36 s


In [None]:
%%time
# Sanity check: the inputs and targets of each split must have matching row counts.
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (1280000, 300)
y_train (1280000, 1)

x_test (320000, 300)
y_test (320000, 1)
CPU times: user 1.23 ms, sys: 7 µs, total: 1.24 ms
Wall time: 912 µs


In [None]:
# Peek at the first ten encoded training targets.
y_train[:10]

array([[1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [None]:
# Row i holds the Word2Vec vector of the tokenizer word whose index is i.
# Words that Word2Vec does not know keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(290419, 300)


In [None]:
# Embedding layer initialised from the Word2Vec matrix and kept frozen
# (trainable=False), so training does not update the vectors.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
# Frozen embeddings -> dropout -> LSTM -> single sigmoid unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          87125700  
                                                                 
 dropout (Dropout)           (None, 300, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 87,286,201
Trainable params: 160,501
Non-trainable params: 87,125,700
_________________________________________________________________


In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output unit;
# accuracy is tracked as the reported metric.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
# Training callbacks:
#  - ReduceLROnPlateau lowers the learning rate when val_loss stops improving.
#  - EarlyStopping halts training when validation accuracy stops improving.
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged under 'val_accuracy'. Monitoring the old 'val_acc' key only produces a
# "metric not available" warning and early stopping never triggers.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
%%time
# Train on the padded training sequences, with 10% of the training data used
# for validation (validation_split=0.1); `callbacks` supplies the LR-reduction
# and early-stopping hooks.
# NOTE(review): epochs is hard-coded to 8 although the config cell defines
# EPOCHS = 10 — confirm which value is intended.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=8,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/3




Epoch 2/3




Epoch 3/3




CPU times: user 1h 13min 42s, sys: 7min 6s, total: 1h 20min 49s
Wall time: 59min 43s


In [None]:
%%time
# Evaluate on the held-out test split; `score` is indexed as [loss, accuracy].
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
test_loss, test_accuracy = score[0], score[1]
print()
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)


ACCURACY: 0.7871687412261963
LOSS: 0.45198339223861694
CPU times: user 54.6 s, sys: 13.5 s, total: 1min 8s
Wall time: 1min 13s


In [None]:
from tensorflow.keras.models import load_model

# Load the persisted model into model2, the instance predict() uses.
# The cell that writes 'myModel1.h5' comes later in the notebook, so on a fresh
# environment the file does not exist yet at this point. In that case persist
# the just-trained `model` first so a top-to-bottom run does not fail here.
# An existing file is loaded as-is, as before.
if not os.path.exists('myModel1.h5'):
    model.save('myModel1.h5')
model2 = load_model('myModel1.h5')





In [None]:
def decode_sentiment(score, include_neutral=True):
    """Turn a model score into a sentiment label.

    Note: this rebinds the name ``decode_sentiment`` that an earlier cell
    defined with a one-argument ``(label)`` signature.

    Args:
        score: Model output to classify.
        include_neutral: When True, use the two SENTIMENT_THRESHOLDS bounds and
            allow a NEUTRAL result; when False, split at 0.5 into
            NEGATIVE / POSITIVE only.

    Returns:
        One of NEGATIVE, NEUTRAL or POSITIVE.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

In [None]:
def predict(text, include_neutral=True):
    """Score a single piece of text with ``model2`` and decode its sentiment.

    Args:
        text: The text to classify. It is tokenised with the fitted
            ``tokenizer`` and padded to SEQUENCE_LENGTH.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        dict with keys ``"label"`` (decoded sentiment), ``"score"`` (the model
        output as a Python float) and ``"elapsed_time"`` (seconds spent in this
        call).
    """
    start_at = time.time()
    # Tokenize text. The local is deliberately not called x_test, to avoid
    # shadowing the notebook-level test matrix of that name.
    sequence = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict. The first output row holds a single value; .item() extracts it as
    # a plain Python float. float() on a 1-element array is deprecated in
    # recent NumPy releases.
    score = model2.predict(sequence)[0].item()
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": float(score),
       "elapsed_time": time.time()-start_at}  

In [None]:
# Smoke test: a clearly positive sentence.
predict("I love the music")

{'elapsed_time': 0.19641971588134766,
 'label': 'POSITIVE',
 'score': 0.9754730463027954}

In [None]:
# Smoke test: a negative sentence.
predict("I hate the rain blue")

{'elapsed_time': 0.16640996932983398,
 'label': 'NEGATIVE',
 'score': 0.021492797881364822}

In [None]:
# Smoke test: a sentence without obvious sentiment. With include_neutral=True
# (the default), a score between the two thresholds decodes to NEUTRAL.
predict("i say who is poutine")

{'elapsed_time': 0.2335984706878662,
 'label': 'NEUTRAL',
 'score': 0.6591560244560242}

In [None]:
# Persist the trained `model` to 'myModel1.h5'.
# NOTE(review): an earlier cell reads this same path with load_model(), so the
# file must already exist before that cell can run.
model.save('myModel1.h5')