In [1]:
import fasttext
import math
import re
import pandas as pd
import random as rd
import nltk
import pickle
import itertools
import numpy as np
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from collections import Counter
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Word2vec
import gensim

# Keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Activation, MaxPooling1D, Dropout

In [2]:
# One-time setup: uncomment to fetch the NLTK stop-word corpus.
# nltk.download('stopwords')

# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string so that `\S` reaches the regex engine verbatim instead of being
# parsed as an (invalid) string escape, which newer Python versions warn about.
# The pattern text itself is unchanged.
# NOTE(review): the third alternative `http?:\S` looks like a typo for a URL
# pattern (it makes the "p" optional and consumes only one character) - confirm
# before changing, since it would alter cleaning output.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [3]:
# Load the headerless Sentiment140 CSV; column names come from DATASET_COLUMNS.
# (A bare `df.head()` in the middle of a cell is evaluated and discarded, so it
# was dropped; put it on the last line of a cell to actually display a preview.)
df = pd.read_csv(
    "../../../Datasets/sentiment140_tweet.csv",
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS,
)
print("Dataset size:", len(df))

Dataset size: 1600000


In [4]:
# Held as a set: `preprocess` tests every token of every tweet for membership,
# and a set lookup is constant-time whereas the original list was scanned
# linearly on each test. Membership results are identical.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Clean one piece of text and return its kept tokens as a single string.

    The value is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a space. The result is split on
    whitespace and tokens present in ``stop_words`` are discarded.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        stem: If true, each kept token is replaced by ``stemmer.stem(token)``.

    Returns:
        The kept tokens joined by single spaces (empty string if none remain).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

# Clean every tweet with the default (non-stemming) settings of `preprocess`.
df["text"] = df["text"].apply(preprocess)

In [5]:
# Hold out the fraction of rows not covered by TRAIN_SIZE; the fixed
# random_state makes the shuffle repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)
print(f"TRAIN size: {len(df_train)}")
print(f"TEST size: {len(df_test)}")

TRAIN size: 1280000
TEST size: 320000


In [6]:
# Number of rows taken from the head of each split for this experiment.
train_size = 30000
test_size = 10000

# The full frame is rebound to an empty one here (presumably so the large
# original can be garbage-collected - the splits were already taken above).
df = pd.DataFrame()

# First `train_size` rows of the training split, as plain Python lists.
texts_train = df_train["text"].iloc[:train_size].tolist()
ydata_train = df_train["target"].iloc[:train_size].tolist()
df_train = pd.DataFrame()

# First `test_size` rows of the test split, as plain Python lists.
texts_test = df_test["text"].iloc[:test_size].tolist()
ydata_test = df_test["target"].iloc[:test_size].tolist()
df_test = pd.DataFrame()


# Locations where fastText binaries are kept. Only `metal_flash` is used by
# the load call below; the other two are defined but not referenced in this cell.
# NOTE(review): these are machine-specific absolute paths - consider a
# configurable base directory.
local_model_file = "../../models/inpsmt_e15_d200t200.bin"
external_hdd = "/media/omer/Seagate Backup Plus Drive/OMER/FastText Models/"
metal_flash = "/media/omer/UBUNTU 20_0/NOVA/models/"

# fastText model used further down to turn each tweet into a sentence vector.
model = fasttext.load_model(f"{metal_flash}inpsmt_e15_d200t100.bin")

def one_hot_encode(alist, num_classes=5):
    """Convert integer class labels into one-hot rows.

    Args:
        alist: Iterable of integer labels, each in ``[0, num_classes)``.
        num_classes: Width of every one-hot row. Defaults to 5, the width
            that was previously hard-coded.

    Returns:
        A list with one ``num_classes``-long list per label, holding a single
        1 at the label's index and 0 elsewhere. Empty input gives ``[]``.

    Raises:
        IndexError: If a label lies outside ``[0, num_classes)``. Negative
            labels are rejected explicitly; plain list indexing would
            otherwise wrap around and silently mark the wrong class.
    """
    ret = []
    for value in alist:
        if not 0 <= value < num_classes:
            raise IndexError(
                "label %r is outside the range [0, %d)" % (value, num_classes)
            )
        vec = [0] * num_classes
        vec[value] = 1
        ret.append(vec)
    return ret

# Embed each tweet as one sentence vector from the loaded fastText model.
xdata_train = [model.get_sentence_vector(tweet) for tweet in texts_train]
xdata_test = [model.get_sentence_vector(tweet) for tweet in texts_test]

# Stack features and one-hot labels into NumPy arrays for Keras.
x_train, y_train = np.array(xdata_train), np.array(one_hot_encode(ydata_train))
x_test, y_test = np.array(xdata_test), np.array(one_hot_encode(ydata_test))

# Empty the intermediate Python lists in place now that the arrays exist.
xdata_train.clear()
xdata_test.clear()
ydata_train.clear()
ydata_test.clear()



In [7]:
# Give both feature arrays a trailing axis of length 1, matching the
# (steps, channels) input that the Conv1D layer below is declared with.
x_train, x_test = (
    np.reshape(features, (-1, features.shape[1], 1))
    for features in (x_train, x_test)
)

In [8]:
# Length of one input sample along its first axis (the Conv1D "steps" axis).
dim = len(x_train[0])

# Build the classifier. Note that this rebinds `model`, which until now held
# the fastText model; the embeddings were already computed, so it is no longer
# needed under that name.
# (The original instantiated Sequential() twice and discarded the first one.)
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(dim, 1)))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
# One softmax unit per column of the one-hot label matrix.
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [9]:
# Train for 20 epochs; verbose=2 prints one summary line per epoch.
# NOTE(review): the EPOCHS / BATCH_SIZE constants from the config cell are not
# used here - confirm whether that is intentional.
model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    verbose=2,
)

Epoch 1/20
938/938 - 4s - loss: 0.6580 - accuracy: 0.6120
Epoch 2/20
938/938 - 4s - loss: 0.6189 - accuracy: 0.6533
Epoch 3/20
938/938 - 4s - loss: 0.6150 - accuracy: 0.6585
Epoch 4/20
938/938 - 4s - loss: 0.6097 - accuracy: 0.6616
Epoch 5/20
938/938 - 4s - loss: 0.6080 - accuracy: 0.6626
Epoch 6/20
938/938 - 4s - loss: 0.6039 - accuracy: 0.6665
Epoch 7/20
938/938 - 4s - loss: 0.5999 - accuracy: 0.6674
Epoch 8/20
938/938 - 4s - loss: 0.5965 - accuracy: 0.6711
Epoch 9/20
938/938 - 4s - loss: 0.5946 - accuracy: 0.6716
Epoch 10/20
938/938 - 4s - loss: 0.5904 - accuracy: 0.6757
Epoch 11/20
938/938 - 4s - loss: 0.5862 - accuracy: 0.6804
Epoch 12/20
938/938 - 4s - loss: 0.5826 - accuracy: 0.6803
Epoch 13/20
938/938 - 4s - loss: 0.5783 - accuracy: 0.6869
Epoch 14/20
938/938 - 4s - loss: 0.5745 - accuracy: 0.6901
Epoch 15/20
938/938 - 4s - loss: 0.5685 - accuracy: 0.6975
Epoch 16/20
938/938 - 4s - loss: 0.5648 - accuracy: 0.6985
Epoch 17/20
938/938 - 4s - loss: 0.5601 - accuracy: 0.7027
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7ff34e63fd60>

In [10]:
# Score the held-out sample; with the compile settings above, `results` is
# [loss, accuracy].
results = model.evaluate(
    x_test,
    y_test,
    batch_size=128,
)
print(f"test loss, test acc: {results}")

test loss, test acc: [0.6004053354263306, 0.6783000230789185]
