In [110]:
# http://help.sentiment140.com/for-students/
import nltk
import numpy as np
import pandas as pd

In [111]:
# Label codes found in the 'Label' column -> readable names
# (only used below to derive the 'Sentiment' column).
labels = {0: 'negative', 2: 'neutral', 4: 'positive'}

# The file is read with header=None, so the column names are supplied here.
columns = ['Label','Id','Timestamp','Query','User','Text']
df = pd.read_csv(
    '../data/external/traindata_processed.csv',
    encoding_errors='ignore',
    header=None,
    names=columns,
)
# Seeded subsample: without random_state every run draws different rows, so
# class counts and all downstream metrics would not be reproducible.
# The seed value matches the one used for the train/validation split.
df = df.sample(200000, random_state=1)
df['Sentiment'] = df['Label'].map(labels)

print(df.shape)
print(df['Label'].value_counts())

(200000, 7)
4    100120
0     99880
Name: Label, dtype: int64


In [112]:
# Text Preprocessing
# NOTE: the regex steps below are duplicated in `preprocess_text` (used for
# single-sentence inference further down); keep the two in sync.
# All steps run on an intermediate Series and df['Text'] is written once at the
# end, so a failure half-way through does not leave the column half-processed.

## Lowercasing
text = df['Text'].str.lower()

## Remove Special Chars
text = text.str.replace(r'^rt\s+@\w+:', '', regex=True)   # leading "rt @user:" prefix
text = text.str.replace(r'(http|@)\S+', '', regex=True)   # URLs and @mentions
text = text.str.replace(r'[^a-z\':_]', ' ', regex=True)   # keep only a-z, ', : and _

## Transform short negation form
text = text.str.replace(r"(can't|cannot)", "can not", regex=True)
text = text.str.replace(r"n't", " not", regex=True)

## Remove Stopwords
# Negation words are excluded from the stop-word list so they stay in the text.
negations = {'not', 'nor', 'no'}
stopwords = [
    word for word in nltk.corpus.stopwords.words('english')
    if word not in negations
]
# Membership is tested once per token over the whole corpus; a frozenset makes
# each test O(1) instead of a linear scan of the list.
stopword_set = frozenset(stopwords)

df['Text'] = text.apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopword_set)
)

In [113]:
from sklearn.model_selection import train_test_split

# Features are the cleaned tweets; the target is binary:
# label 0 -> 0 (negative), any other label -> 1 (positive).
X = df['Text']
y = np.where(df['Label'] == 0, 0, 1)

# Hold out 10% of the rows for validation (seeded, shuffled).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

print(X_train.shape, y_train.shape)

(180000,) (180000,)


In [114]:
import tensorflow as tf
from tensorflow import keras

# --- Hyper-parameters -------------------------------------------------------
VOCAB_SIZE = 8_500            # max_tokens of the vectorizer / input_dim of the embedding
OUTPUT_SEQUENCE_LENGTH = 40   # output_sequence_length passed to the vectorizer
BATCH_SIZE = 64               # batch size passed to model.fit
EPOCHS = 1                    # epochs passed to model.fit (re-assigned in the training cell)
EMBED_DIM = 256               # output_dim of the embedding layer

def get_vectorizer(text_data:pd.Series) -> keras.layers.TextVectorization:
    """Create a TextVectorization layer and adapt it to ``text_data``.

    The layer is configured from the notebook constants ``VOCAB_SIZE``
    (``max_tokens``) and ``OUTPUT_SEQUENCE_LENGTH`` (``output_sequence_length``).
    ``standardize=None`` is passed because the text has already been
    lower-cased and cleaned in the preprocessing cell.

    Args:
        text_data: Texts the layer is adapted on.

    Returns:
        The adapted layer.
    """
    layer_config = {
        'max_tokens': VOCAB_SIZE,
        'output_sequence_length': OUTPUT_SEQUENCE_LENGTH,
        'standardize': None,
    }
    text_vectorizer = keras.layers.TextVectorization(**layer_config)
    text_vectorizer.adapt(text_data)
    return text_vectorizer

# Adapt the vocabulary on the training texts only. Adapting on the full `X`
# would let validation rows influence which tokens enter the vocabulary
# (data leakage) and make the validation score optimistic.
# At this point X_train is still the Series of strings from train_test_split;
# it is only replaced by token ids further down in this cell.
vectorizer = get_vectorizer(X_train)

def vectorize_text(inputs) -> tf.Tensor:
    """Run ``inputs`` through the notebook-level ``vectorizer`` layer.

    Args:
        inputs: Texts to vectorize; callers in this notebook pass a list of
            strings.

    Returns:
        Whatever the adapted ``vectorizer`` layer produces for ``inputs``.
    """
    token_ids = vectorizer(inputs)
    return token_ids

# Replace both text splits with their vectorized form (the names are reused
# by the training cell, so they are rebound rather than renamed).
X_train, X_valid = (
    vectorize_text(list(texts)) for texts in (X_train, X_valid)
)

In [124]:
# Modeling
# token ids -> embedding -> bidirectional GRU -> max over time -> dropout -> sigmoid

inputs = keras.Input(name='input_layer', shape=(None, ), dtype='int64')

embedding = keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBED_DIM,
)(inputs)

# return_sequences=True keeps one output per time step so the pooling layer
# below can take the maximum over the sequence axis.
bi_gru = keras.layers.Bidirectional(
    keras.layers.GRU(units=64, return_sequences=True)
)(embedding)

pooling = keras.layers.GlobalMaxPooling1D()(bi_gru)
dropout = keras.layers.Dropout(rate=0.3)(pooling)

# Single sigmoid unit: one score in [0, 1] for the binary target.
outputs = keras.layers.Dense(units=1, activation='sigmoid')(dropout)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 256)         2176000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         123648    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,299,777
Trainable params: 2,299,777
Non-trainable params: 0
_________________________________________________

In [125]:
# Re-assigns EPOCHS (also set in the configuration cell, to the same value).
EPOCHS = 1

# Train on the vectorized training split and evaluate on the validation split
# after each epoch; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_valid, y_valid),
)



In [169]:
# Class names indexed by the binary target (0 -> negative, 1 -> positive).
# This rebinds `labels`, which earlier held the raw label-code dict.
labels = np.asarray(['negative', 'positive'])

import re
def preprocess_text(sentence:str, stop_words=None) -> str:
    """Clean one raw sentence with the same steps applied to the training text.

    Steps, in order: lower-case; drop a leading ``rt @user:`` prefix; drop URLs
    and @mentions; replace every character other than a-z, ``'``, ``:`` and
    ``_`` with a space; rewrite "can't"/"cannot" as "can not" and any other
    "n't" as " not"; remove stop words; collapse whitespace.

    Stop-word removal was previously missing here although the training text
    has its stop words removed, so the model was fed tokens at inference time
    that it never saw during training.

    Args:
        sentence: Raw input text.
        stop_words: Words to drop. ``None`` (default) uses the notebook-level
            ``stopwords`` list built in the preprocessing cell. Pass an empty
            collection to skip stop-word removal.

    Returns:
        The cleaned sentence: tokens joined by single spaces, with no leading
        or trailing whitespace.

    Raises:
        NameError: If ``stop_words`` is ``None`` and the notebook-level
            ``stopwords`` has not been defined yet.
    """
    if stop_words is None:
        stop_words = stopwords
    stop_words = frozenset(stop_words)

    sentence = sentence.lower()
    sentence = re.sub(r'^rt\s+@\w+:', '', sentence)
    sentence = re.sub(r'(http|@)\S+', '', sentence)
    sentence = re.sub(r'[^a-z\':_]', ' ', sentence)
    sentence = re.sub(r"(can't|cannot)", "can not", sentence)
    sentence = re.sub(r"n't", " not", sentence)

    # split()/join() drops the stop words and also collapses whitespace runs and
    # trims both ends, replacing the old non-raw "\s{2,}" pattern plus strip().
    return ' '.join(word for word in sentence.split() if word not in stop_words)

# Alternative sample input:
# sentence = "I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
sentence = "I'm a happy person and to celebrate that, I wrote about having an #awesome, #happy life."

# clean -> vectorize (batch of one) -> sigmoid score of the single example
sentence = vectorize_text([preprocess_text(sentence)])
preds = model.predict(sentence)[0][0]

print(preds)
# Scores below 0.5 map to index 0 ('negative'), everything else to index 1.
print(labels[0 if preds < 0.5 else 1])




0.97724915
positive
