In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/gdrive; the dataset and model paths used below live under it.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Drive folders for the compressed review files (dataset_path) and saved artifacts (model_path).
# NOTE(review): model_path has no trailing '/', and a later cell builds the file name as
# model_path+'tokenizer.pickle', so that pickle is written to '.../modelstokenizer.pickle'.
dataset_path='/content/gdrive/My Drive/Sentiment analysis/datasets'
model_path= '/content/gdrive/My Drive/Sentiment analysis/models'

In [None]:
import numpy as np 
import pandas as pd 
import bz2

In [None]:
def labels_text(x):
  """Read a bz2-compressed text file and split every line into a label and a text.

  For each UTF-8 decoded line, the character at index 9 is parsed as a digit and
  shifted down by one to form the label; everything from index 10 onward, with
  surrounding whitespace stripped, is the text. (Presumably the lines follow the
  fastText ``__label__<digit> <text>`` layout -- confirm against the dataset.)

  Args:
    x: Path to the ``.bz2`` file.

  Returns:
    A tuple ``(labels, texts)`` where ``labels`` is a NumPy array of ints and
    ``texts`` is a list of strings, both in file order.

  Raises:
    IndexError: If a line is shorter than 10 characters.
    ValueError: If the character at index 9 is not a digit.
  """
  label = []
  texts = []
  # Use a context manager so the file handle is released even when a line
  # fails to parse, instead of being leaked until garbage collection.
  with bz2.BZ2File(x) as source:
    for line in source:
      decode = line.decode("utf-8")
      label.append(int(decode[9]) - 1)
      texts.append(decode[10:].strip())
  return np.array(label), texts

In [None]:
# Parse the train and test archives under dataset_path into (labels, texts) pairs.
train_label, train_text = labels_text(dataset_path+'/train.ft.txt.bz2')
test_label, test_text = labels_text(dataset_path+'/test.ft.txt.bz2')

In [None]:
# Show the first training review as parsed, before normalization is applied.
train_text[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [None]:
# Show the first test review as parsed, before normalization is applied.
test_text[0]

'Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"'

In [None]:
import re
# Matches any single character that is not a word character (\w: letters, digits, underscore).
NON_ALPHANUM = re.compile(r'[\W]')
# Matches anything other than lowercase ASCII letters, digits and whitespace.
# The digit range is 0-9; a range of 0-1 would silently delete the digits 2 through 9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')
def normalize_texts(texts):
    """Lowercase each text and reduce it to ASCII letters, digits and whitespace.

    Every non-word character is first replaced by a single space (so words
    separated only by punctuation stay separated); any remaining character
    outside ``a-z``, ``0-9`` and whitespace (for example underscores or
    accented letters) is then removed.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one normalized string per input text, in input order.
    """
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
# Replace the raw review strings of both splits with their normalized form.
train_text = normalize_texts(train_text)
test_text = normalize_texts(test_text)

In [None]:
from sklearn.model_selection import train_test_split
# Carve a 20% validation split out of the training data; the fixed seed keeps it repeatable.
train_text, val_text, train_label, val_label = train_test_split(
    train_text,
    train_label,
    test_size=0.2,
    random_state=57643892,
)

In [None]:
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

# Vocabulary cap passed to the Tokenizer (num_words) and reused as the Embedding input size.
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Fit on the training split only, then apply the same fitted vocabulary to every split.
tokenizer.fit_on_texts(train_text)
# Each split is rebound in turn, replacing its strings with integer sequences.
train_text = tokenizer.texts_to_sequences(train_text)
val_text = tokenizer.texts_to_sequences(val_text)
test_text = tokenizer.texts_to_sequences(test_text)

In [None]:
# Length of the longest tokenized training example; also used as the model's input length.
MAX_LENGTH = max(len(train_ex) for train_ex in train_text)
# Bring every split to MAX_LENGTH columns so they can be fed to the network as arrays.
train_text = pad_sequences(train_text, maxlen=MAX_LENGTH)
val_text = pad_sequences(val_text, maxlen=MAX_LENGTH)
test_text = pad_sequences(test_text, maxlen=MAX_LENGTH)

In [None]:
import pickle

# Persist the fitted tokenizer so the same vocabulary can be reloaded for inference.
# NOTE(review): model_path has no trailing '/', so this writes
# '<...>/Sentiment analysis/modelstokenizer.pickle' rather than a file inside the models folder.
with open(model_path+'tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the pickle straight back into tokenizer1 (presumably a round-trip check; tokenizer1
# is not referenced again in the visible cells).
# SECURITY: pickle.load can execute arbitrary code -- only load pickles you created yourself.
with open(model_path+'tokenizer.pickle', 'rb') as handle:
    tokenizer1 = pickle.load(handle)

In [None]:
def build_model():
    """Assemble and compile the 1-D convolutional sentiment classifier.

    The network embeds token ids (vocabulary size ``MAX_FEATURES``, 64
    dimensions), applies three 64-filter ``Conv1D`` stages (kernel sizes 3, 5
    and 5) with batch normalization and max pooling after the first two,
    collapses the sequence axis with global max pooling, and ends with a
    100-unit ReLU layer feeding a single sigmoid unit.

    Returns:
        A Keras ``Model`` taking inputs of length ``MAX_LENGTH``, compiled with
        the ``rmsprop`` optimizer, binary cross-entropy loss and the
        ``binary_accuracy`` metric.
    """
    token_ids = layers.Input(shape=(MAX_LENGTH,))
    net = layers.Embedding(MAX_FEATURES, 64)(token_ids)

    # Stage 1: kernel size 3, normalize, pool by 3.
    net = layers.Conv1D(64, 3, activation='relu')(net)
    net = layers.BatchNormalization()(net)
    net = layers.MaxPool1D(3)(net)

    # Stage 2: kernel size 5, normalize, pool by 5.
    net = layers.Conv1D(64, 5, activation='relu')(net)
    net = layers.BatchNormalization()(net)
    net = layers.MaxPool1D(5)(net)

    # Stage 3: kernel size 5, then reduce over the whole sequence.
    net = layers.Conv1D(64, 5, activation='relu')(net)
    net = layers.GlobalMaxPool1D()(net)
    net = layers.Flatten()(net)

    # Classification head.
    net = layers.Dense(100, activation='relu')(net)
    sentiment = layers.Dense(1, activation='sigmoid')(net)

    classifier = models.Model(inputs=token_ids, outputs=sentiment)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['binary_accuracy'],
    )
    return classifier
    
# Instantiate the network; build_model reads MAX_LENGTH and MAX_FEATURES defined in earlier cells.
model = build_model()

In [None]:
# Train for two epochs, reporting loss/accuracy on the validation split after each one.
model.fit(
    x=train_text,
    y=train_label,
    validation_data=(val_text, val_label),
    epochs=2,
    batch_size=128,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f31ca92c7f0>

In [None]:
# Save the architecture as JSON and the trained weights as a separate .h5 file.
# NOTE(review): the file names say "epoch30" but the fit cell above trains with epochs=2 --
# confirm the name is intentional.
model_json = model.to_json()
with open("/content/gdrive/My Drive/Sentiment analysis/full_balanced_modelA76_epoch30.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("/content/gdrive/My Drive/Sentiment analysis/full_balanced_modelA76_epoch30.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
# Evaluate on the test split; the model was compiled with one metric, so this yields
# [loss, binary_accuracy].
model.evaluate(test_text,test_label)



[0.1438097506761551, 0.9466074705123901]

In [None]:
from sklearn.metrics import confusion_matrix
# Score the test split, round the sigmoid outputs to 0/1 and tabulate them against the true labels.
p = model.predict(test_text)
confusion_matrix(y_true=test_label, y_pred=np.round(p))

array([[192056,   7944],
       [ 13361, 186639]])

In [None]:
from keras.models import model_from_json
# Rebuild the architecture from its JSON description, then restore the trained weights.
# The context manager closes the file even if reading raises, unlike a manual open()/close() pair.
# NOTE(review): these paths use 'MyDrive' while the save cell uses 'My Drive' -- confirm both
# resolve to the same Drive folder in your runtime.
with open('/content/gdrive/MyDrive/Sentiment analysis/full_balanced_modelA76_epoch30.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights('/content/gdrive/MyDrive/Sentiment analysis/full_balanced_modelA76_epoch30.h5')

# Compiled here without metrics and with 'adam' (the training cell used 'rmsprop').
loaded_model.compile(loss='binary_crossentropy', optimizer='adam')
loaded_model.summary()
print("Loaded model from disk")

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 255)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 255, 64)           768000    
_________________________________________________________________
conv1d (Conv1D)              (None, 253, 64)           12352     
_________________________________________________________________
batch_normalization (BatchNo (None, 253, 64)           256       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 84, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 80, 64)            20544     
_________________________________________________________________
batch_normalization_1 (Batch (None, 80, 64)           

In [None]:
# Reload the fitted tokenizer for inference. The hard-coded 'modelstokenizer.pickle' name matches
# what model_path+'tokenizer.pickle' produces in the save cell (model_path has no trailing '/').
# SECURITY: pickle.load can execute arbitrary code -- only load pickles you created yourself.
with open('/content/gdrive/MyDrive/Sentiment analysis/modelstokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Tokenize one example review and score it with the reloaded model.
s = tokenizer.texts_to_sequences(["Amazing product ever seen"])
# Pad to the sequence length the network was built with (its input shape is (None, length))
# rather than a hard-coded 300, so inference inputs match the padding used during training.
d = pad_sequences(s, maxlen=loaded_model.input_shape[1])
loaded_model.predict(d)

array([[0.9809776]], dtype=float32)