## Applying Transfer Learning using Bag of words model

In [0]:
import pandas as pd
import numpy as np
import os
import pickle
import nltk
from nltk.corpus import stopwords
from keras import preprocessing
from keras.preprocessing import text,sequence
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense

## Data Preparation
loading the dataset

In [0]:
imdbTrainPositive = pickle.load(open('imdbTestPositive.pkl', 'rb'))
imdbTestPositive = pickle.load(open('imdbTestPositive.pkl', 'rb'))
imdbTrainNegative= pickle.load(open('imdbTrainNegative.pkl', 'rb'))
imdbTestNegative = pickle.load(open('imdbTestNegative.pkl', 'rb'))

In [0]:
labTrainPos = [1] * len(imdbTrainPositive)
labTrainNeg = [0] * len(imdbTrainNegative)
totalTrainData = imdbTrainPositive + imdbTrainNegative
TotalTrainlabels = labTrainPos + labTrainNeg

In [0]:
train_df = pd.DataFrame({'Text': totalTrainData, 'Sentiment': TotalTrainlabels})

In [0]:
train_data = np.asarray(train_df['Text'])
train_labels = np.asarray(train_df['Sentiment'])

In [0]:
tokenizer = Tokenizer(num_words=250)
tokenizer.fit_on_texts(train_data)
x_train = tokenizer.texts_to_matrix(train_data, mode='freq')

## Model Designing

In [0]:
numWords = x_train.shape[1]

model = Sequential()

model.add(Dense(50, input_shape=(numWords,), activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

history = model.fit(x_train, train_labels,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 50)                12550     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 51        
Total params: 12,601
Trainable params: 12,601
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Importing EDGAR dataset json file

In [0]:
json_df=pd.read_json("cleanedDataset.json")

In [0]:
json_df['sentiment'].value_counts()

Neutral     780
Positive    608
Negative    147
Name: sentiment, dtype: int64

In [0]:
json_df = json_df[json_df['sentiment'] != 'Neutral']
json_df['sentiment'] = json_df['sentiment'].apply(lambda x: 0 if x == 'Negative' else 1)
train_size = int(len(json_df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(json_df) - train_size))

Train size: 604
Test size: 151


Total sentiment count according to category

In [0]:
json_df['sentiment'].value_counts()

1    608
0    147
Name: sentiment, dtype: int64

### Preprocessing the EDGAR dataset

In [0]:
train_posts = json_df['text'][:train_size]
train_tags = json_df['sentiment'][:train_size]

test_posts = json_df['text'][train_size:]
test_tags = json_df['sentiment'][train_size:]

In [0]:
json_lables = json_df['sentiment']
json_text = json_df['text']
maxlen = 10000
tokenizer = Tokenizer()
testing_sequences = tokenizer.texts_to_sequences(json_text)
testing_sequences = preprocessing.sequence.pad_sequences(testing_sequences, maxlen=maxlen)

In [0]:
max_words = 256
batch_size = 24
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [0]:
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

### Testing model accuracy on EDGAR dataset

In [0]:
score = model.evaluate(testing_sequences, json_lables,
                       batch_size=batch_size, verbose=1)
print('Test score using EDGAR:', score[0])
print('Test accuracy using EDGAR:', score[1])

Test score using EDGAR: 2.781882445859593
Test accuracy using EDGAR: 0.1947019888865237
