<a href="https://colab.research.google.com/github/OmarMeriwani/Fake-Financial-News-Detection/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis
This document contains the source code for the sentiment analysis model.

In [0]:
import numpy as np
from string import punctuation
import pandas as pd
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from sklearn.model_selection import train_test_split
from nltk.stem.porter import *
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding, GlobalMaxPooling1D
from keras.layers.merge import Concatenate
from keras.utils import np_utils
import os
from stanfordcorenlp import StanfordCoreNLP
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared NLP helpers used by the cells below.
# WordNet lemmatizer applied to every token in clean_doc() and readfile().
lemmatizer = WordNetLemmatizer()
# Machine-specific location of the Java executable (Windows path); adjust for your setup.
java_path = "C:/Program Files/Java/jdk1.8.0_161/bin/java.exe"
os.environ['JAVAHOME'] = java_path
# Address of the CoreNLP service; presumably a server already listening on this
# host/port -- TODO confirm against the stanfordcorenlp documentation.
host='http://localhost'
port=9000
# CoreNLP client used by readfile() for tokenization, NER and POS tagging.
scnlp =StanfordCoreNLP(host, port=port,lang='en', timeout=30000)
# NOTE(review): the stemmer is created here but not referenced by any cell in this notebook.
stemmer = PorterStemmer()


The normalization step includes tokenization, removal of punctuation and stop words, and finally lowercasing and lemmatizing the words.

In [0]:
def clean_doc(doc):
    """Normalize a raw document into a list of lowercase, lemmatized words.

    The text is stripped of non-ASCII characters, split on whitespace, and
    each token has its punctuation removed and is lowercased. English stop
    words, single-character tokens and tokens that are not purely alphabetic
    are discarded; the remaining tokens are lemmatized.

    Args:
        doc: The document text as a ``str``.

    Returns:
        A list of normalized tokens in their original order.
    """
    # Drop non-ASCII characters; the remaining bytes are plain ASCII, so
    # decoding them as UTF-8 cannot fail.
    doc = doc.encode('ascii', errors='ignore').decode("utf-8")
    table = str.maketrans('', '', punctuation)
    stop_words = set(stopwords.words('english'))

    tokens = []
    for raw in doc.split():
        # Lowercase BEFORE the stop-word test: the comparison is
        # case-sensitive, so "The" or "And" would otherwise slip through and
        # only be lowercased afterwards.
        word = raw.translate(table).lower()
        if word in stop_words:
            continue
        # Keep only alphabetic tokens longer than one character.
        if len(word) > 1 and word.isalpha():
            tokens.append(lemmatizer.lemmatize(word))
    return tokens


This method looks up every word of the vocabulary in a set of pretrained embeddings and, when the word is found, copies its vector into the weight matrix to build the semantic vector representations. Words that do not exist in the embeddings are left as zero vectors.

In [0]:
def get_weight_matrix2(embedding, vocab):
    """Build an embedding weight matrix for a vocabulary.

    Args:
        embedding: Object exposing ``get_vector(word)`` that returns a
            300-dimensional vector and raises ``KeyError`` for unknown words.
        vocab: Sized iterable of ``(word, index)`` pairs, e.g. the ``items()``
            view of a word-to-index dictionary.

    Returns:
        A ``(len(vocab) + 1, 300)`` array in which row ``index`` holds the
        vector of the corresponding word. Rows of words missing from
        ``embedding`` (and any index not present in ``vocab``) stay all-zero.
    """
    # One extra row so that indices up to len(vocab) are addressable.
    vocab_size2 = len(vocab) + 1
    weight_matrix = zeros((vocab_size2, 300))
    for word, i in vocab:
        try:
            vector = embedding.get_vector(word)
        except KeyError:
            # Word is not covered by the embeddings: keep its row as zeros.
            # Only KeyError is skipped so that real failures (and Ctrl-C)
            # are no longer silently swallowed.
            continue
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix


The method below reads a CSV dataset file and performs the following tasks:
* Get the claim and the label.
* Replace named entities.
* Replace numbers.
* Remove stop words.
* Remove punctuation.
* Get POS Tags.
* Then create an array of sentences and labels


In [0]:
def readfile(filename):
    """Read a CSV dataset and return normalized claims with their labels.

    For each row, the text at column position 1 is tokenized by the
    module-level CoreNLP client ``scnlp``; all-uppercase tokens are swapped
    for a placeholder, punctuation and English stop words are removed, and
    the remaining tokens are lowercased and lemmatized. The cleaned sentence
    is then tagged so that numeric tokens become ``#num`` and tokens whose
    POS tag is ``NNP`` or whose named-entity tag is not ``O`` become
    ``#ner``. The label is read from column position 4.

    Rows whose cleaned sentence is empty are skipped. Every kept row is also
    printed.

    Args:
        filename: Path of the CSV file; its first line is used as the header.

    Returns:
        A ``pandas.DataFrame`` with the columns ``claim`` and ``label``,
        one row per kept sentence, indexed from 0.
    """
    df = pd.read_csv(filename, header=0)

    table = str.maketrans('', '', punctuation)
    # Loop-invariant, so built once instead of once per row.
    stop_words = set(stopwords.words('english'))
    rows = []

    for row_idx in range(len(df)):
        sentence = str(df.iloc[row_idx, 1])
        tokens = scnlp.word_tokenize(sentence)

        # Tokens made only of uppercase letters are replaced by the entity
        # placeholder (an empty token counts as "all uppercase" as well).
        tokens = ['#ner' if all(ch.isupper() for ch in word) else str(word)
                  for word in tokens]

        # NOTE: '#' is part of string.punctuation, so the placeholder is
        # reduced to 'ner' by this step.
        tokens = [w.translate(table) for w in tokens]
        tokens = [w for w in tokens if w not in stop_words]
        # Drop one-character tokens, then lowercase and lemmatize the rest.
        tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if len(word) > 1]
        sentence = ' '.join(tokens)

        NER = scnlp.ner(str(sentence))
        POS = scnlp.pos_tag(str(sentence).lower())
        sentenceList = []
        # A dedicated index is used here: reusing the row index would make
        # the label lookup below read from the wrong row.
        for tok_idx in range(len(NER)):
            w = NER[tok_idx][0]
            n = NER[tok_idx][1]
            # The POS tag comes from the POS result, not from the NER result;
            # guard the lookup in case both lists differ in length.
            pos = POS[tok_idx][1] if tok_idx < len(POS) else None
            if str(w).isnumeric():
                sentenceList.append('#num')
                continue
            if pos == 'NNP' and w != '#ner':
                sentenceList.append('#ner')
                continue
            if str(n) == 'O':
                sentenceList.append(w)
            else:
                sentenceList.append('#ner')
        sentence = ' '.join(sentenceList)

        label = int(df.iloc[row_idx, 4])
        if sentence.strip() != '':
            rows.append([sentence, label])
            print(sentence, label)

    # Build the frame once; growing it row by row copies the data each time.
    return pd.DataFrame(rows, columns=['claim', 'label'])


Using the previous method, the data is loaded and separated into claims and labels. Note that the dataset train.csv must be downloaded from this [link](https://www.kaggle.com/c/fake-news). The other dataset, FakeNewsSA.csv, already exists in the path, but the column indices used in the readfile method must be changed before switching to it.


In [0]:
# Parse the dataset into a DataFrame with the columns 'claim' and 'label'.
data = readfile('Kaggle Competition/train.csv')
# Alternative dataset (needs different column indices inside readfile):
#data = readfile('FakeNewsSA.csv')
# Double brackets keep both selections as single-column DataFrames.
claims = data[['claim']]
labels = data[['label']]


Splitting the resulting sets into training and test sets, and converting the labels into a categorical (one-hot) representation.

In [0]:
# Hold out 20% of the samples for testing; no random_state is given, so the
# split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(claims,labels,test_size=0.2)
traindata = np.array(x_train)
testdata = np.array(x_test)

# Keep the original integer labels before y_test is rebound to its
# two-class categorical encoding.
y_testold = y_test
y_test = np_utils.to_categorical(y_test,num_classes=2)
print(y_testold, y_test)
y_train = np_utils.to_categorical(y_train,num_classes=2)


Selecting the required fields from test and training claims and converting them into sequences.

In [0]:
# Column 0 of the arrays holds the claim text (the only column selected above).
train_docs = traindata[:,0]
# The tokenizer vocabulary is fitted on the training claims only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)
test_docs = testdata[:,0]
# Training claims as lists of integer word indices.
encoded_docs = tokenizer.texts_to_sequences(train_docs)


Padding the resulting sequences to a fixed length and building the embedding layer from the pretrained Google News embeddings.

In [0]:
# Sequence length = number of whitespace-separated words in the longest training claim.
max_length = max([len(s.split()) for s in train_docs])
print('max_length', max_length)
# Pad the training sequences at the end up to max_length.
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# encoded_docs is reused: from here on it holds the test sequences.
encoded_docs = tokenizer.texts_to_sequences(test_docs)
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

# load embedding from file
# NOTE(review): machine-specific absolute path to the pretrained vectors; adjust for your setup.
wv_from_bin = KeyedVectors.load_word2vec_format(datapath('E:/Data/GN/GoogleNews-vectors-negative300.bin'), binary=True)
# Weight matrix with one 300-dimensional row per tokenizer index.
embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())

print('embedding_vectors.shape() =============================')
print(embedding_vectors.shape)

# create the embedding layer
# trainable=False: the pretrained weights are not updated during training.
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)


Deep neural network parameters and creating input layer.

In [0]:
# Hyper-parameters. filter_sizes, num_filters and batch_size are used by the
# cells below.
# NOTE(review): embeding_dim, dropout_prob and num_epochs are not referenced
# by any later cell (the model uses literal dropout rates and fit() uses epochs=50).
embeding_dim = 300
filter_sizes = (1,2,3,4)
num_filters = 100
dropout_prob = (0.0, 0.5)
batch_size = 64
num_epochs = 500
print('max_length',max_length)
# The model consumes one padded index sequence of length max_length per sample.
input_shape = (max_length,)
model_input = Input(shape=input_shape)
# Embedded input sequence fed to the convolution branches.
zz = embedding_layer(model_input)


Creating the model:

In [0]:
# One convolution branch per kernel size; each branch is reduced to a single
# feature vector with global max pooling.
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(zz)
    conv = GlobalMaxPooling1D()(conv)
    conv_blocks.append(conv)
# Merge the branches. With a single branch there is nothing to concatenate,
# so its output is used directly; the condition must wrap the whole call,
# otherwise Concatenate would be handed a lone tensor instead of a list.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(0.8)(z)
# Classification head: two 10-unit dense layers with dropout, then a
# two-unit output matching the two-class categorical labels.
model_output = Dense(10, activation="sigmoid" , bias_initializer='zeros')(z)
model_output = Dense(10)(model_output)
model_output = Dropout(0.8)(model_output)
model_output = Dense(2, activation="selu")(model_output)
model = Model(model_input, model_output)


Training and Evaluation

In [0]:
# Compile with the categorical hinge loss and the Adam optimizer, tracking accuracy.
model.compile(loss="categorical_hinge", optimizer="adam", metrics=["accuracy"])
model.summary(85)
# NOTE(review): the epoch count is the literal 50 (num_epochs is not used) and
# the test split also serves as validation data during training.
history = model.fit(Xtrain, y_train, batch_size=batch_size, epochs=50,
    validation_data=(Xtest, y_test), verbose=2)
print('History', history.history)
# Final loss and accuracy on the held-out test split.
loss, acc = model.evaluate(Xtest, y_test, verbose=2)
print('Test Accuracy: %f' % (acc*100))
