In [1]:
import pandas as pd

In [2]:
news_df = pd.read_csv("../Data/Clean_data.csv")

In [3]:
news_df.head()

Unnamed: 0,Headline,Is_SentimentHeadline_Positive
0,obama lay wreath arlington national cemetery p...,0
1,tim haywood investment director businessunit h...,0
2,nouriel roubini nyu professor chairman roubini...,1
3,finland economy expand marginally month end de...,1
4,tourism public spending continue boost economy...,1


In [4]:
news_df = news_df.dropna()

## Split data into train and test sets

In [5]:
from sklearn.model_selection import train_test_split

X_train_nv, X_test_nv, y_train, y_test = train_test_split(news_df['Headline'], news_df['Is_SentimentHeadline_Positive'], 
                                                    train_size=0.8, 
                                                    random_state=42)

## Vectorization

In [6]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

vectorizer = TfidfVectorizer(max_features=4000, min_df=7, max_df=0.7, stop_words=stopwords.words('english'))
X_train = vectorizer.fit_transform(X_train_nv).toarray()
X_test = vectorizer.transform(X_test_nv)

# Feature Selection
chi2_selector = SelectKBest(chi2, k=2000)
X_train = chi2_selector.fit_transform(X_train, y_train)
X_test = chi2_selector.transform(X_test)
    

In [7]:
#Save the vectorizer and feature Selector
import pickle

pickle.dump(vectorizer, open("../App/Model/vectorizer.pkl", "wb"))
pickle.dump(chi2_selector, open("../App/Model/chi_selector.pkl", "wb"))

## Training the model

In [8]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.svm import SVC

LR = LogisticRegression()
SGDC = SGDClassifier()
RFC = RandomForestClassifier(n_estimators=300, random_state=0)
MNB = MultinomialNB()

# Logistic Regression
LR.fit(X_train, y_train)
LR_Model = LR.predict(X_test)
print("\nLinear Regression Algorithm\n")
print("Confusion Matrix: \n",confusion_matrix(y_test,LR_Model))
print("Classification Report: \n",classification_report(y_test,LR_Model))
print("Accuracy Score: \n",accuracy_score(y_test, LR_Model))

# Stochastic Gradient Descent
SGDC.fit(X_train, y_train)
SGDC_Model = SGDC.predict(X_test)
print("\nStochastic Gradient Descent Algorithm\n")
print("Confusion Matrix: \n",confusion_matrix(y_test,SGDC_Model))
print("Classification Report: \n",classification_report(y_test,SGDC_Model))
print("Accuracy Score: \n",accuracy_score(y_test, SGDC_Model))

# Random Forest Classifier 
RFC.fit(X_train, y_train)
RFC_Model = RFC.predict(X_test)
print("\nRandom Forest Classifier  Algorithm\n")
print("Confusion Matrix: \n",confusion_matrix(y_test,RFC_Model))
print("Classification Report: \n",classification_report(y_test,RFC_Model))
print("Accuracy Score: \n",accuracy_score(y_test, RFC_Model))

# Multinomial Naive Bayes 
MNB.fit(X_train, y_train)
MNB_Model = MNB.predict(X_test)
print("\nMultinomial Naive Bayes Algorithm\n")
print("Confusion Matrix: \n",confusion_matrix(y_test,MNB_Model))
print("Classification Report: \n",classification_report(y_test,MNB_Model))
print("Accuracy Score: \n",accuracy_score(y_test, MNB_Model))


Linear Regression Algorithm

Confusion Matrix: 
 [[5909 1161]
 [2097 2785]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.84      0.78      7070
           1       0.71      0.57      0.63      4882

    accuracy                           0.73     11952
   macro avg       0.72      0.70      0.71     11952
weighted avg       0.72      0.73      0.72     11952

Accuracy Score: 
 0.7274096385542169

Stochastic Gradient Descent Algorithm

Confusion Matrix: 
 [[6310  760]
 [2694 2188]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.70      0.89      0.79      7070
           1       0.74      0.45      0.56      4882

    accuracy                           0.71     11952
   macro avg       0.72      0.67      0.67     11952
weighted avg       0.72      0.71      0.69     11952

Accuracy Score: 
 0.7110107095046854

Random Forest Classifier  Algorithm

Confusion Matrix: 
 

In [9]:
# Saving the model
pickle.dump(LR, open("../App/Model/logistic_regression.pkl", "wb"))

## Model using Neural Network

### Cleaning the data and vectorization

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_nv)
sequences = tokenizer.texts_to_sequences(X_train_nv)
X_train = pad_sequences(sequences, maxlen=max_len)


test_sequences = tokenizer.texts_to_sequences(X_test_nv)
X_test = pad_sequences(test_sequences, maxlen=max_len)


In [11]:
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from keras.layers import *
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam

### RNN Model

In [12]:
def get_rnn_model():
    model = Sequential()
    
    model.add(Embedding(5000, 100, input_length = 200))
    model.add(LSTM(256))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(512, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    
    return model

In [13]:
model1 = get_rnn_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 100)          500000    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 256)               1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0

In [14]:
learning_rate = 0.001
optimizer = Adam(learning_rate)

model1.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy'])

verbose = 1
epochs = 50
batch_size = 20
validation_split = 0.2
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model1.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_data=(X_test, y_test),
    callbacks = [callback]
    )


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


<tensorflow.python.keras.callbacks.History at 0x2a830417430>

In [16]:
# predict probabilities for test set
y_classes = model1.predict_classes(X_test, verbose=1)
# reduce to 1d array
y_classes = y_classes[:, 0]

accuracy = accuracy_score(y_test, y_classes)
print('Accuracy: %f' % accuracy)

precision = precision_score(y_test, y_classes)
print('Precision: %f' % precision)

recall = recall_score(y_test, y_classes)
print('Recall: %f' % recall)

f1 = f1_score(y_test, y_classes)
print('F1 score: %f' % f1)


Accuracy: 0.716282
Precision: 0.674223
Recall: 0.590946
F1 score: 0.629844


### CNN Model

In [17]:
def get_cnn_model():   
    model = Sequential()
    
    model.add(Embedding(max_words, 100, input_length=200))
    
    model.add(Conv1D(1024, 3, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(2048, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    return model

In [18]:
model2 = get_cnn_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          500000    
_________________________________________________________________
conv1d (Conv1D)              (None, 198, 1024)         308224    
_________________________________________________________________
global_max_pooling1d (Global (None, 1024)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 2048)             

In [19]:
learning_rate = 0.001
optimizer = Adam(learning_rate)

model2.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy'])

verbose = 1
epochs = 50
batch_size = 20
validation_split = 0.2
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model2.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_data=(X_test, y_test),
    callbacks = [callback]
    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<tensorflow.python.keras.callbacks.History at 0x2a837372d30>

In [20]:
# predict probabilities for test set
y_classes = model2.predict_classes(X_test, verbose=1)
# reduce to 1d array
y_classes = y_classes[:, 0]

accuracy = accuracy_score(y_test, y_classes)
print('Accuracy: %f' % accuracy)

precision = precision_score(y_test, y_classes)
print('Precision: %f' % precision)

recall = recall_score(y_test, y_classes)
print('Recall: %f' % recall)

f1 = f1_score(y_test, y_classes)
print('F1 score: %f' % f1)


Accuracy: 0.720716
Precision: 0.651551
Recall: 0.679844
F1 score: 0.665397
