## Library Import

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

## Data Import and Process
Data source: https://www.kaggle.com/aaron7sun/stocknews

In [3]:
data = pd.read_csv('Combined_News_DJIA.csv')
# 8:2 split proportion
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row,2:27]))
testheadlines = []
for row in range(0, len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))

# People are more likely to react to the bad news than to the good news
# We set a high threshold, which means that the news should be good enough so that the DJIA will rise
THRESHOLD = 0.7

## Modeling
(All the GridSearchCV are done on the Google Cloud Virtual Machine to get the optimal parameters of each model with the highest accuracy)

### Logistic Regression

1. CountVectorizer(single word) + Logistic Regression(C = 0.1, solver = 'lbfgs')

In [None]:
param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              "logisticregression__solver": ['newton-cg', 'lbfgs', 'sag'],
              "countvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "countvectorizer__analyzer": ['char', 'char_wb', 'word']
             }
grid = GridSearchCV(make_pipeline(CountVectorizer(max_features = 200000), LogisticRegression(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results:
# {'countvectorizer__analyzer': 'char',
#  'countvectorizer__ngram_range': (1, 1),
#  'logisticregression__C': 0.1,
#  'logisticregression__solver': 'lbfgs'}

In [4]:
basicvectorizer = CountVectorizer(max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = LogisticRegression(C = 0.1, solver = 'lbfgs')
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))



0.4444444444444444


In [5]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(5)

Unnamed: 0,Word,Coefficient
19419,nigeria,0.290278
25261,self,0.25195
15998,korea,0.245221
29286,tv,0.239821
26323,so,0.235939


In [6]:
coeffdf.tail(5)

Unnamed: 0,Word,Coefficient
16949,low,-0.259891
7139,country,-0.278557
25433,sex,-0.285887
24754,sanctions,-0.309829
24542,run,-0.341968


2. TfidfVectorizer(two connected words) + Logistic Regression(solver = 'newton-cg')

In [None]:
param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              "logisticregression__solver": ['newton-cg', 'lbfgs', 'sag'],
              "tfidfvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "tfidfvectorizer__analyzer": ['char', 'word']
             }
grid = GridSearchCV(make_pipeline(TfidfVectorizer(max_features = 200000), LogisticRegression(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results:
# {'logisticregression__C': 1,
#  'logisticregression__solver': 'newton-cg',
#  'tfidfvectorizer__analyzer': 'word',
#  'tfidfvectorizer__ngram_range': (2, 2)}

In [7]:
basicvectorizer = TfidfVectorizer(ngram_range = (2,2), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = LogisticRegression(solver = 'newton-cg')
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.49206349206349204


In [8]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(5)

Unnamed: 0,Word,Coefficient
4932,and other,0.423823
108497,right to,0.400289
121841,set to,0.343416
149475,the first,0.33725
151294,the pope,0.329685


In [9]:
coeffdf.tail(5)

Unnamed: 0,Word,Coefficient
29642,if he,-0.330328
30479,in gaza,-0.353752
192198,with iran,-0.367594
10093,bin laden,-0.390027
148766,the country,-0.412813


3. (Best Model)  CountVectorizer(two and three connected words) + Logistic Regression

In [10]:
basicvectorizer = CountVectorizer(ngram_range = (2,3), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))



0.5793650793650794


In [11]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(5)

Unnamed: 0,Word,Coefficient
160852,right to,0.296239
170796,set to,0.291049
7012,and other,0.285157
179603,the first,0.280615
45219,in south,0.268066


In [12]:
coeffdf.tail(5)

Unnamed: 0,Word,Coefficient
191551,up in,-0.240302
183528,there is,-0.245284
186761,to kill,-0.24878
197466,with iran,-0.251981
178908,the country,-0.350941


In [14]:
pd.DataFrame(pd.groupby(pd.DataFrame({'act': test['Label'], 'pre': predictions}),['act','pre']).size())

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,0
act,pre,Unnamed: 2_level_1
0,0,136
0,1,50
1,0,109
1,1,83


### Naive Bayes

1. CountVectorizer(one or two connected words) + Naive Bayes(alpha = 1)

In [None]:
param_grid = {"countvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "countvectorizer__analyzer": ['char', 'char_wb', 'word'],
              "multinomialnb__alpha": [1, 0.1, 0.01, 0.001, 0.0001]
             }
grid = GridSearchCV(make_pipeline(CountVectorizer(max_features = 200000), MultinomialNB(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results: 
# {'countvectorizer__analyzer': 'word',
#  'countvectorizer__ngram_range': (1, 2),
#  'multinomialnb__alpha': 1}

In [15]:
basicvectorizer = TfidfVectorizer(ngram_range = (1,2), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = MultinomialNB(alpha = 1)
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.5105820105820106


In [16]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(5)

Unnamed: 0,Word,Coefficient
144244,the,-7.692195
155473,to,-7.901651
67515,of,-8.034307
38333,in,-8.034625
5191,and,-8.6083


In [17]:
coeffdf.tail(5)

Unnamed: 0,Word,Coefficient
199990,zumas,-12.300006
199991,zumas trademark,-12.300006
199995,zurich will,-12.300006
199996,zuyevo,-12.300006
199997,zuyevo rice,-12.300006


2. TfidfVectorizer(one or two connected words) + Naive Bayes(alpha = 0.1)

In [None]:
param_grid = {#"tfidfvectorizer__min_df": np.arange(0, 0.5, 0.01),
              #"tfidfvectorizer__max_df": np.arange(0.5, 1, 0.01),
              "tfidfvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "tfidfvectorizer__analyzer": ['char', 'word'],
              "multinomialnb__alpha": [1, 0.1, 0.01, 0.001, 0.0001]
             }
grid = GridSearchCV(make_pipeline(TfidfVectorizer(max_features = 200000), MultinomialNB(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results: 
# {'multinomialnb__alpha': 0.1,
#  'tfidfvectorizer__analyzer': 'word',
#  'tfidfvectorizer__ngram_range': (1, 2)}

In [18]:
basicvectorizer = TfidfVectorizer(ngram_range = (1,2), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = MultinomialNB(alpha = 0.1)
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.4947089947089947


In [19]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : basicwords, 
                        'Coefficient' : basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
coeffdf.head(5)

Unnamed: 0,Word,Coefficient
144244,the,-5.990242
155473,to,-6.201811
67515,of,-6.336056
38333,in,-6.336378
5191,and,-6.920023


In [20]:
coeffdf.tail(5)

Unnamed: 0,Word,Coefficient
199990,zumas,-12.891621
199991,zumas trademark,-12.891621
199995,zurich will,-12.891621
199996,zuyevo,-12.891621
199997,zuyevo rice,-12.891621


### Random Forest

1. CountVectorizer(single words) + Random Forest(n_estimators = 200, max_depth = 8)

In [None]:
param_grid = {"countvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "countvectorizer__analyzer": ['char', 'char_wb', 'word'],
              "randomforestclassifier__n_estimators": [200, 500, 700],
              "randomforestclassifier__max_features": ['auto', 'sqrt', 'log2'],
              "randomforestclassifier__max_depth": [4, 5, 6, 7, 8]
             }
grid = GridSearchCV(make_pipeline(CountVectorizer(max_features = 200000), RandomForestClassifier(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results:
# {'countvectorizer__analyzer': 'char',
#  'countvectorizer__ngram_range': (1, 1),
#  'randomforestclassifier__max_depth': 8,
#  'randomforestclassifier__max_features': 'auto',
#  'randomforestclassifier__n_estimators': 200}

In [21]:
basicvectorizer = CountVectorizer(ngram_range = (1, 1), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = RandomForestClassifier(n_estimators = 200, max_depth = 8, max_features = 'auto')
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.49206349206349204


2. TfidfVectorizer(single words) + Random Forest(n_estimators = 700, max_depth = 5)

In [None]:
param_grid = {"tfidfvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "tfidfvectorizer__analyzer": ['char', 'word'],
              "randomforestclassifier__n_estimators": [200, 500, 700],
              "randomforestclassifier__max_features": ['auto', 'sqrt', 'log2'],
              "randomforestclassifier__max_depth": [4, 5, 6, 7, 8]
             }
grid = GridSearchCV(make_pipeline(TfidfVectorizer(max_features = 200000), RandomForestClassifier(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, train['Label'])
grid.best_params_

# results:
# {'randomforestclassifier__max_depth': 5,
#  'randomforestclassifier__max_features': 'auto',
#  'randomforestclassifier__n_estimators': 700,
#  'tfidfvectorizer__analyzer': 'char',
#  'tfidfvectorizer__ngram_range': (1, 1)}

In [22]:
basicvectorizer = TfidfVectorizer(ngram_range = (1, 1), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = RandomForestClassifier(n_estimators = 700, max_depth = 5)
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.49206349206349204


### Neural Network

1. CountVectorizer(one or two connected words) + Neural Network()

In [None]:
def one_hot_encoder(array):
    encoder = LabelEncoder()
    encoder.fit(array)
    coded_array = encoder.transform(array)
    n = len(coded_array)
    n_labels = len(np.unique(coded_array))
    one_hot = np.zeros((n,n_labels))
    one_hot[np.arange(n), coded_array] = 1
    return one_hot
param_grid = {"countvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "countvectorizer__analyzer": ['char', 'char_wb', 'word'],
              "mlpclassifier__learning_rate": ['constant', 'invscaling', 'adaptive'],
              "mlpclassifier__solver": ['sgd', 'lbfgs', 'adam'],
              "mlpclassifier__activation": ['logistic', 'tanh', 'relu'],
              "mlpclassifier__hidden_layer_sizes": [(30,), (60,), (80,)],
              "mlpclassifier__max_iter": [500, 1000]
             }
grid = GridSearchCV(make_pipeline(CountVectorizer(max_features = 200000), MLPClassifier(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, one_hot_encoder(train['Label']))
grid.best_params_

# result:
# {"countvectorizer__analyzer": 'word',
#  "countvectorizer__ngram_range": (1, 2),
#  "mlpclassifier__activation": 'logistic',
#  "mlpclassifier__hidden_layer_sizes": (60,)
#  "mlpclassifier__learning_rate": 'adaptive',
#  "mlpclassifier__max_iter": 500,
#  "mlpclassifier__solver": 'lbfgs'}

In [25]:
basicvectorizer = CountVectorizer(ngram_range = (1, 2), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = MLPClassifier(solver='lbfgs', activation = 'logistic', hidden_layer_sizes=(60,), max_iter = 500, learning_rate = 'adaptive')
basicmodel = basicmodel.fit(basictrain, one_hot_encoder(train["Label"]))
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.4973544973544973


2. TfidfVectorizer(two and three connected words) + Neural Network()

In [None]:
def one_hot_encoder(array):
    encoder = LabelEncoder()
    encoder.fit(array)
    coded_array = encoder.transform(array)
    n = len(coded_array)
    n_labels = len(np.unique(coded_array))
    one_hot = np.zeros((n,n_labels))
    one_hot[np.arange(n), coded_array] = 1
    return one_hot
param_grid = {"countvectorizer__ngram_range": [(1,1), (1,2), (1,3), (2,2), (2,3)],
              "countvectorizer__analyzer": ['char', 'char_wb', 'word'],
              "mlpclassifier__learning_rate": ['constant', 'invscaling', 'adaptive'],
              "mlpclassifier__solver": ['sgd', 'lbfgs', 'adam'],
              "mlpclassifier__activation": ['logistic', 'tanh', 'relu'],
              "mlpclassifier__hidden_layer_sizes": [(30,), (60,), (80,)],
              "mlpclassifier__max_iter": [500, 1000]
             }
grid = GridSearchCV(make_pipeline(CountVectorizer(max_features = 200000), MLPClassifier(),
                                  memory="cache_folder"),
                    param_grid=param_grid, cv=5, scoring = 'accuracy'
                   )
grid.fit(trainheadlines, one_hot_encoder(train['Label']))
grid.best_params_

# result:
# {"countvectorizer__analyzer": 'word',
#  "countvectorizer__ngram_range": (2, 3),
#  "mlpclassifier__activation": 'relu',
#  "mlpclassifier__hidden_layer_sizes": (30,)
#  "mlpclassifier__learning_rate": 'constant',
#  "mlpclassifier__max_iter": 500,
#  "mlpclassifier__solver": 'lbfgs'}

In [26]:
basicvectorizer = TfidfVectorizer(ngram_range = (2, 3), max_features = 200000)
basictrain = basicvectorizer.fit_transform(trainheadlines)
basicmodel = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30,), max_iter = 500)
basicmodel = basicmodel.fit(basictrain, one_hot_encoder(train["Label"]))
basictest = basicvectorizer.transform(testheadlines)
predictions = np.where(basicmodel.predict_proba(basictest)[:,1] > THRESHOLD, 1, 0)
print(accuracy_score(test['Label'], predictions))

0.5608465608465608
