In [None]:
import pandas as pd
from textblob import TextBlob

In [None]:
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of ``text``.

    Args:
        text: The text to analyse. Any value that is not ``str``/``bytes``
            (for example ``None`` or a NaN float read from a DataFrame) is
            treated as missing.

    Returns:
        ``TextBlob(text).sentiment.subjectivity``, or ``None`` when the input
        is missing or TextBlob fails to analyse it.
    """
    # Missing cells arrive as None/NaN rather than text; there is nothing to score.
    if not isinstance(text, (str, bytes)):
        return None
    try:
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        # Best effort: a failed analysis yields None. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None

In [None]:
def getPolarity(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: The text to analyse. Any value that is not ``str``/``bytes``
            (for example ``None`` or a NaN float read from a DataFrame) is
            treated as missing.

    Returns:
        ``TextBlob(text).sentiment.polarity``, or ``None`` when the input is
        missing or TextBlob fails to analyse it.
    """
    # Missing cells arrive as None/NaN rather than text; there is nothing to score.
    if not isinstance(text, (str, bytes)):
        return None
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best effort: a failed analysis yields None. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None

In [None]:
sentence_df = pd.read_csv('input/Combined_News_DJIA.csv')

# Merge the Top1..Top25 headline columns into one text per row.
# - Headlines are joined with a space so the last word of one headline does
#   not fuse with the first word of the next.
# - Missing headlines are skipped instead of turning the whole row into NaN
#   (string + NaN is NaN).
headline_cols = ['Top' + str(i) for i in range(1, 26)]
sentence_df['Combined'] = (
    sentence_df[headline_cols]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(h for h in row if h), axis=1)
)

# Score every row that has text. Rows with no headlines at all are masked to
# NaN first, so they stay unscored (NaN) just as they were skipped before.
# Series.map replaces the row-by-row .at writes and Series.iteritems(), which
# no longer exists in pandas 2.x.
combined_text = sentence_df['Combined'].where(sentence_df['Combined'].str.len() > 0)
sentence_df['Subjectivity'] = combined_text.map(getSubjectivity).astype(float)
sentence_df['Polarity'] = combined_text.map(getPolarity).astype(float)

In [None]:
# Preview the first five rows of the headline frame.
sentence_df.head(n=5)

In [None]:
# Report the frame's dimensions, then the dtype of every column.
print(sentence_df.shape, sentence_df.dtypes, sep='\n')

In [None]:
# Columns kept for modelling: the date, the target label and the two
# sentiment scores.
tmp_list = ['Date', 'Label', 'Subjectivity', 'Polarity']

# Take an explicit copy so the working frame is independent of sentence_df;
# later cells assign into it, and writing into a derived selection is what
# triggers pandas' SettingWithCopyWarning.
my_dataframe = sentence_df[tmp_list].copy()
print(my_dataframe.shape)
my_dataframe.head()

In [None]:
# Move the 'Label' column to the end so the feature columns come first.
cols = list(my_dataframe)
print(cols)
cols.append(cols.pop(cols.index('Label')))
# DataFrame.ix was removed in pandas 1.0; .loc with a list of labels performs
# the same column re-ordering and returns a new frame.
my_dataframe = my_dataframe.loc[:, cols]
my_dataframe.head()

In [None]:
print(cols)
# Order the rows by index label. Assigning a sorted copy of the index back to
# `.index` would only relabel the rows in place (detaching labels from their
# data if the index were not already sorted); sort_index() moves the rows
# together with their labels.
my_dataframe = my_dataframe.sort_index()
my_dataframe.head()

In [None]:
# Summary statistics for the numeric columns, printed as plain text.
summary_stats = my_dataframe.describe()
print(summary_stats)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

In [None]:
from datetime import date
print(my_dataframe.head())

# Count the rows dated on or before 2014-12-31.
# The cutoff is converted to a pandas Timestamp first: current pandas releases
# raise TypeError when a datetime64 column is ordered against a plain
# datetime.date.
train_cutoff = pd.Timestamp(date(2014, 12, 31))
train_size = int((pd.to_datetime(my_dataframe["Date"]) <= train_cutoff).sum())
print(train_size)

In [None]:
print(my_dataframe.dtypes)
print(my_dataframe.count())
nan_list = ['Subjectivity', 'Polarity']

# Fill missing sentiment scores with the column mean so every row is usable.
# NOTE(review): the mean is computed over all rows, including those that end
# up in the test split below.
for col in nan_list:
    my_dataframe[col] = my_dataframe[col].fillna(my_dataframe[col].mean())

print(my_dataframe.count())
# Select the features by name rather than by a label slice, so the result
# does not depend on the column order of the frame.
X = my_dataframe.loc[:, nan_list]
y = my_dataframe.loc[:, 'Label']
validation_size = 0.20
train_size = int(len(X.index) * 0.7)

print(len(y))
print(train_size)
# Chronological 70/30 split by position. iloc is half-open, so the two parts
# are disjoint; the earlier label-based .loc[0:train_size] / .loc[train_size:]
# slices were both inclusive and put row `train_size` in train AND test.
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
print('Observations: %d' % (len(X.index)))
print('X Training Observations: %d' % (len(X_train.index)))
print('X Testing Observations: %d' % (len(X_test.index)))
print('y Training Observations: %d' % (len(y_train)))
print('y Testing Observations: %d' % (len(y_test)))

num_folds = 10
scoring = 'accuracy'
# (display name, unfitted estimator) pairs evaluated by the following cells.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('K-Neighbors Classifier', KNeighborsClassifier()),
    ('Gaussian NB', GaussianNB()),
    ('SVM', SVC()),
    ('Random Forest Classifier', RandomForestClassifier(n_estimators=50)),
    ('Gaussian Process Classifier', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('Decision Tree Classifier', DecisionTreeClassifier(max_depth=3)),
    ('Ada Boost Classifier', AdaBoostClassifier()),
    ('MLP Classifier', MLPClassifier(alpha=1)),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('Ridge Classifier', RidgeClassifier()),
]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error

In [None]:
# Hold-out accuracy of every candidate model; names[i] pairs with results[i].
results = []
names = []

for name, model in models:
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accu_score = accuracy_score(y_test, y_pred)
    # Record the score: both lists were declared but never filled before.
    names.append(name)
    results.append(accu_score)
    print(name + ": " + str(accu_score))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.ensemble import BaggingClassifier

# Fresh, unfitted base estimators (same construction order as before).
lr, knn, nb, svm = LogisticRegression(), KNeighborsClassifier(), GaussianNB(), SVC()
rf = RandomForestClassifier(n_estimators=50)
dt = DecisionTreeClassifier(max_depth=3)
ada, mlp = AdaBoostClassifier(), MLPClassifier(alpha=1)
qda, rc = QuadraticDiscriminantAnalysis(), RidgeClassifier()

clf_array = [lr, knn, nb, svm, rf, dt, ada, mlp, qda, rc]

# Compare each estimator on its own against the same estimator wrapped in a
# BaggingClassifier, both scored with 10-fold cross-validation on the
# training split.
for clf in clf_array:
    clf_name = clf.__class__.__name__

    vanilla_scores = cross_val_score(clf, X_train, y_train, cv=10, n_jobs=-1)

    bagging_clf = BaggingClassifier(clf, max_samples=0.4, max_features=2)
    bagging_scores = cross_val_score(bagging_clf, X_train, y_train, cv=10, n_jobs=-1)

    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(
        clf_name, vanilla_scores.mean(), vanilla_scores.std()))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(
        clf_name, bagging_scores.mean(), bagging_scores.std()))

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs taking part in the hard-voting ensemble.
voting_members = [
    ('LogisticRegression', lr),
    ('KNeighborsClassifier', knn),
    ('GaussianNB', nb),
    ('SVC', svm),
    ('RandomForestClassifier', rf),
    ('DecisionTreeClassifier', dt),
    ('AdaBoostClassifier', ada),
    ('MLPClassifier', mlp),
    ('QuadraticDiscriminantAnalysis', qda),
    ('RidgeClassifier', rc),
]
eclf_array = VotingClassifier(estimators=voting_members, voting='hard')

# Cross-validate every individual classifier, then the ensemble itself.
score_labels = [member_name for member_name, _ in voting_members] + ['Ensemble']
for clf, label in zip(clf_array + [eclf_array], score_labels):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier

# mlxtend's hard-voting ensemble over the same base classifiers.
eclf = EnsembleVoteClassifier(clfs=clf_array, voting='hard')
labels = [
    'LogisticRegression',
    'KNeighborsClassifier',
    'GaussianNB',
    'SVC',
    'RandomForestClassifier',
    'DecisionTreeClassifier',
    'AdaBoostClassifier',
    'MLPClassifier',
    'QuadraticDiscriminantAnalysis',
    'RidgeClassifier',
    'Ensemble',
]

# 10-fold cross-validated accuracy for each member and for the ensemble.
candidates = [*clf_array, eclf]
for clf, label in zip(candidates, labels):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

In [None]:
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

# Level-0 learners for the stacked model (numbering has no clf7).
clf1 = RidgeClassifier()
clf3 = KNeighborsClassifier()
clf2 = RandomForestClassifier()
clf4 = LogisticRegression()
clf5 = GaussianNB()
clf6 = SVC()
clf8 = DecisionTreeClassifier(max_depth=3)
clf9 = AdaBoostClassifier()
clf10 = MLPClassifier(alpha=1)
clf11 = QuadraticDiscriminantAnalysis()

# Level-1 (meta) learner. Note that this rebinds the notebook-level name `lr`.
lr = LogisticRegression()
base_learners = [clf1, clf2, clf3, clf4, clf5, clf6, clf8, clf9, clf10, clf11]
sclf = StackingClassifier(classifiers=base_learners, meta_classifier=lr)

# Hyper-parameter grid searched with 5-fold cross-validation.
params = {
    'kneighborsclassifier__n_neighbors': [1, 5],
    'randomforestclassifier__n_estimators': [10, 50],
    'meta-logisticregression__C': [0.1, 10.0],
}

grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, refit=True)
grid.fit(X_train, y_train)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

# One line per parameter combination: mean score, half the std, parameters.
per_candidate = zip(*(grid.cv_results_[key] for key in cv_keys))
for mean_score, std_score, candidate_params in per_candidate:
    print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, candidate_params))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

In [None]:
import numpy as np

In [None]:
# Inspect the training features: dimensions, container type, first rows.
for summary in (X_train.shape, type(X_train), X_train.head()):
    print(summary)

# Plain NumPy copy of the training features.
Xnpy = np.array(X_train)

In [None]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [None]:
# One-hot encode the training labels. num_classes is pinned to 2 so the width
# always matches the 2-unit softmax output layer of the network, instead of
# being inferred from whichever label values happen to be present.
train_y = to_categorical(y_train, num_classes=2)

In [None]:
# Start from a clean Keras graph/session before building the network.
K.clear_session()

# Fully connected classifier: 2 inputs -> 128 -> 64 -> 32 ReLU units ->
# 2-way softmax.
model = Sequential([
    Dense(128, input_shape=(2, ), activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 100 epochs, with 20% of the training data passed as the
# validation split.
model.fit(
    Xnpy,
    train_y,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# NumPy copy of the test features plus one-hot test labels. num_classes is
# pinned to 2 so the label width matches the network's 2-unit softmax output
# even if the test split happened to contain a single class.
testnpy = np.array(X_test)
test_y = to_categorical(y_test, num_classes=2)

In [None]:
# Evaluate on the held-out test arrays; the returned metrics are shown as the
# cell output.
test_metrics = model.evaluate(testnpy, test_y)
test_metrics