In [None]:
# CA 02
# Module: B9DA103 Data Mining
# Group B
# Subramaniam Kazhuparambil (10524303)
# Rahul Ramchandra Uppari (10523807)
# Mukund Bulchandani (10525778)

In [1]:
# Libraries used

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the combined headlines/label table from the notebook's working directory

news_data = pd.read_csv("Combined_News_DJIA.csv")

# Sanity check: print the frame's shape first, then its leading rows
for preview in (news_data.shape, news_data.head()):
    print(preview)

(1989, 27)
         Date  Label                                               Top1  \
0  2008-08-08      0  b"Georgia 'downs two Russian warplanes' as cou...   
1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   
2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   
3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   
4  2008-08-14      1  b'All the experts admit that we should legalis...   

                                                Top2  \
0            b'BREAKING: Musharraf to be impeached.'   
1        b'Bush puts foot down on Georgian conflict'   
2                 b"Russia 'ends Georgia operation'"   
3  b"When the president ordered to attack Tskhinv...   
4  b'War in South Osetia - 89 pictures made by a ...   

                                                Top3  \
0  b'Russia Today: Columns of troops roll into So...   
1  b"Jewish Georgian minister: Thanks to Israeli ...   
2  b'"If we had no sexual harassm

In [3]:
# Combining all the headline columns into a single attribute "merged_news"
#
# Each row's Top* headlines are joined with single spaces into one document.
# The join works on the actual cell values rather than on str(row.values):
# stringifying the NumPy array would put array brackets, quote characters,
# line-wrap newlines and a literal "nan" (for missing cells) into the text.
# Missing headlines, if any, contribute an empty string instead.

headline_cols = news_data.filter(regex=r"^Top")
news_data["merged_news"] = headline_cols.fillna("").astype(str).apply(" ".join, axis=1)
print(news_data.head())

         Date  Label                                               Top1  \
0  2008-08-08      0  b"Georgia 'downs two Russian warplanes' as cou...   
1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   
2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   
3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   
4  2008-08-14      1  b'All the experts admit that we should legalis...   

                                                Top2  \
0            b'BREAKING: Musharraf to be impeached.'   
1        b'Bush puts foot down on Georgian conflict'   
2                 b"Russia 'ends Georgia operation'"   
3  b"When the president ordered to attack Tskhinv...   
4  b'War in South Osetia - 89 pictures made by a ...   

                                                Top3  \
0  b'Russia Today: Columns of troops roll into So...   
1  b"Jewish Georgian minister: Thanks to Israeli ...   
2  b'"If we had no sexual harassment we woul

In [4]:
# Turn the merged headline text into a TF-IDF weighted bag-of-words matrix:
# English stop words removed, unigrams and bigrams as terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# further down, so the IDF statistics also see the held-out rows — confirm this is acceptable.

tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", use_idf=True)
headline_corpus = news_data["merged_news"].values
X = tfidf_vec.fit_transform(headline_corpus)

In [5]:
# Importing the stock data

stock_data = pd.read_csv("upload_DJIA_table.csv")

# Reverse the row order and renumber the index so that rows pair up positionally
# with news_data (presumably the stock file is newest-first while the news file
# is oldest-first — confirm against the CSVs).
stock_data = stock_data.iloc[::-1].reset_index(drop=True)

# The two feature sources are stacked row-by-row below, so fail loudly on any
# misalignment instead of silently pairing headlines with another day's prices.
assert len(stock_data) == len(news_data), "stock and news tables have different row counts"
if "Date" in stock_data.columns:
    assert np.array_equal(stock_data["Date"].values, news_data["Date"].values), \
        "stock and news rows are not aligned by Date"

# NOTE(review): columns 1:4 are selected by position — presumably Open/High/Low; confirm.
# If so, these are same-day prices for the day the label describes (possible leakage),
# and their unscaled magnitude will dominate distance-based models over the TF-IDF terms.
stock_features = stock_data.iloc[:, 1:4].values

# Append the numeric stock columns to the right of the TF-IDF matrix. CSR is requested
# explicitly so the combined matrix supports the row indexing used when sampling.
feature_set = hstack([X, stock_features], format="csr")

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable

labels = news_data["Label"].values
x_train, x_test, y_train, y_test = train_test_split(
    feature_set,
    labels,
    test_size=0.30,
    random_state=8,
)

In [7]:
# K-Nearest Neighbors (k = 5): fit on the training split, score on the held-out split

neigh = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
predict_knn = neigh.predict(x_test)
knn_acc = accuracy_score(y_test, predict_knn)

# Report the hold-out accuracy as a percentage
knn_pct = str(knn_acc * 100)
print("Accuracy: " + knn_pct + "%")

Accuracy: 73.70184254606366%


In [8]:
# Since KNN gave the best accuracy,
# using K-Fold Cross Validation to validate that the results were not obtained by chance
#
# KFold is used without shuffling, so the folds are ten contiguous blocks of rows.
# No random_state is passed: it has no effect unless shuffle=True, and recent
# scikit-learn versions raise a ValueError when it is set on an unshuffled KFold.

kfold_model = KFold(n_splits=10)
kfold_result = cross_val_score(neigh, feature_set, news_data["Label"].values, cv=kfold_model)

# Mean accuracy across the ten folds, as a percentage
print("Accuracy: " + str(kfold_result.mean()*100.0))



Accuracy: 70.19313740419267


In [9]:
# Support Vector Machine with the library's default settings:
# fit on the training split, score on the held-out split

svm_model = SVC().fit(x_train, y_train)
predict_svm = svm_model.predict(x_test)
svm_acc = accuracy_score(y_test, predict_svm)

# Report the hold-out accuracy as a percentage
svm_pct = str(svm_acc * 100)
print("Accuracy: " + svm_pct + "%")

Accuracy: 55.778894472361806%


In [10]:
# Implementing Decision Tree to predict the class label
#
# The tree builder is randomised, so a fixed random_state is passed to make the
# reported accuracy repeatable across Restart & Run All.

dtree = tree.DecisionTreeClassifier(random_state=8)
dtree.fit(x_train, y_train)
predict_dtree = dtree.predict(x_test)
dtree_acc = accuracy_score(y_test, predict_dtree)

# Report the hold-out accuracy as a percentage
print("Accuracy: " + str(dtree_acc*100) + "%")

Accuracy: 49.246231155778894%


In [11]:
# Implementing Random Forest to predict the class label
#
# A forest of 1000 trees. Bootstrapping and feature sampling are random, so a fixed
# random_state is passed to make the reported accuracy repeatable across runs.

rf = RandomForestClassifier(n_estimators=1000, random_state=8)
rf.fit(x_train, y_train)
predict_rf = rf.predict(x_test)
rf_acc = accuracy_score(y_test, predict_rf)

# Report the hold-out accuracy as a percentage
print("Accuracy: " + str(rf_acc*100) + "%")

Accuracy: 52.26130653266332%


In [12]:
# Collect each model's hold-out accuracy as (label, accuracy) pairs, in the order the models were run

results = [
    ("KNN", knn_acc),
    ("SVM", svm_acc),
    ("CART", dtree_acc),
    ("RF", rf_acc),
]

In [13]:
# Show the collected (model label, hold-out accuracy) pairs; accuracies are fractions, not percentages
print(results)

[('KNN', 0.7370184254606366), ('SVM', 0.5577889447236181), ('CART', 0.49246231155778897), ('RF', 0.5226130653266332)]
