In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Doc2Vec

In [4]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
import string

In [6]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score,precision_score,recall_score,f1_score

In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
news_data = pd.read_csv("Data/news_data.csv")

In [9]:
news_data.head()

Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,1697909459,Humanoid robots face a major test with Amazon'...,123320327,https://techcrunch.com/wp-content/uploads/2023...,AMZN,Yahoo,Announced amid a deluge of news at this week’s...,https://finnhub.io/api/news?id=8cb32870bb15be8...
1,company,1697896800,"3 Stocks That Turned $1,000 Into $1.1 Million ...",123315781,,MNST,Yahoo,Consumer products have proven themselves to be...,https://finnhub.io/api/news?id=d75d610543b91f1...
2,company,1697896800,The U.S. Cities With the Most Cutting-Edge Tec...,123315249,https://s.yimg.com/ny/api/res/1.2/kLEZByvMTM7D...,WMT,Yahoo,The Seattle area has the highest proportion of...,https://finnhub.io/api/news?id=faca644c2f7e4e6...
3,company,1697895295,Dow Jones Futures: Market Point Break? Microso...,123315250,https://media.zenfs.com/en/ibd.com/ec8365e261a...,MSFT,Yahoo,"Microsoft, Meta and Google lead a massive wave...",https://finnhub.io/api/news?id=6c9b9cc8fb7f9ec...
4,company,1697890800,Rivian (NASDAQ:RIVN): Wall Street Loves This E...,123319798,,TSLA,TipRanks,Looking for stock market analysis and research...,https://finnhub.io/api/news?id=fe0b89552f14e89...


In [10]:
news_data.drop(columns=['category','image','url','id','source'], inplace=True)

In [11]:
news_data.columns

Index(['datetime', 'headline', 'related', 'summary'], dtype='object')

In [12]:
news_data.shape

(757, 4)

In [13]:
news_data.head()

Unnamed: 0,datetime,headline,related,summary
0,1697909459,Humanoid robots face a major test with Amazon'...,AMZN,Announced amid a deluge of news at this week’s...
1,1697896800,"3 Stocks That Turned $1,000 Into $1.1 Million ...",MNST,Consumer products have proven themselves to be...
2,1697896800,The U.S. Cities With the Most Cutting-Edge Tec...,WMT,The Seattle area has the highest proportion of...
3,1697895295,Dow Jones Futures: Market Point Break? Microso...,MSFT,"Microsoft, Meta and Google lead a massive wave..."
4,1697890800,Rivian (NASDAQ:RIVN): Wall Street Loves This E...,TSLA,Looking for stock market analysis and research...


In [14]:
print(news_data['datetime'].dtype)

int64


In [15]:
news_data['y'] = 0

In [16]:
from fetch_stock_data_for_date import StockData

In [17]:
sd = StockData()

In [18]:
errors = []

In [19]:
def get_stock_data(x):
    try:
        return sd.get_delta(x['datetime'], x['related'])
    except ValueError as ve:
        print(ve)
        errors.append(x)
        return None

In [20]:
news_data['y'] = news_data.apply(lambda x : get_stock_data(x),axis=1)

In [21]:
news_data['y'].describe()

count    757.000000
mean      -1.324373
std        5.709280
min     -136.899902
25%       -2.040001
50%       -0.550003
75%        0.190001
max        7.330002
Name: y, dtype: float64

In [22]:
news_data.head()

Unnamed: 0,datetime,headline,related,summary,y
0,1697909459,Humanoid robots face a major test with Amazon'...,AMZN,Announced amid a deluge of news at this week’s...,-2.880005
1,1697896800,"3 Stocks That Turned $1,000 Into $1.1 Million ...",MNST,Consumer products have proven themselves to be...,-0.420002
2,1697896800,The U.S. Cities With the Most Cutting-Edge Tec...,WMT,The Seattle area has the highest proportion of...,-2.070007
3,1697895295,Dow Jones Futures: Market Point Break? Microso...,MSFT,"Microsoft, Meta and Google lead a massive wave...",-5.049988
4,1697890800,Rivian (NASDAQ:RIVN): Wall Street Loves This E...,TSLA,Looking for stock market analysis and research...,-5.019989


In [23]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [24]:
def remove_stop_words_and_puncts(words):
    f1 = [w.lower() for w in words if w.lower() not in stop_words]
    f2 = [w for w in f1 if w not in string.punctuation]
    return f2

def process_text(text):
    words = word_tokenize(text)
    clean_words = remove_stop_words_and_puncts(words)
    clean_words = [w for w in clean_words if w.isalpha()]  # Remove non-alpha text
    lemmatized_words = [
        lemmatizer.lemmatize(w) for w in clean_words
    ]  # Lemmatization
    return lemmatized_words

def process_text_as_string(text):
    lemma_words = process_text(text)
    
    if len(lemma_words) >= 3:
        return " ".join(lemma_words)
    else:
        return None

In [29]:
modded = news_data['summary'].apply(lambda x : process_text_as_string(str(x)))

In [30]:
news_data['pt'] = modded

In [32]:
news_data = news_data[news_data['pt'].notna()]

In [33]:
# Initializing TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True)

In [34]:
# Fit and transform the processed text data
vectorized_data = vectorizer.fit_transform(news_data['summary'])

In [35]:
# # Convert the sparse matrix to an array for inspection (if needed)
# vectorized_data_array = vectorized_data.toarray()

In [37]:
# Example: Get the feature names
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['070', '0981', '10', ..., 'zero', 'zhou', 'zuckerberg'],
      dtype=object)

In [59]:
# Example: Print the vectorized data
vectorized_data

<724x3846 sparse matrix of type '<class 'numpy.float64'>'
	with 19563 stored elements in Compressed Sparse Row format>

In [61]:
vectorized_data.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
# Preparing test and train data sets
print(vectorized_data.shape)

(724, 3846)


In [40]:
labels = news_data['y'].to_numpy()

In [47]:
labels = labels > 0 

In [49]:
labels = labels.astype(int)

In [50]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

In [51]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, labels, test_size=0.2, random_state=42)

In [52]:
print(X_train.shape)
print(y_train.shape)

(579, 3846)
(579,)


In [53]:
print(X_test.shape)
print(y_test.shape)

(145, 3846)
(145,)


In [54]:
# Initialize classifiers
classifiers = {
    'SVM_Linear': SVC(kernel='linear'),
    'SVM_poly': SVC(kernel='poly', degree=3),
    'SVM_rbf': SVC(kernel='rbf', C=1.0, gamma='scale'),
    'SVM_sigmoid': SVC(kernel='sigmoid'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
}

In [57]:
# Train and evaluate each classifier
output_data = []
for clf_name, clf in classifiers.items():
    # Train the model on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)*100
    # precision = precision_score(y_test,predictions)*100
    # recall = recall_score(y_test,predictions)*100
    # f1 = f1_score(y_test,predictions) * 100
    print(f"{clf_name} Accuracy: {accuracy:.2f}%")
    # print(f"{clf_name} Precision: {precision:.2f}%")
    # print(f"{clf_name} Recall: {recall:.2f}%")
    # print(f"{clf_name} F1 score: {f1:.2f}%")

SVM_Linear Accuracy: 71.72%
SVM_poly Accuracy: 70.34%
SVM_rbf Accuracy: 70.34%
SVM_sigmoid Accuracy: 71.72%
Random Forest Accuracy: 69.66%
Naive Bayes Accuracy: 71.03%
K-Nearest Neighbors Accuracy: 73.10%
Logistic Regression Accuracy: 71.03%
Gradient Boosting Accuracy: 65.52%
