# 1. Modeling

Bron: https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df_text = pd.read_csv("../data/aapl_us_equities_news_proc_text.csv")

## 1.1 Split data

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_text["text"], df_text["target"], random_state=42)

## 1.2 Count Vectorizer

In [4]:
ct_vectorizer = CountVectorizer() # Mooiere verdeling, slechtere accuracy

### 1.2.1 Transform data

In [5]:
X_train_ct = ct_vectorizer.fit_transform(X_train)
X_test_ct = ct_vectorizer.transform(X_test)

### 1.2.2 Train classifier

In [6]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_ct, y_train)

### 1.2.3 Evaluate classifier

In [7]:
y_test_pred = naive_bayes_classifier.predict(X_test_ct)

In [8]:
print("Accuracy: %0.3f" % metrics.accuracy_score(y_test, y_test_pred))

Accuracy: 0.446


In [9]:
print("Report:")
print(metrics.classification_report(y_test, y_test_pred, target_names = ["Positive", "Negative"]))

Report:
              precision    recall  f1-score   support

    Positive       0.44      0.83      0.57       196
    Negative       0.48      0.13      0.20       239

    accuracy                           0.45       435
   macro avg       0.46      0.48      0.39       435
weighted avg       0.46      0.45      0.37       435



In [10]:
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

Confusion Matrix:
[[163  33]
 [208  31]]


## 1.3 TF-IDF Vectorizer

In [11]:
tf_vectorizer = TfidfVectorizer() # Alles wordt negatief predict

### 1.3.1 Transform data

In [12]:
X_train_tf = tf_vectorizer.fit_transform(X_train)
X_test_tf = tf_vectorizer.transform(X_test)

### 1.3.2 Train classifier

In [13]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)

### 1.3.3 Evaluate classifier

In [14]:
y_test_pred = naive_bayes_classifier.predict(X_test_tf)

In [15]:
print("Accuracy: %0.3f" % metrics.accuracy_score(y_test, y_test_pred))

Accuracy: 0.547


In [16]:
print("Report:")
print(metrics.classification_report(y_test, y_test_pred, target_names = ["Positive", "Negative"]))

Report:
              precision    recall  f1-score   support

    Positive       0.48      0.06      0.10       196
    Negative       0.55      0.95      0.70       239

    accuracy                           0.55       435
   macro avg       0.51      0.50      0.40       435
weighted avg       0.52      0.55      0.43       435



In [17]:
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

Confusion Matrix:
[[ 11 185]
 [ 12 227]]


### 1.3.4 Important words

In [18]:
# print("Woorden", tf_vectorizer.get_feature_names_out())

feature_array = np.array(tf_vectorizer.get_feature_names_out())
tf_sorting = np.argsort(X_test_tf.toarray()).flatten()[::-1]

n = 20
top_n = feature_array[tf_sorting][:n]

print("Important Words: ", top_n)

Important Words:  ['apple' 'charge' 'say' 'electric' 'chip' 'new' 'hike' 'generation' 'high'
 'eu' 'tensor' 'company' 'fund' 'home' 'saft' 'wave' 'advanced'
 'intangible' 'datum' 'rate']
