# 1. Preprocessing

Bron: https://www.kaggle.com/code/small34/nlp-final-tfidf

In [1]:
from functools import reduce

import re

from nltk.tokenize import word_tokenize
from nltk.corpus import words
from bs4 import BeautifulSoup
from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

In [2]:
tqdm.pandas()

In [3]:
df_data = pd.read_csv("../data/aapl_us_equities_news_proc_data.csv")

In [4]:
nlp = spacy.load("en_core_web_sm")

## 1.1 Consolidate text

Concats every article and title together per day

In [5]:
def concat(series):
    return reduce(lambda x, y: x + " " + y, series)


# Group by date
df_data = df_data.groupby("date").agg({"title": concat, "content": concat, "target": "first"})

# Concat title and content
df_data["text"] = df_data["title"] + " " + df_data["content"]

# Remove index
df_data = df_data.reset_index()

# Select columns
df_data = df_data[["text", "target"]]

## 1.2 Remove HTML tags

Removes HTML tags using `beautifulsoup`

In [6]:
def remove_html_tags(text):
    """
    Remove html tags from text.
    """
    soup = BeautifulSoup(text, "html.parser")

    stripped_text = soup.get_text(separator=" ")

    return stripped_text


df_data["text"] = df_data.progress_apply(lambda row : remove_html_tags(row["text"]), axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1740/1740 [00:00<00:00, 4421.25it/s]


## 1.3 Remove EOL characters

Removes characters like: `\n`, `\t`, `\r`

In [7]:
def remove_eol_characters(text):
    """
    Remove EOL characters from text.
    """
    return re.sub(r"(\n|\t|\r)", "", text)


df_data["text"] = df_data.progress_apply(lambda row: remove_eol_characters(row["text"]), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1740/1740 [00:00<00:00, 5505.00it/s]


## 1.4 Remove other characters

Removes all characters except: `a-z`, `A-Z`, `À-ȕ`

In [8]:
def remove_other_characters(text):
    """
    Remove other characters from text.
    """
    return re.sub(r"[^a-zA-ZÀ-ȕ]", " ", text)


df_data["text"] = df_data.progress_apply(lambda row: remove_other_characters(row["text"]), axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1740/1740 [00:02<00:00, 707.61it/s]


## 1.5 Remove excessive spaces

Removes excessive spaces in some documents

In [9]:
def remove_excessive_spaces(text):
    """
    Remove excessive spaces from text.
    """
    return re.sub(r"\s{2,}", " ", text)


df_data["text"] = df_data.progress_apply(lambda row: remove_excessive_spaces(row["text"]), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1740/1740 [00:01<00:00, 1608.77it/s]


## 1.6 Convert to lowercase

Converts all characters to lowercase

In [10]:
def convert_to_lowercase(text):
    """
    Convert to lowercase.
    """
    return text.lower()


df_data["text"] = df_data.progress_apply(lambda row: convert_to_lowercase(row["text"]), axis=1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1740/1740 [00:00<00:00, 35206.47it/s]


## 1.7 Remove non-dictionary words, single-character words, stopwords and lemmatize

Removes words that do not occur in the english dictionary, words that consist of one character, words that are a stopword and lemmatizes the remaining words

In [11]:
dictionary = set(words.words())


def remove_non_dictionary_single_character_stopwords_lemmatize(text):
    """
    Remove non-dictionary words, single character words, stopwords and lemmatize
    text.
    """
    return " ".join(token.lemma_ for token in nlp(text) if len(token.text) > 1 and not token.is_stop and token.text in dictionary)


# df_data["text"] = df_data.progress_apply(lambda row: remove_non_dictionary_single_character_stopwords_lemmatize(row["text"]), axis=1)

## 1.8 Other

Bron: https://www.kaggle.com/code/small34/nlp-final-tfidf

In [12]:
def lemmatize(text):
    return " ".join(token.lemma_ for token in nlp(text))


df_data.iloc[0:5].progress_apply(lambda row: lemmatize(row["text"]), axis=1).iloc[0]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10.31it/s]


'wait for direction on the market this stock market have be a difficult one for trader and investor alike the directionless choppiness have combine to create a climate of maximum frustration for many professional that I have talk to unfortunately the volatility be likely to continue and I be still wait for sign of direction the bear casefor a market analyst the bear case be easy to make cyclical leadership be weaken and defensive leadership continue to be dominant consider the chart below of the relative performance of consumer discretionary xly against the market spy this sector which have be the leader for close to two year have decline through a relative performance trendline which indicate falter relative strength the relative performance of the morgan stanley cyclical index tell an ugly story cyclical stock remain in a relative downtrend against the market with no bottom in sight if there be an economic rebound then it should show up in this sector this weakness be confirm by rece

In [13]:
from nltk.stem import WordNetLemmatizer


lemma = WordNetLemmatizer()


def lemmatize_pos(text):
    return " ".join([lemma.lemmatize(token.text, pos="n") if token.tag_.startswith("N") else
                     lemma.lemmatize(token.text, pos="v") if token.tag_.startswith("V") else
                     lemma.lemmatize(token.text, pos="a") if token.tag_.startswith("JJ") else
                     lemma.lemmatize(token.text, pos="r") if token.tag_.startswith("R") else
                     token.text for token in nlp(text)])


df_data.iloc[0:5].progress_apply(lambda row: lemmatize_pos(row["text"]), axis=1).iloc[0]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.20it/s]


'wait for direction on the market this stock market have be a difficult one for trader and investor alike the directionless choppiness have combine to create a climate of maximum frustration for many professional that i have talk to unfortunately the volatility be likely to continue and i be still wait for sign of direction the bear casefor a market analyst the bear case be easy to make cyclical leadership be weaken and defensive leadership continue to be dominant consider the chart below of the relative performance of consumer discretionary xly against the market spy this sector which have be the leader for close to two year have decline through a relative performance trendline which indicate falter relative strength the relative performance of the morgan stanley cyclical index tell an ugly story cyclical stock remain in a relative downtrend against the market with no bottom in sight if there be an economic rebound then it should show up in this sector this weakness be confirm by rece

In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import PorterStemmer


stemmer = PorterStemmer()
lemma = WordNetLemmatizer()


def stemming_lemmatize_pos(text):
    return " ".join([lemma.lemmatize(token.text, pos="n") if token.tag_.startswith("N") else
                     lemma.lemmatize(token.text, pos="v") if token.tag_.startswith("V") else
                     lemma.lemmatize(token.text, pos="a") if token.tag_.startswith("JJ") else
                     lemma.lemmatize(token.text, pos="r") if token.tag_.startswith("R") else
                     token.text for token in nlp(stemmer.stem(text))])


df_data.iloc[0:5].progress_apply(lambda row: stemming_lemmatize_pos(row["text"]), axis=1).iloc[0]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.53it/s]


'wait for direction on the market this stock market have be a difficult one for trader and investor alike the directionless choppiness have combine to create a climate of maximum frustration for many professional that i have talk to unfortunately the volatility be likely to continue and i be still wait for sign of direction the bear casefor a market analyst the bear case be easy to make cyclical leadership be weaken and defensive leadership continue to be dominant consider the chart below of the relative performance of consumer discretionary xly against the market spy this sector which have be the leader for close to two year have decline through a relative performance trendline which indicate falter relative strength the relative performance of the morgan stanley cyclical index tell an ugly story cyclical stock remain in a relative downtrend against the market with no bottom in sight if there be an economic rebound then it should show up in this sector this weakness be confirm by rece