# 1. Data Marking

In [1]:
import re

from IPython.display import clear_output

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
STOCK_NEWS_DATASET_FILE = "../data/aapl_us_equities_news.csv"

TEXT_BOLD = "\033[1m"
TEXT_HIGHLIGHT = "\033[33m\033[40m"
TEXT_END = "\033[0m"

## 1.1 Utility functions

In [3]:
def filter_documents(documents, expressions):
    expression = "|".join(expressions)

    return documents[
        documents["title"].str.contains(expression, regex=True)
        | documents["content"].str.contains(expression, regex=True)
    ]


def highlight_text(text, expressions):
    highlighted_text = text

    for expression in expressions:
        highlighted_text = re.sub(expression, TEXT_HIGHLIGHT + "\\1" + TEXT_END, highlighted_text)

    return highlighted_text


def ask_question(question):
    answer = input(question).strip()

    if answer not in ["y", "Y", "n", "N"]:
        ask_question(question)

    return answer in ["y", "Y"]


def mark_documents(documents, expressions):
    filtered_documents = filter_documents(documents, expressions)
    total_documents = len(filtered_documents)
    current_document = 1

    marked_indices = []

    for idx, document in filtered_documents.iterrows():
        highlighted_title = highlight_text(document.title, expressions)
        hightighted_content = highlight_text(document.content, expressions)
        
        print(TEXT_BOLD + "Progress: " + TEXT_END + f"{current_document}/{total_documents}")
        print("")
        print(TEXT_BOLD + "Id: " + TEXT_END + f"{document.id}")
        print(TEXT_BOLD + "Index: " + TEXT_END + f"{idx}")
        print("-" * 20)
        print(TEXT_BOLD + "Title: " + TEXT_END)
        print(highlighted_title)
        print("")
        print(TEXT_BOLD + "Content: " + TEXT_END)
        print(hightighted_content)
        print("-" * 20)

        answer = ask_question("Mark this document? [y/n]: ")
        if answer:
            marked_indices.append(idx)

        current_document += 1
        clear_output()

    return filtered_documents.loc[marked_indices]

## 1.2 Load data

In [4]:
df_aapl_us_equities_news = pd.read_csv(STOCK_NEWS_DATASET_FILE)

## 1.3 Subset data

In [5]:
df_aapl_us_equities_news = df_aapl_us_equities_news[df_aapl_us_equities_news["ticker"] == "AAPL"]

## 1.4 Mark data

In [6]:
# NOTE:
# The documents will automatically be filtered using these regular expressions,
# checking both the title and the content for a match. Each regular expression
# must be wrapped in a capture group to allow the mark function to highlight
# the relevant words. Documents are matched using contains this means that as
# long as one of the expressions matches on at least one word, the document is
# included in the "to review" selection.
#
# The mark function will prompt you to include a document in the "marked"
# selection [y] or continue with the next one [n]. After going through all the
# documents in the through review selection a dataframe with the "marked"
# selection is returned.
#
# Capture group: (...)
#
# Expressions:
#
#   * ([Ff]ruit): matches all documents that contain "fruit" or "Fruit".
#
#   * (apple(?!\W+AAPL)): matches all documents that contain "apple" without
#     "AAPL" behind, so "apple       AAPL" is not matched.
#
expressions = [
    r"([Ff]ruit)",
    r"(apple(?!\W+AAPL))",
]

marked_documents = mark_documents(df_aapl_us_equities_news, expressions)

  documents["title"].str.contains(expression, regex=True)
  | documents["content"].str.contains(expression, regex=True)


[1mProgress: [0m1/258

[1mId: [0m290746
[1mIndex: [0m11248
--------------------
[1mTitle: [0m
Apple  Institutions Locking Their Profit  Should You Worry 

[1mContent: [0m
Tech Sector giant is facing huge pressure from bears who are selling the stock as there is no tomorrow  However according to Morgan Stanley  MS  analysts they are not buying into [33m[40mapple[0m bear trend will continue much longer because company s issues in relation to its product supplies and as well as margins will be sorted soon According to Wall Street journal s report Google s  GOOG  new mapping application will work on [33m[40mapple[0m I phones  ipads  and Google is planning to file this to iTunes store Company has paid its second dividend payment of  2 5 per share or  2 5 billion dollar for its 935 million outstanding shares However  as Apple   AAPL  is the most widely held stock by institutions and is part of over 800 hedge funds so it is believed that recent sell off is more of locking in p

KeyboardInterrupt: Interrupted by user

## 1.5 Show data

In [None]:
marked_documents