# Single Stock Summary Generation
## Get Data

In [1]:
import os
import numpy as np
import pandas as pd
# Progress bar
from tqdm.notebook import tqdm as pbar
from tqdm.auto import tqdm as pdbar

# Pandas tqdm initialization: registers tqdm's progress_* methods
# (e.g. progress_apply) on pandas objects.
# NOTE(review): no cell visible here calls those methods — confirm this is still needed.
pdbar.pandas()

# Get rid of warnings 
import warnings

def set_index_no_warnings(df):
    """Parse the string index of ``df`` into calendar dates, silencing parser warnings.

    Despite the name, this does not modify ``df``: it only returns the parsed
    dates so the caller can pass them to ``DataFrame.set_index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds date strings. Any "—" characters and "EST"
        markers are removed and surrounding whitespace is stripped before parsing.

    Returns
    -------
    numpy.ndarray
        ``datetime.date`` objects (time of day discarded), one per row, in row order.
    """
    with warnings.catch_warnings():
        # Date parsing with per-element format inference can emit these; keep the output clean.
        for noisy_category in (UserWarning, FutureWarning):
            warnings.simplefilter("ignore", category=noisy_category)

        labels = df.index.str.replace("—", "")
        labels = labels.str.replace("EST", "")
        labels = labels.str.strip()

        # format="mixed": infer the format of every element individually.
        timestamps = pd.to_datetime(labels, format="mixed")
        return timestamps.date


# Ticker
tick = "AAPL"

# Input data directory
raw = "../news_raw"

# Load data: read only the date and text columns of <raw>/<ticker>.csv,
# then clean the index step by step.
df = (
    pd.read_csv(
        os.path.join(raw, f"{tick.lower()}.csv"),
        usecols=["Date", "Text"],
        index_col="Date",
    )
    .rename(columns={"Text": "Article"})
    # Drop rows whose date is the placeholder 0 (numeric or string form)
    .query("index != 0 and index != '0'")
    # Swap the raw string index for parsed calendar dates
    .pipe(lambda frame: frame.set_index(set_index_no_warnings(frame)))
)

df

Unnamed: 0,Article
2024-01-25,"For Immediate Release\nChicago, IL – January 2..."
2024-02-03,Thanks to its lineup of incredibly popular har...
2024-02-03,"In this podcast, Motley Fool host Dylan Lewis ..."
2024-02-03,Shares in Advanced Micro Devices (NASDAQ: AMD)...
2024-02-03,Investing in the stock market is one of the mo...
...,...
2024-02-03,"In this podcast, Motley Fool host Dylan Lewis ..."
2024-02-03,Shares in Advanced Micro Devices (NASDAQ: AMD)...
2024-02-03,Investing in the stock market is one of the mo...
2024-02-03,"Since its price cratered 65% in 2022, Bitcoin ..."


## Summarization

In [2]:
# If sentence tokenization fails because the NLTK "punkt" data is missing, run nltk.download("punkt") once in this environment
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Summary length in sentences:
summary_len = 20

# Summarizer initialization: output column name -> summarizer instance.
# Each summarizer gets its own English stemmer; insertion order fixes the column order.
summarizers = {}
summarizers["Lsa_summary"] = LsaSummarizer(Stemmer("english"))
summarizers["Luhn_summary"] = LuhnSummarizer(Stemmer("english"))
summarizers["Textrank_summary"] = TextRankSummarizer(Stemmer("english"))
summarizers["Lexrank_summary"] = LexRankSummarizer(Stemmer("english"))

def summarize(text, date):
    """Run every configured summarizer over one article.

    Parameters
    ----------
    text : str
        Article body; parsed as plain text with an English tokenizer.
    date :
        Value stored unchanged under the "Date" key of the result.

    Returns
    -------
    dict
        One entry per key of ``summarizers`` holding whatever that summarizer
        returns for ``summary_len`` sentences, plus "Date".
        NOTE(review): the summarizers presumably return sequences of sumy
        sentence objects rather than plain strings — confirm before relying
        on how these values serialize.
    """
    # Parse article
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    # Run through summarizers
    summaries = {}
    for name, bot in summarizers.items():
        summaries[name] = bot(parser.document, summary_len)

    summaries["Date"] = date
    return summaries

In [None]:
# For parallel processing
from concurrent.futures import ThreadPoolExecutor

# Column names of the summary outputs (same names as the keys of `summarizers`)
summ_bots = [
    "Lsa_summary",
    "Luhn_summary",
    "Textrank_summary",
    "Lexrank_summary",
]

# Keep things from crashing
def helper(text, date):
    """Summarize one article without letting a single failure abort the batch.

    Parameters
    ----------
    text :
        Article body passed to ``summarize``.
    date :
        Date of the article, passed to ``summarize`` and kept in the fallback.

    Returns
    -------
    dict
        The result of ``summarize(text, date)`` on success. On any exception,
        a record with ``None`` for every name in ``summ_bots`` and the
        article's "Date".
    """
    try:
        return summarize(text, date)
    except Exception as exc:
        print("ERROR ERROR ERROR")
        # Show the exception that was actually raised (the original printed
        # the `Exception` class itself, which carries no information).
        print(repr(exc))
        # Keep the same keys as a successful result, including "Date", so the
        # failed row still has a valid date when results are indexed by "Date".
        fallback = {name: None for name in summ_bots}
        fallback["Date"] = date
        return fallback

# Parallelize processing. executor.map yields results in input order, so
# results[i] is the summary record for the i-th row of df.
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(helper, df["Article"], df.index))

# Make dataframe of results (this replaces the article column with the summaries).
# Rows are labelled positionally with df.index rather than through each record's
# "Date" entry: the dates contain duplicates, and a failed article may not carry
# a usable date, either of which breaks label-based alignment.
summary_frame = pd.DataFrame(results, index=df.index).drop(columns=["Date"], errors="ignore")

# Remove empty results: rows for which every summarizer produced nothing.
df = summary_frame.dropna(how="all")

df

## Export

In [None]:
# Sort values: newest dates first. Reassign rather than sort in place so the
# frame produced by the previous cell is not mutated behind its displayed output.
df = df.sort_index(ascending=False)

In [None]:
# Make output directory (one per ticker); reuse out_dir so the created path
# and the path written to later cannot drift apart.
out_dir = f"../{tick.lower()}_data"
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Export to csv: write the summaries into the ticker's output directory
summaries_csv = os.path.join(out_dir, "summaries.csv")
df.to_csv(summaries_csv)