In [44]:
from db_helper_functions import get_stock_news_from_db
import pandas as pd

In [45]:
df = get_stock_news_from_db("AAPL")

In [46]:
df = df.sort_values(by='date')

In [47]:
## The block used to find the positive and negative new

# df['date'] = pd.to_datetime(df['date'])

# start_date = '2022-01-10'  # Use your start date here
# end_date = '2022-02-01'    # Use your end date here

# # Convert your date strings to datetime if they aren't already
# start_date = pd.to_datetime(start_date)
# end_date = pd.to_datetime(end_date)

# # Filter the DataFrame
# filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]


In [48]:
negative_article = df.iloc[1]

In [49]:
raw_negative = negative_article['article']

In [50]:
positive_article = df[df['id']==3253].iloc[0]


In [51]:
raw_positive = positive_article['article']

## Section 1: Data Cleaning

### Spacy:

In [52]:
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

example_text = "Apple Inc. stock rose by 5% in early trading Thursday."
    
def clean(article:str) ->str:
    cleaned_text = []
    
    # Load the spaCy model
    nlp = spacy.load("en_core_web_sm")
    
    # Add financial-specific stopwords
    financial_stopwords = ['inc.', 'corp.', 'ltd.',  'company', 'market', 'stock']
    for word in financial_stopwords:
        STOP_WORDS.add(word)
    
    #doc = nlp(example_text)
    doc = nlp(article)
    
    # Remove basic stop words, financial stop words, punctuation, number, person name and organization name
    for token in doc:
        if not token.is_stop and not token.is_punct and not token.like_num and not token.ent_type_ == "PERSON" and not token.ent_type_ == "ORG": 
            # Lemmatization, converting token to lowercase
            cleaned_text.append(token.lemma_.lower())
        
    # OPTIONAL: We can use Entity Recognition to decide keep or discard certerain information
    # for ent in doc.ents:
    #     if ent.label_ in ['ORG', 'MONEY', 'PERCENT', 'DATE', 'FAC']:
    #         cleaned_text.append(ent.text)
            
    # cleaned article
    cleaned_text = " ".join(cleaned_text)
    
    return cleaned_text 

clean(example_text)

'rise early trading thursday'

In [53]:
cleaned_positive_article_spacy = clean(positive_article['article'])
cleaned_positive_article_spacy

'notwithstanding supply constraint analyst positive aapl december quarter performance   analyst overweight rating $ price target share cupertino likely post upside street forecast december quarter guide relatively line march quarter thank improve iphone production year end modest services outperformance say note hit low oct. share suggest december quarter upside largely price add analyst model december quarter revenue $ gross margin $ share earning management likely shed light active device iphone instal base analyst say product ship unit unit unit say analyst estimate total product revenue $ consensus estimate $ service revenue likely come $ consensus estimate $ say upside strong expect performance segment add give sustained presence management unlikely offer revenue guidance range march quarter instead likely provide commentary segment growth trend note related link apple $ cap say share underperform far january december outperformance analyst say expect release iphone se3 april year

In [54]:
cleaned_negative_article_spacy = clean(negative_article['article'])
cleaned_negative_article_spacy

'news poor performance chinese economy distressing signal foreign retailer manufacturer aapl \xa0 announcement wednesday reduce revenue expectation poor iphone sale china revenue projection estimate $ short previous estimate $ 89-$93 accord chief executive officer announcement cause fall percent wednesday evening percent thursday dow jones industrial average fall point news sale slump permeate particularly affect stock expose chinese like \xa0 de qualcomm qcom \xa0 \xa0 lose $ value reach capitalization $ october attribute weak sale china decrease traffic retail outlet country china economic slowdown spark conservative monetary policy enact central government president begin term sign tighten credit chinese bank prevent rapid growth trigger volatility effect drive consumption subsequently tax revenue beijing promise stimulus package ward current economic challenge china ongoing trade dispute united states press concern u.s. president publicly criticize american company manufacture good

### Manule Cleaning:

In [56]:
import re

def clean_special_characters(text):
    cleaned_text = text.replace("\\'", "'")
    
    # Replace non-breaking space with a regular space
    cleaned_text = cleaned_text.replace("\xa0", " ")

    return cleaned_text

def add_space_after_period(text):
    # Pattern to match a period followed directly by a letter (upper or lower case)
    pattern = r'\.([A-Za-z])'
    # Replacement pattern adds a space between the period and the letter
    replacement = r'. \1'
    # Replace occurrences of the pattern with the replacement pattern
    corrected_text = re.sub(pattern, replacement, text)
    return corrected_text

def manule_clean(text):
    text = clean_special_characters(text)
    text = add_space_after_period(text)
    
    return text

cleaned_negative_article_manule = manule_clean(negative_article['article'])
cleaned_positive_article_manule = manule_clean(positive_article['article'])


In [98]:
# Split the paragraph into sentences
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

negative_sentences = sent_tokenize(cleaned_negative_article_manule)
df_negative = pd.DataFrame(negative_sentences, columns=['sentence'])

positive_sentences = sent_tokenize(cleaned_positive_article_manule)
df_positive = pd.DataFrame(positive_sentences, columns=['sentence'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Section 2: Get Sentiment Score by using Lexicon-Based Approach with cleaned text

### FinVader:

In [111]:
# Get the sentiment score by feeding in the sentences

from finvader import finvader

df_negative['finvader_score'] = df_negative.sentence.apply(finvader,
                                   use_sentibignomics = True, 
                                   use_henry = True, 
                                   indicator="compound")

df_positive['finvader_score'] = df_positive.sentence.apply(finvader,
                                   use_sentibignomics = True, 
                                   use_henry = True, 
                                   indicator="compound")

# Only keep the sentence contains apple
df_positive = df_positive[df_positive['sentence'].str.contains('apple|Apple|APPL', case=False, regex=True)]
df_negative = df_negative[df_negative['sentence'].str.contains('apple|Apple|APPL', case=False, regex=True)]

# Ignore score lower than 0.1

df_positive = df_positive[abs(df_positive['finvader_score']) > 0.1]
df_negative = df_negative[abs(df_negative['finvader_score']) > 0.1]


negative_score = df_negative['finvader_score'].mean()
positive_score = df_positive['finvader_score'].mean()

print(f'The score for the positive article is: {positive_score}')
print(f'The score for the negative article is {negative_score}')
df_negative


The score for the positive article is: 0.217175
The score for the negative article is 0.07085555555555555


Unnamed: 0,sentence,finvader_score,afinn_score
2,Apple's announcement caused its stock to fall ...,-0.3612,0.0
3,The Dow Jones Industrial Average fell 660 poin...,-0.674,1.0
4,Apple itself has lost $300 billion in stock va...,-0.3071,-2.0
10,China's ongoing trade dispute with the United ...,0.4118,-1.0
13,"""Apple prices may increase because of the mass...",-0.1216,0.0
27,China is now the world's largest market for sm...,0.6162,0.0
29,Apple's revenue derived from sales in China re...,0.1431,1.0
30,"Since 2016, Apple sales in China have retreate...",0.3798,1.0
35,Apple has not announced plans to build factori...,0.5507,2.0


In [112]:
# Get the sentiment socre by feeding in the whole article

positive_score = finvader(cleaned_positive_article_spacy, 
                  use_sentibignomics = True, 
                  use_henry = True, 
                  indicator = 'compound')

negative_score = finvader(cleaned_negative_article_spacy, 
                  use_sentibignomics = True, 
                  use_henry = True, 
                  indicator = 'compound' )

print(f'The score for the positive article is: {positive_score}')
print(f'The score for the negative article is {negative_score}')

The score for the positive article is: 0.826
The score for the negative article is 0.9787


In [113]:
# Test if FinVader clean organization name inside the modle

text_with_org = "Apple Inc. stock rose by 5% in early trading Thursday."
text_without_org = "stock rose by 5% in early trading Thursday."

score_with_org = finvader(text_with_org, 
                  use_sentibignomics = True, 
                  use_henry = True, 
                  indicator = 'compound')

score_without_org = finvader(text_with_org, 
                  use_sentibignomics = True, 
                  use_henry = True, 
                  indicator = 'compound')

print(f'The score with Organization name: {score_with_org}')
print(f'The score without Organization name: {score_without_org}')

The score with Organization name: 0.3653
The score without Organization name: 0.3653


### AFINN:

<b>The AFINN lexicon is a list of English words rated for valence with an integer between minus five (negative) and plus five (positive). The ratings are a simple measure of sentiment strength of words. It's commonly used in sentiment analysis to assess the positivity or negativity of texts, such as stock market news articles, to gauge the sentiment around a particular stock or the market in general.
    
AFINN typically employs a "bag of words" approach for sentiment analysis. This means it analyzes the text based on the presence and frequency of words that have been pre-assigned sentiment scores, without considering the order of words, grammar, or any potential contextual meaning that might arise from their sequence. Each word is treated independently, and its score is added to the total sentiment score of the text.

In [114]:
from afinn import Afinn

afinn = Afinn()

# Feed in uncleaned raw article
unclean_raw_positve_score = afinn.score(positive_article['article'])
unclean_raw_negative_score = afinn.score(negative_article['article'])
print(f'The score for Uncleaned Raw Positive article is: {unclean_raw_positve_score}')
print(f'The score for Uncleaned Raw Negative article is: {unclean_raw_negative_score}')

# Feed in cleaned full article
clean_full_positve_score = afinn.score(cleaned_positive_article_spacy)
clean_full_negative_score = afinn.score(cleaned_negative_article_spacy)
print(f'The score for Cleaned Full Positive article is: {clean_full_positve_score}')
print(f'The score for Cleaned Full Negative article is: {clean_full_negative_score}')

# Feed in uncleaned sentences
df_negative['afinn_score'] = df_negative.sentence.apply(afinn.score)
df_positive['afinn_score'] = df_positive.sentence.apply(afinn.score)
print(f"The score for Uncleaned Sentence-Averaged Positive article is: {df_positive['afinn_score'].mean()}")
print(f"The score for Uncleaned Sentence-Averaged Negative article is: {df_negative['afinn_score'].mean()}")


The score for Uncleaned Raw Positive article is: 19.0
The score for Uncleaned Raw Negative article is: -11.0
The score for Cleaned Full Positive article is: 16.0
The score for Cleaned Full Negative article is: -3.0
The score for Uncleaned Sentence-Averaged Positive article is: 1.875
The score for Uncleaned Sentence-Averaged Negative article is: 0.2222222222222222


### SentiWordNet:

I decide not to try out this model because the initial investigation has shown that the SentiWordNet also use the 'bag of the word' approach. his approach considers the sentiment values of individual words or terms in the text without accounting for their order, context, or the structure of the sentence in which they appear. It would be very similar to AFINN

## Section 3: Get Sentiment Score by using Lexicon-Based Approach with LDA 

### Apply LDA and associate topic to corresponding sentences

In [128]:
import nltk
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Tokenize into words
    words = word_tokenize(text)
    # Lowercase, remove stopwords and non-alpha characters, and lemmatize
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words and word.isalpha()]
    return words


# Tokenize the article into sentences
sentences = sent_tokenize(cleaned_negative_article_manule)

# Preprocess sentences
processed_sentences = [preprocess(sentence) for sentence in sentences]

# Step 2: Prepare the dictionary and corpus
dictionary = corpora.Dictionary(processed_sentences)
corpus = [dictionary.doc2bow(text) for text in processed_sentences]

# Step 3: Apply LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Step 4: Extract topics and associated sentences
topics_sentences = {}
for i, row_list in enumerate(lda_model[corpus]):
    row = row_list[0] if lda_model.per_word_topics else row_list            
    # Sort the topics by the contribution (second element in tuple)
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the dominant topic, its percentage contribution, and keywords
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            # Save the dominant topic and the sentence
            if topic_num not in topics_sentences:
                topics_sentences[topic_num] = [(prop_topic, sentences[i], topic_keywords)]
            else:
                topics_sentences[topic_num].append((prop_topic, sentences[i], topic_keywords))

# Displaying the topics and associated sentences
for topic_num, sentences_info in topics_sentences.items():
    print(f"Topic {topic_num}:\n")
    for info in sentences_info:
        print(f"Sentence: {info[1]}\nContribution: {info[0]}\nKeywords: {info[2]}\n")
    print("\n")

Topic 1:

Sentence: Among news of poor performance in the Chinese economy, one of the most distressing signals for foreign retailers and manufacturers is Apple's AAPL announcement on Wednesday that the company had reduced revenue expectations due to poor iPhone sales in China.
Contribution: 0.9689568281173706
Keywords: apple, china, percent, company, sale, market, new, foreign, domestic, stock

Sentence: Apple's announcement caused its stock to fall seven percent Wednesday evening and 10 percent on Thursday.
Contribution: 0.9330894947052002
Keywords: apple, china, percent, company, sale, market, new, foreign, domestic, stock

Sentence: Apple itself has lost $300 billion in stock value since becoming the first company to reach a market capitalization of $1 trillion in October 2018.
Contribution: 0.9463534355163574
Keywords: apple, china, percent, company, sale, market, new, foreign, domestic, stock

Sentence: U. S. President Donald Trump has publicly criticized Apple and other American 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Calculate Sementic Score based on topics:

In [129]:
# Without filtering
article_score = 0
topic_count = 0

for topic_num, sentences_info in topics_sentences.items():
    print(f"Topic {topic_num}:\n")
    topic_score = 0
    sentence_count = 0
    for info in sentences_info:
        sentence = info[1]
        lower_sentence = sentence.lower()
        
        sen_score = finvader(lower_sentence, 
                  use_sentibignomics = True, 
                  use_henry = True, 
                  indicator = 'compound')
        topic_score += sen_score
        sentence_count += 1
        print(f"Sentence: {lower_sentence}\nScore: {sen_score}\n")
    topic_count += 1
    topic_score = topic_score / sentence_count
    print(f"topic score: {topic_score}")
    print("\n")
    article_score += topic_score

article_score = article_score / topic_count

print(f"article score: {article_score}")

Topic 1:

Sentence: among news of poor performance in the chinese economy, one of the most distressing signals for foreign retailers and manufacturers is apple's aapl announcement on wednesday that the company had reduced revenue expectations due to poor iphone sales in china.
Score: -0.0439

Sentence: apple's announcement caused its stock to fall seven percent wednesday evening and 10 percent on thursday.
Score: -0.3612

Sentence: apple itself has lost $300 billion in stock value since becoming the first company to reach a market capitalization of $1 trillion in october 2018.
Score: -0.3071

Sentence: u. s. president donald trump has publicly criticized apple and other american companies for manufacturing goods in china instead of the united states.
Score: 0.0823

Sentence: many tech companies have expressed disagreement with the unpredictability of trump's methods in trade negotiation with china.
Score: -0.0142

Sentence: "make your products in the united states instead of china.
Sco

In [130]:
# Filter out irrelevant sentence and lower score sentences

article_score = 0
topic_count = 0
word_list = ['apple','appl']

for topic_num, sentences_info in topics_sentences.items():
    print(f"Topic {topic_num}:\n")
    topic_score = 0
    sentence_count = 0
    for info in sentences_info:
        sentence = info[1]
        lower_sentence = sentence.lower()
        
        if any(word.lower() in lower_sentence for word in word_list):
            sen_score = finvader(lower_sentence, 
                      use_sentibignomics = True, 
                      use_henry = True, 
                      indicator = 'compound')
            
            if abs(sen_score) > 0.1:
                topic_score += sen_score
                sentence_count +=1
                print(f"Sentence: {lower_sentence}\nScore: {sen_score}\n")
    if sentence_count >= 1:
        topic_count += 1
        topic_score = topic_score / sentence_count
        print(f"topic score: {topic_score}")
        print("\n")
        article_score += topic_score

article_score = article_score / topic_count

print(f"article score: {article_score}")

    

    

Topic 1:

Sentence: apple's announcement caused its stock to fall seven percent wednesday evening and 10 percent on thursday.
Score: -0.3612

Sentence: apple itself has lost $300 billion in stock value since becoming the first company to reach a market capitalization of $1 trillion in october 2018.
Score: -0.3071

topic score: -0.33415


Topic 0:

Sentence: china's ongoing trade dispute with the united states is another pressing concern for apple.
Score: 0.4118

Sentence: apple has not announced plans to build factories in the united states, but it has committed to repatriating roughly $250 billion from its foreign holdings due to recent reductions in the corporate tax code.
Score: 0.5507

topic score: 0.48124999999999996


Topic 4:

Sentence: the dow jones industrial average fell 660 points as news of apple's sale slump permeated the market, particularly affecting stocks exposed to the chinese market like boeing & co. ba, deere & co. de, qualcomm, inc. qcom and tiffany & co. tif.
Scor

Problem: Soem sentences that discuss other topics will also be captured

What to try next: Give different topic different weight