<h3><B>Preprocessing the data

In [55]:
import nltk 
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('wordnet')
 nltk.download('stopwords')
 nltk.download('vader_lexicon')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import TreebankWordTokenizer
from nrclex import NRCLex
from transformers import pipeline
ner_pipeline =pipeline('ner',grouped_entities=True)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884

Hardware accelerator e.g. GPU is av

In [56]:
import pandas as pd

str_data = ["Apple’s Challenge in China Rises With New Rival Phones and AI Delay"]
str_data[0]=str_data[0].lower()
df = pd.DataFrame(str_data, columns=['Text'])

# Define a function to map POS tags to WordNet tags
def get_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Initialize lemmatizer and tokenizer
lemmatizer = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()
stop_words = set(stopwords.words('english'))

# Apply lemmatization and stop words removal
df['Text'] = df['Text'].apply(
    lambda x: " ".join([
        lemmatizer.lemmatize(token, get_pos_tag(tag))
        for token, tag in nltk.pos_tag(tokenizer.tokenize(x))
        if token not in stop_words
    ]) if pd.notnull(x) else ""
)

print(df['Text'].head())

0    apple’s challenge china rise new rival phone a...
Name: Text, dtype: object


In [57]:
inputdf=pd.DataFrame(columns=['Vader_sentiment_score', 'Blob_polarity', 'BlobSubjectivity',
       'positive_word_count', 'negative_word_count', 'person_count',
       'organization_count', 'location_count', 'anger', 'anticipation',
       'disgust', 'fear', 'joy', 'sadness', 'trust', 'Ticker'])


<h3><b>VADER SENTIMENT SCORES

In [58]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer=SentimentIntensityAnalyzer()
def extratVaderFeatures(text):
    score=analyzer.polarity_scores(text)
    sentimentScore=score['compound']
    return sentimentScore
inputdf['Vader_sentiment_score']=df['Text'].apply(extratVaderFeatures)

<h3><b>TEXTBLOB SUBJECTIVITY AND POLARITY

In [59]:
from textblob import TextBlob
def extractTextBlobSubjectivity(text):
    blob=TextBlob(text)
    polarity=blob.sentiment.polarity
    subjectivity=blob.sentiment.subjectivity
    return polarity,subjectivity
inputdf['Blob_polarity'],inputdf['BlobSubjectivity']=zip(*df['Text'].apply(extractTextBlobSubjectivity))

<h3><b>POSITIVE AND NEGATIVE WORD COUNT

In [60]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


sia = SentimentIntensityAnalyzer()

def posneg_vader(text):
    tokens = text.split()
    positive_count = sum(1 for word in tokens if sia.polarity_scores(word)['compound'] > 0.05)
    negative_count = sum(1 for word in tokens if sia.polarity_scores(word)['compound'] < -0.05)
    return positive_count, negative_count

inputdf['positive_word_count'], inputdf['negative_word_count'] = zip(*df['Text'].apply(posneg_vader))

print(inputdf[['positive_word_count', 'negative_word_count']].head())


   positive_word_count  negative_word_count
0                    1                    1


<h3><b>NAME ENTITY RECOGNITION

In [61]:

def ner__(text):
    entities=ner_pipeline(text)
    person_count=sum(1 for entity in entities if entity['entity_group']=='PER')
    organization_count = sum(1 for entity in entities if entity['entity_group'] == 'ORG')
    location_count = sum(1 for entity in entities if entity['entity_group'] == 'LOC')
    return person_count, organization_count, location_count

# Apply the NER function to your data
inputdf['person_count'], inputdf['organization_count'], inputdf['location_count'] = zip(*df['Text'].apply(ner__))

print(inputdf[['person_count', 'organization_count', 'location_count']].head())


   person_count  organization_count  location_count
0             0                   0               0


<h3><b>EMOTION SCORES

In [62]:
from nrclex import NRCLex
def nrc(text):
    emotion=NRCLex(text)
    scores=emotion.raw_emotion_scores
    return scores.get('anger',0),scores.get('anticipation',0),scores.get('disgust',0),scores.get('fear',0),scores.get('joy',0),scores.get('sadness',0),scores.get('trust',0)
inputdf['anger'],inputdf['anticipation'],inputdf['disgust'],inputdf['fear'],inputdf['joy'],inputdf['sadness'],inputdf['trust']= zip(*df['Text'].apply(nrc))


<h2><b>General news


Mapping of categories to numeric values:

AAPL: 0

AMZN: 1

MSFT: 2

NVDA: 3

TSLA: 4

<h3><b>COMPANY??

In [63]:
inputdf['Ticker']=1

<h4>Random forest classifications

In [64]:
import pickle
with open("general data models/rf_close_classifier.pkl", 'rb') as file:
    GRfc_classifier = pickle.load(file)

print("Random Forest model loaded from 'rf_close_classifier.pkl'.")


Random Forest model loaded from 'rf_close_classifier.pkl'.


In [65]:
probabilities = GRft_classifier.predict_proba(inputdf)

prob = probabilities[0,1]
print(prob)

0.5445931164921403


In [66]:
import pickle
with open("general data models/rf_trade_classifier.pkl", 'rb') as file:
    GRft_classifier = pickle.load(file)

print("Random Forest model loaded from 'rf_trade_classifier.pkl'.")


Random Forest model loaded from 'rf_trade_classifier.pkl'.


In [67]:
probabilities = GRft_classifier.predict_proba(inputdf)

prob = probabilities[0,1]
print(prob)

0.5445931164921403


<h4>Random forest regression

In [68]:
import pickle
with open("general data models/rf_trade_regressor.pkl", 'rb') as file:
    GRft_regression = pickle.load(file)

print("Random Forest model loaded from 'rf_trade_regressor.pkl'.")


Random Forest model loaded from 'rf_trade_regressor.pkl'.


In [69]:
probabilities = GRft_regression.predict(inputdf)

prob = probabilities[0]
print(prob)

2.080301958537959


In [70]:
import pickle
with open("general data models/rf_close_regressor.pkl", 'rb') as file:
    GRfc_regression = pickle.load(file)

print("Random Forest model loaded from 'rf_close_regressor.pkl'.")


Random Forest model loaded from 'rf_close_regressor.pkl'.


In [71]:
probabilities = GRfc_regression.predict(inputdf)

prob = probabilities[0]
print(prob)

-0.14506524471297275


<h4>XGBOOST classification

In [72]:
import pickle
with open("general data models/XGB_close_classifier.pkl", 'rb') as file:
    GXGBc_classifier = pickle.load(file)

print("XGB  model loaded from 'XGB_close_classifier.pkl'.")


XGB  model loaded from 'XGB_close_classifier.pkl'.


In [73]:
probabilities = GXGBc_classifier.predict_proba(inputdf)

prob = probabilities[0,1]
print(prob)

0.51822203


In [74]:
import pickle
with open("general data models/XGB_trade_classifier.pkl", 'rb') as file:
    GXGBt_classifier = pickle.load(file)

print("XGB model loaded from 'XGB_trade_classifier.pkl'.")


XGB model loaded from 'XGB_trade_classifier.pkl'.


In [75]:
probabilities = GXGBt_classifier.predict_proba(inputdf)

prob = probabilities[0,1]
print(prob)

0.53991616


<h3>XGBOOST regression

In [76]:
import pickle
with open("general data models/xgb_regressor_trade.pkl", 'rb') as file:
    GXGBt_regression = pickle.load(file)

print("XGB model loaded from 'xgb_regressor_trade.pkl'.")


XGB model loaded from 'xgb_regressor_trade.pkl'.


In [77]:
probabilities = GXGBt_regression.predict(inputdf)

prob = probabilities[0]
print(prob)

3.5952377


In [78]:
import pickle
with open("general data models/xgb_regressor_close.pkl", 'rb') as file:
    GXGBc_regression = pickle.load(file)

print("XGB model loaded from 'xgb_regressor_close.pkl'.")


XGB model loaded from 'xgb_regressor_close.pkl'.


In [79]:
probabilities = GXGBc_regression.predict(inputdf)

prob = probabilities[0]
print(prob)

0.099871956


In [80]:
import plotly.graph_objects as go


# Create the gauge chart
fig = go.Figure(go.Indicator(
    mode="gauge+number",
    value=prob,
    title={'text': "Market Sentiment"},
    gauge={
        'axis': {'range': [0, 1]},
        'bar': {'color': "darkblue"},
        'steps': [
            {'range': [0, 0.33], 'color': "red"},
            {'range': [0.33, 0.67], 'color': "yellow"},
            {'range': [0.67, 1], 'color': "green"}
        ],
        'threshold': {
            'line': {'color': "black", 'width': 4},
            'thickness': 0.75,
            'value': prob
        }
    }
))

fig.show()
