1. Import Necessary Packages 

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
# Make sure the VADER lexicon used by nltk's SentimentIntensityAnalyzer is available locally;
# when it is already cached, nltk only reports that the package is up-to-date.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/teddy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

2. Perform Sentiment Analysis

In [3]:
# VADER analyzer instance; the sentiment helper in this notebook reads it as a global
sia = SentimentIntensityAnalyzer()

# Load the raw analyst-ratings CSV (the path is relative to the working directory)
RAW_RATINGS_CSV = "../Data/raw_analyst_ratings.csv"
data = pd.read_csv(RAW_RATINGS_CSV)

# Define a function to get sentiment from VADER
def get_sentiment_vader(text, threshold=0.05):
    """Label a piece of text as 'positive', 'negative' or 'neutral' using VADER.

    Scoring is delegated to the module-level ``sia`` analyzer; only the
    'compound' entry of the scores it returns is used.

    Parameters
    ----------
    text : str
        Text to score. Missing values (None, NaN, pd.NA) are labelled
        'neutral' without calling the analyzer; any other non-string value
        is converted with ``str()`` before scoring.
    threshold : float, default 0.05
        Cut-off applied to the compound score: ``>= threshold`` is
        'positive', ``<= -threshold`` is 'negative', and everything in
        between is 'neutral'.

    Returns
    -------
    str
        One of 'positive', 'negative' or 'neutral'.
    """
    if not isinstance(text, str):
        # pd.read_csv yields NaN for empty cells; there is no text to score,
        # so treat a missing headline as carrying no sentiment instead of
        # letting the whole .apply() abort on a non-string value.
        if pd.isna(text):
            return 'neutral'
        text = str(text)

    compound = sia.polarity_scores(text)['compound']
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

# Label every headline and keep the labels in a new 'sentiment' column
headline_labels = data['headline'].apply(get_sentiment_vader)
data['sentiment'] = headline_labels

# Preview the first few headline/label pairs
preview_cols = ['headline', 'sentiment']
print(data[preview_cols].head())


                                            headline sentiment
0            Stocks That Hit 52-Week Highs On Friday   neutral
1         Stocks That Hit 52-Week Highs On Wednesday   neutral
2                      71 Biggest Movers From Friday   neutral
3       46 Stocks Moving In Friday's Mid-Day Session   neutral
4  B of A Securities Maintains Neutral on Agilent...  positive


3. Identify Common Words in the Headlines

In [4]:
#  preprocessing function
def preprocess_text(text):
    """Normalise a headline before keyword extraction.

    The text is lower-cased and every character that is neither alphanumeric
    nor whitespace is dropped. Characters are removed, not replaced by a
    space, so "52-Week" becomes "52week".

    Parameters
    ----------
    text : str
        Text to clean. Missing values (None, NaN, pd.NA) yield an empty
        string; any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    if not isinstance(text, str):
        # pd.read_csv yields NaN for empty cells; return an empty document
        # rather than failing on a value that has no .lower() method.
        if pd.isna(text):
            return ''
        text = str(text)

    lowered = text.lower()
    # Keep letters, digits and whitespace; drop punctuation and other symbols
    return ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())

# Clean every headline; the result goes in its own column so the raw text is kept
cleaned_headlines = data['headline'].apply(preprocess_text)
data['cleaned_headline'] = cleaned_headlines

In [6]:

# TF-IDF over the cleaned headlines: English stop words removed, at most 100 features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_tfidf = tfidf_vectorizer.fit_transform(data['cleaned_headline'])

# One aggregate score per keyword: sum each keyword's column over all headlines
tfidf_keywords = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = X_tfidf.sum(axis=0).A1  # .A1 flattens the summed result to 1-D

# Pair every keyword with its total score, highest score first
tfidf_keyword_df = (
    pd.DataFrame({'keyword': tfidf_keywords, 'score': tfidf_scores})
    .sort_values(by='score', ascending=False)
)

# Show the 20 highest-scoring keywords
print(tfidf_keyword_df.head(20))


       keyword         score
90      stocks  70003.945383
87      shares  51007.100031
96          vs  50848.869122
76     reports  50376.402194
25    earnings  49525.035651
51      market  47419.458995
94      update  45340.649320
28         est  43957.906077
7    announces  41100.923737
27         eps  41059.032021
11   benzingas  39428.083744
24  downgrades  34893.770513
13         buy  32758.885815
68          pt  31961.469730
95    upgrades  31365.657935
80        says  30855.928584
79       sales  30522.204646
59         new  27600.080579
73      raises  27146.600522
67       price  27079.916161
