In [16]:
import csv
import os

import pandas as pd
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# NewsAPI key, read from the NEWSAPI_KEY environment variable so the secret is
# never stored in the notebook (or in its saved outputs / version control).
# Falls back to an empty string when the variable is not set.
api_key = os.environ.get('NEWSAPI_KEY', '')

# Maps each stock ticker symbol to the company/fund name used as the news search query.
# NOTE(review): a few names look like former company names (e.g. 'Newmont Goldcorp',
# 'Universal Forest Products') — confirm they still match current news coverage.
stock_info = {
    'AAPL': 'Apple',
    'GOOGL': 'Alphabet',
    'AXON': 'Axon Enterprise',
    'BA': 'Boeing',
    'GD': 'General Dynamics',
    'NEM': 'Newmont Goldcorp',
    'FNV': 'Franco-Nevada',
    'GOLD': 'Barrick Gold',
    'BAC': 'Bank of America',
    'JPM': 'JPMorgan Chase',
    'BMO': 'Bank of Montreal',
    'GS': 'Goldman Sachs',
    'MS': 'Morgan Stanley',
    'SCHW': 'Charles Schwab',
    'NVO': 'Novo Nordisk',
    'REGN': 'Regeneron Pharmaceuticals',
    'VRTX': 'Vertex Pharmaceuticals',
    'ANET': 'Arista Networks',
    'DELL': 'Dell Technologies',
    # HP Inc. trades under HPQ; the ticker 'HP' belongs to Helmerich & Payne.
    'HPQ': 'HP Inc.',
    'IAU': 'iShares Gold Trust',
    'URE': 'ProShares Ultra Real Estate',
    'ELD': 'WisdomTree Emerging Markets Local Debt',
    'SSD': 'Simpson Manufacturing',
    'UFPI': 'Universal Forest Products',
    'WFG': 'West Fraser Timber',
    'BKNG': 'Booking Holdings',
    'TNL': 'Travel + Leisure',
    'RCL': 'Royal Caribbean Cruises',
    'NVDA': 'NVIDIA',
    'TSM': 'Taiwan Semiconductor Manufacturing',
    'AVGO': 'Broadcom'
}

# Define the API endpoint
endpoint = 'https://newsapi.org/v2/everything'

# Seconds to wait on a single NewsAPI request; without a timeout a stalled
# connection would block the cell indefinitely.
request_timeout = 10

# One record per English article: company name, VADER compound score, description.
# Re-created on every run of this cell so re-running does not duplicate rows.
articles_with_sentiment = []

# Initialize sentiment analyzer (requires NLTK's 'vader_lexicon' resource)
sid = SentimentIntensityAnalyzer()

# Query NewsAPI once per company name and score every article description
for stock_name in stock_info.values():
    params = {
        'q': stock_name,
        'language': 'en',
        'apiKey': api_key
    }

    # Make the API request; a failure for one company must not abort the whole pull
    try:
        response = requests.get(endpoint, params=params, timeout=request_timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Print only the exception type: the full message can contain the
        # request URL, and the URL carries the API key as a query parameter.
        print(f"Request for {stock_name!r} failed: {type(exc).__name__}")
        continue

    # Error payloads (bad key, rate limit, ...) carry no 'articles' key;
    # surface them instead of skipping silently.
    articles = data.get('articles') if isinstance(data, dict) else None
    if articles is None:
        message = data.get('message', data) if isinstance(data, dict) else data
        print(f"No articles returned for {stock_name!r}: {message}")
        continue

    for article in articles:
        # Articles without a description have nothing to score
        description = article.get('description')
        if description is None:
            continue

        # VADER's 'compound' value is the single overall polarity score
        articles_with_sentiment.append({
            'Stock Name': stock_name,
            'Sentiment': sid.polarity_scores(description)['compound'],
            'Description': description
        })

# Create a DataFrame from the collected data (one row per scored article)
df = pd.DataFrame(articles_with_sentiment)

# An empty record list yields a frame with no columns, which would otherwise
# surface below as an opaque KeyError on 'Sentiment'. Fail with an actionable message.
if df.empty:
    raise ValueError(
        "No articles were collected; check the NewsAPI key and the API responses."
    )

# Disabled: LabelEncoder that would encode the 'Stock Name' column into numerical labels
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df['Stock Name'] = le.fit_transform(df['Stock Name'])

df['Sentiment'] = df['Sentiment'].astype(float)  # Ensure sentiment values are float

# Split the data into features (X) and target (y)
# X = df[['Sentiment', 'Description']]
# y = df['Stock Name']

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
# vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# X_train_tfidf = vectorizer.fit_transform(X_train['Description'])
# X_test_tfidf = vectorizer.transform(X_test['Description'])

# Train a Random Forest regressor model
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X_train_tfidf, y_train)

# Predict stock symbols for the test set
# y_pred = model.predict(X_test_tfidf)

# Calculate Mean Squared Error
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

In [20]:
# Persist the scored articles to CSV, omitting the DataFrame index column
df.to_csv(path_or_buf='data_pull_0802.csv', index=False)

In [19]:
# Number of distinct company names present in the pull (missing values, if any, count as one)
df['Stock Name'].nunique(dropna=False)

32

In [18]:
# Mean VADER compound score per company. Select 'Sentiment' before aggregating:
# averaging the whole frame also tries to average the text 'Description' column,
# which raises TypeError on pandas >= 2.0 (numeric_only now defaults to False).
df.groupby('Stock Name')['Sentiment'].mean()

Stock Name
Alphabet                                  0.254535
Apple                                     0.266237
Arista Networks                           0.306775
Axon Enterprise                           0.302639
Bank of America                           0.061456
Bank of Montreal                          0.154227
Barrick Gold                              0.304206
Boeing                                    0.130618
Booking Holdings                          0.223077
Broadcom                                  0.300684
Charles Schwab                            0.180073
Dell Technologies                         0.348816
Franco-Nevada                             0.277600
General Dynamics                          0.220800
Goldman Sachs                             0.055744
HP Inc.                                   0.234636
JPMorgan Chase                            0.046795
Morgan Stanley                            0.258057
NVIDIA                                    0.218996
Newmont Goldcorp    