In [3]:
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to fetch historical stock data
def fetch_stock_data(symbol, api_key):
    base_url = 'https://www.alphavantage.co/query'
    function = 'TIME_SERIES_DAILY'
    outputsize = 'compact'  # Adjust as per your data needs
    params = {
        'function': function,
        'symbol': symbol,
        'outputsize': outputsize,
        'apikey': api_key
    }
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)
        data = response.json()
        if 'Time Series (Daily)' in data:
            df = pd.DataFrame(data['Time Series (Daily)']).T
            df.index = pd.to_datetime(df.index)
            df = df.astype(float)
            return df
        else:
            print(f"No stock data found for {symbol}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching stock data: {e}")
        return None

# Function to fetch news headlines for a stock symbol
def fetch_news_headlines(symbol, news_api_key):
    base_url = f'https://newsapi.org/v2/everything'
    params = {
        'q': symbol,
        'apiKey': news_api_key
    }
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)
        data = response.json()
        if 'articles' in data:
            headlines = [article['title'] for article in data['articles']]
            return headlines
        else:
            print(f"No articles found for {symbol}")
            return []
    except requests.exceptions.RequestException as e:
        print(f"Error fetching news data: {e}")
        return []

# Alpha Vantage API key (replace with your own key)
api_key = 'H84F99ZJFXZWJ9UI'

# News API key (replace with your own key)
news_api_key = '18e008e7cf6d43cc9c826b2d991ddb8f'

# Example usage to fetch data for a stock (e.g., Apple)
symbol = 'AAPL'
stock_data = fetch_stock_data(symbol, api_key)
news_headlines = fetch_news_headlines(symbol, news_api_key)

# Example of feature engineering combining textual and financial data
if stock_data is not None and news_headlines:
    # Extract financial features (e.g., moving averages)
    stock_data['MA_50'] = stock_data['4. close'].rolling(window=50).mean()
    stock_data['MA_200'] = stock_data['4. close'].rolling(window=200).mean()

    # Combine textual data (news headlines)
    headlines_text = ' '.join(news_headlines)

    # Perform text preprocessing
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(headlines_text.lower())
    filtered_tokens = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    preprocessed_text = ' '.join(lemmatized_tokens)

    # Combine financial and textual features
    X = stock_data[['MA_50', 'MA_200']]
    X['headline_sentiment'] = preprocessed_text

    y = stock_data['4. close'].shift(-1)  # Predict next day's closing price

    # Drop NaN values
    X.dropna(inplace=True)
    y.dropna(inplace=True)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a pipeline for text vectorization and modeling
    model_pipeline = make_pipeline(
        TfidfVectorizer(max_features=1000),
        RandomForestRegressor(n_estimators=100, random_state=42)
    )

    # Train the model
    model_pipeline.fit(X_train['headline_sentiment'], y_train)

    # Evaluate the model
    y_pred = model_pipeline.predict(X_test['headline_sentiment'])
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')

    # Example prediction (you can modify this based on your needs)
    last_headline = [preprocessed_text]  # Use the latest available headline
    predicted_price = model_pipeline.predict(last_headline)
    print(f'Predicted price for {symbol}: {predicted_price[0]}')
else:
    print(f'Failed to fetch data for {symbol} or no news headlines available')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['headline_sentiment'] = preprocessed_text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplac

ValueError: Found input variables with inconsistent numbers of samples: [0, 99]