<a href="https://colab.research.google.com/github/Shaurya-S0603/Stock-Market-Analyzer/blob/main/STOCK_MARKET_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CREATING THE BOT AND TRAINING IT**

**Importing Required Packages.**

In [11]:
!pip install yfinance ta xgboost lightgbm catboost pandas_datareader requests textblob --quiet

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import requests
import yfinance as yf
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from ta import add_all_ta_features
from textblob import TextBlob
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

**Mounting Google Drive and loading the API keys**

In [None]:
from google.colab import drive, userdata

# Mount Drive at the conventional top-level mount point. The previous mount
# point ('/content/drive/StockMarketAnalyzer/') sits inside a parent directory
# that does not exist on a fresh runtime, and the user's own files are exposed
# under '<mount point>/MyDrive', not at the mount root.
drive.mount('/content/drive')

# Folder inside the user's Drive that holds the sentiment cache between sessions.
CACHE_DIR = '/content/drive/MyDrive/StockMarketAnalyzer'
os.makedirs(CACHE_DIR, exist_ok=True)
CACHE_FILENAME = os.path.join(CACHE_DIR, 'stock_sentiment_cache.csv')

# API keys are read from Colab's secret store so they never appear in the notebook.
fmp = userdata.get('fmp')
fsa = userdata.get('fsa')

**Getting the training dataset and exploring it**

In [12]:
# Ticker and date window to analyze; change `stock_symbol` to study another stock.
stock_symbol = 'GOOG'
start_date = '2021-01-01'
end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

# API keys come from the `fmp` / `fsa` variables set in an earlier cell.
FMP_API_KEY, FINNHUB_API_KEY = fmp, fsa

# Daily price history for the chosen ticker.
stock_data = yf.download(stock_symbol, start=start_date, end=end_date)

# When the columns arrive as a two-level index, keep only the first level.
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.droplevel(1)

# Fill gaps: time-weighted interpolation first, then back/forward fill the edges.
stock_data = stock_data.interpolate(method='time')
stock_data = stock_data.bfill()
stock_data = stock_data.ffill().copy()

[*********************100%***********************]  1 of 1 completed


**Fundamentals (FMP), Macro Data (FRED CPI) and News Sentiment (Finnhub)**

In [13]:
def fetch_fundamental_data(symbol, apikey):
    """Fetch the latest income-statement fundamentals for ``symbol`` from FMP.

    Args:
        symbol: Ticker symbol interpolated into the request URL.
        apikey: Financial Modeling Prep API key.

    Returns:
        dict with the keys ``eps``, ``grossProfit``, ``revenue`` and
        ``costOfRevenue``. Any value that is missing, ``null`` in the response,
        or unavailable because the request failed is ``np.nan``.
    """
    fields = ("eps", "grossProfit", "revenue", "costOfRevenue")
    url = f'https://financialmodelingprep.com/api/v3/income-statement/{symbol}?limit=1&apikey={apikey}'

    data = {}
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        resp = requests.get(url, timeout=30)
        if resp.status_code == 200:
            payload = resp.json()  # parse the body once
            if isinstance(payload, list) and payload and isinstance(payload[0], dict):
                data = payload[0]
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON body: fall back to NaN for every field.
        print(f"Error fetching fundamental data: {exc}")

    result = {}
    for field in fields:
        value = data.get(field)
        # JSON null arrives as None; normalise it so downstream columns are numeric.
        result[field] = np.nan if value is None else value
    return result

# Broadcast each scalar fundamental across every row of the price frame.
fundamentals = fetch_fundamental_data(stock_symbol, FMP_API_KEY)
for k, v in fundamentals.items():
    stock_data[k] = v

try:
    cpi = web.DataReader('CPIAUCSL', 'fred', start_date, end_date)
    cpi = cpi.iloc[:, 0].sort_index()
    # CPI observations are far sparser than trading days and their dates need
    # not be trading days. A plain reindex would discard every observation whose
    # date is absent from the price index before it could be carried forward,
    # so forward-fill *during* the reindex (latest observation on or before
    # each trading day), then back-fill rows that precede the first observation.
    cpi = cpi.reindex(stock_data.index, method='ffill').bfill()
    stock_data['cpius'] = cpi.to_numpy()
except Exception as e:
    # Best effort: the macro feature is optional, so keep going without it.
    print(f"Error fetching macro data: {e}")
    stock_data['cpius'] = np.nan

def fetch_news_headlines(symbol, date_str, apikey):
    """Return the Finnhub company-news headlines for ``symbol`` on one day.

    Args:
        symbol: Ticker symbol.
        date_str: Day to query, used as both the ``from`` and ``to`` bound.
        apikey: Finnhub API token.

    Returns:
        list[str]: headline strings. Empty when the request fails, the status
        is not 200, the body is not a JSON list, or no article has a headline.
    """
    url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={date_str}&to={date_str}&token={apikey}'
    try:
        # A timeout keeps one stalled request from blocking the whole date loop.
        resp = requests.get(url, timeout=30)
        if resp.status_code != 200:
            return []
        news = resp.json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treat the day as having no news.
        return []

    # Error responses may be a JSON object rather than a list of articles.
    if not isinstance(news, list):
        return []

    # Skip malformed items individually instead of discarding the whole day,
    # and keep only string headlines so the sentiment scorer never sees None.
    return [
        article['headline']
        for article in news
        if isinstance(article, dict) and isinstance(article.get('headline'), str)
    ]

def compute_sentiment(headlines):
    """Average TextBlob polarity over ``headlines``.

    Returns ``0.0`` when ``headlines`` is empty or falsy; otherwise the mean of
    each headline's ``TextBlob(...).sentiment.polarity``.
    """
    if headlines:
        polarities = []
        for text in headlines:
            polarities.append(TextBlob(text).sentiment.polarity)
        return np.mean(polarities)
    return 0.0

def build_sentiment_index(symbol, dates, apikey):
    """Score news sentiment for ``symbol`` on every entry of ``dates``.

    Each date is formatted as ``YYYY-MM-DD``, its headlines are fetched with
    ``fetch_news_headlines`` and scored with ``compute_sentiment``. One request
    is issued per date, back to back, with no pause between them.

    Returns:
        pd.Series of scores indexed by ``dates``, in the same order.
    """
    def _score_for(day):
        day_headlines = fetch_news_headlines(symbol, day.strftime('%Y-%m-%d'), apikey)
        return compute_sentiment(day_headlines)

    return pd.Series([_score_for(day) for day in dates], index=dates)

def load_or_build_sentiment(symbol, dates, apikey, cache_file):
    """Return daily sentiment for ``dates``, using a CSV cache when possible.

    If ``cache_file`` exists it is loaded, and only the dates absent from it are
    fetched and appended. Otherwise every date is fetched and the cache is
    created. The cache is always written with a ``sentiment`` column so it can
    be read back on the next run.

    NOTE(review): the cache is keyed by date only, since ``symbol`` is not
    stored in the file. Reusing one ``cache_file`` for a different ticker would
    return the previous ticker's scores, so use one file per symbol.

    Args:
        symbol: Ticker symbol passed through to ``build_sentiment_index``.
        dates: ``pd.DatetimeIndex`` of the days that need a score.
        apikey: API token passed through to ``build_sentiment_index``.
        cache_file: Path of the CSV cache.

    Returns:
        pd.Series named ``sentiment``. On a full cache hit this is the whole
        cached series, which may cover more dates than ``dates``.
    """
    def _save(series):
        # An unnamed Series is written with the header "0", which made the
        # original cached['sentiment'] lookup fail on the second run.
        series = series.rename('sentiment')
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        series.to_csv(cache_file)
        return series

    if not os.path.exists(cache_file):
        print("Cache not found, fetching sentiment for all dates...")
        return _save(build_sentiment_index(symbol, dates, apikey))

    print(f"Loading cached sentiment data from {cache_file} ...")
    cached = pd.read_csv(cache_file, index_col=0, parse_dates=True)
    # Fall back to the first column for caches written before the column was named.
    if 'sentiment' in cached.columns:
        cached_sentiment = cached['sentiment']
    else:
        cached_sentiment = cached.iloc[:, 0]
    cached_sentiment = cached_sentiment.rename('sentiment')

    missing_dates = dates.difference(cached_sentiment.index)
    if len(missing_dates) == 0:
        return cached_sentiment

    print(f"Fetching sentiment for missing {len(missing_dates)} dates...")
    new_sentiments = build_sentiment_index(symbol, missing_dates, apikey)
    combined = pd.concat([cached_sentiment, new_sentiments.rename('sentiment')])
    # Guard against duplicate dates in a hand-edited cache, then keep it ordered.
    combined = combined[~combined.index.duplicated(keep='last')].sort_index()
    return _save(combined)

# Per-day news sentiment for every trading day, served from the cache when possible.
print("Fetching historical news sentiment, this may take a few minutes depending on date range...")
hist_sentiment = load_or_build_sentiment(
    symbol=stock_symbol,
    dates=stock_data.index,
    apikey=FINNHUB_API_KEY,
    cache_file=CACHE_FILENAME,
)
# Align to the trading calendar; days without a score count as neutral (0).
stock_data['news_sentiment'] = hist_sentiment.reindex(index=stock_data.index).fillna(value=0)

Fetching historical news sentiment, this may take a few minutes depending on date range...


**Feature Engineering**

In [17]:
def engineer_features(df):
    """Add technical indicators, lagged closes and rolling volatility to ``df``.

    Steps:
      1. Drop input columns that are entirely NaN (for example a fundamentals
         or macro column whose fetch failed). Left in place, such a column makes
         every row incomplete, so the row-wise ``dropna`` below would find
         nothing to keep.
      2. Run ``add_all_ta_features`` on the OHLCV columns.
      3. Add ``Close_Lag_1`` .. ``Close_Lag_5`` and ``Volatility_21`` (21-row
         rolling standard deviation of ``Close``).
      4. Drop the warm-up rows that still contain NaN.

    Args:
        df: Price frame with ``Open``, ``High``, ``Low``, ``Close`` and
            ``Volume`` columns. It is not modified.

    Returns:
        A new DataFrame on the original index, with the former index also kept
        as a column by ``reset_index``. If removing NaN rows would leave nothing,
        a warning is printed and the frame is returned with its NaNs.
    """
    all_nan_mask = df.isna().all(axis=0)
    if len(df) and all_nan_mask.any():
        print(f"Dropping input columns with no data: {list(df.columns[all_nan_mask])}")
        data = df.loc[:, ~all_nan_mask.to_numpy()].copy()
    else:
        data = df.copy()

    original_index = data.index
    data = data.reset_index()

    # Duplicate column labels make data[col] a DataFrame; keep the first copy
    # so the indicator library receives one-dimensional inputs.
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        if col in data.columns and isinstance(data[col], pd.DataFrame):
            data[col] = data[col].iloc[:, 0]

    data = add_all_ta_features(
        data, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True
    )
    data = data.set_index(original_index)

    for lag in range(1, 6):
        data[f'Close_Lag_{lag}'] = data['Close'].shift(lag)
    data['Volatility_21'] = data['Close'].rolling(window=21).std()

    # Compute the cleaned frame once and reuse it.
    cleaned = data.dropna()
    if cleaned.empty:
        print("Warning: Dropping NaNs resulted in an empty DataFrame. Consider adjusting the date range or feature engineering steps.")
        return data
    return cleaned

# Build the feature frame (TA indicators, lagged closes, rolling volatility) from the merged stock data.
engineered_data = engineer_features(stock_data)



**Implementing the SMA (Simple Moving Average) Strategy**


In [15]:
def SMA_strategy(data, short_window=20, long_window=50):
    """Build moving-average crossover signals from ``data['Close']``.

    Returns a DataFrame on ``data.index`` with the columns:
      * ``signal``: 1.0 where the short average is above the long one, else
        0.0. The first ``short_window`` rows are always 0.0.
      * ``short_mavg`` / ``long_mavg``: rolling means of ``Close``
        (``min_periods=1``).
      * ``positions``: row-to-row difference of ``signal``.
    """
    closes = data['Close']
    fast_avg = closes.rolling(window=short_window, min_periods=1).mean()
    slow_avg = closes.rolling(window=long_window, min_periods=1).mean()

    signals = pd.DataFrame(index=data.index)
    signals['signal'] = 0.0
    signals['short_mavg'] = fast_avg
    signals['long_mavg'] = slow_avg

    # Only rows after the first `short_window` entries may carry a signal.
    active_rows = signals.index[short_window:]
    fast_tail = signals['short_mavg'].iloc[short_window:]
    slow_tail = signals['long_mavg'].iloc[short_window:]
    signals.loc[active_rows, 'signal'] = (fast_tail > slow_tail).astype(float).to_numpy()

    signals['positions'] = signals['signal'].diff()
    return signals

# Moving-average crossover signals using the default 20/50-row windows.
signals = SMA_strategy(engineered_data)

# **PREDICTION**

In [None]:
# Features are every engineered column except the target and the date column.
X = engineered_data.drop(columns=['Close', 'Date'], errors='ignore')
y = engineered_data['Close']

# NOTE(review): X still holds same-day Open/High/Low/Volume and indicators
# computed from the same day's Close, so the hold-out metrics below are likely
# optimistic. Consider predicting the next day's Close instead. TODO confirm.

# Chronological 80/20 split. It is done BEFORE the hyper-parameter search so
# the hold-out period never influences which parameters are selected.
split_idx = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_idx].copy(), X.iloc[split_idx:].copy()
y_train, y_test = y.iloc[:split_idx].copy(), y.iloc[split_idx:].copy()

tscv = TimeSeriesSplit(n_splits=5)
xgb_params = {'n_estimators': [2000, 5000], 'learning_rate': [0.001, 0.002]}
model = XGBRegressor(objective='reg:squarederror', random_state=42)
grid = GridSearchCV(model, param_grid=xgb_params, cv=tscv, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the data passed to fit(), i.e. the whole training slice.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Hold-out metrics, in price units (RMSE, residual std) or unitless (R^2).
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_test, y_pred)
residuals = y_test - y_pred
error_std = np.std(residuals)

future_days = 5
future_dates = pd.bdate_range(start=engineered_data.index[-1] + pd.Timedelta(days=1), periods=future_days)

# One-row DataFrame (not a bare array) so the model receives the same column
# names and dtypes it was trained on.
last_known_features = X.iloc[[-1]].copy()
future_predictions = []
last_close = y.iloc[-1]
previous_close = last_close

for i in range(future_days):
    # Roll the lag window forward BEFORE predicting. Each training row pairs
    # Close_Lag_1 with the previous day's close, so features for the first
    # future day need Close_Lag_1 == last known close. Predicting first and
    # shifting afterwards labels the estimate for the last known day as the
    # next day's forecast.
    for lag in range(5, 1, -1):
        last_known_features[f'Close_Lag_{lag}'] = last_known_features[f'Close_Lag_{lag-1}']
    last_known_features['Close_Lag_1'] = previous_close

    # All non-lag features stay frozen at their last observed values.
    pred_close = float(best_model.predict(last_known_features)[0])
    future_predictions.append(pred_close)
    previous_close = pred_close

future_df = pd.DataFrame({'Date': future_dates, 'Predicted_Close': future_predictions}).set_index('Date')
# Percentage change from the last observed close to the final forecast day.
predicted_return = (future_df['Predicted_Close'].iloc[-1] - last_close) / last_close * 100

# Decision thresholds on the predicted return, in percent.
buy_threshold = 1.0
sell_threshold = -1.0
if predicted_return > buy_threshold:
    decision = "Invest (Buy)"
elif predicted_return < sell_threshold:
    decision = "Pull out (Sell)"
else:
    decision = "Hold / No action"

# `error_std` is the std of price residuals (price units) while
# `predicted_return` is a percentage, so convert the error to a percentage of
# the last close before combining them. This is a rough band: it reuses the
# one-step hold-out error for a multi-day return.
error_std_pct = error_std / last_close * 100
conf_interval = (predicted_return - 2 * error_std_pct, predicted_return + 2 * error_std_pct)


**Results**

In [None]:
# Assemble the report first, then emit it in one call; the output is unchanged.
report_lines = (
    f"Investment Decision based on 5-day forecast: {decision}",
    f"Predicted 5-day return: {predicted_return:.3f}%",
    f"Test MSE: {test_mse:.4f}",
    f"Test RMSE: {rmse:.4f}",
    f"Test R^2 Score: {r2:.4f}",
    f"Approximate 95% confidence interval for predicted return error: [{conf_interval[0]:.3f}%, {conf_interval[1]:.3f}%]",
    f"\n5-Day Forecast:\n{future_df}",
)
print("\n".join(report_lines))

**Plot: Actual vs Forecast**

In [None]:
# Historical closes with the forecast overlaid, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(engineered_data.index, engineered_data['Close'], label='Actual Close Price', color='blue')
ax.plot(future_df.index, future_df['Predicted_Close'], label='Predicted Close Price', color='red', linestyle='dashed')
ax.set_title(f"{stock_symbol} Close Price and 5-Day Forecast")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
plt.show()