In [None]:
import yfinance as yf
import pandas as pd
import os
from ta import add_all_ta_features
import ta.utils 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import schedule
import time
import tensorflow as tf
from tensorflow.keras import layers
import requests
from transformers import pipeline
from sklearn.impute import SimpleImputer
import numpy as np
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [None]:
def fetch_intraday_data(ticker, period="5d", interval="1m"):
    """Download intraday price bars for one ticker via ``yf.download``.

    Args:
        ticker: Symbol to download, e.g. "SRF.NS".
        period: Look-back window forwarded to ``yf.download`` (default "5d").
        interval: Bar size forwarded to ``yf.download`` (default "1m").

    Returns:
        The downloaded DataFrame with single-level column names, or None when
        the download raised an exception or produced no rows.
    """
    try:
        data = yf.download(ticker, period=period, interval=interval)
    except Exception as e:
        print(f"Failed to fetch intraday data for {ticker}: {e}")
        return None

    if data is None or data.empty:
        print(f"No data found for {ticker}")
        return None

    # Some yfinance versions return two-level (field, ticker) columns even for
    # a single symbol; keep only the field level so df['Close'] is a Series.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    return data

In [None]:
# List of Indian midcap stocks, written as symbols with the ".NS" suffix.
# Every symbol appears exactly once so that no ticker is processed twice.
midcap_stocks = [
    "MSUMI.NS", "TORNTPOWER.NS", "GODREJPROP.NS", "SRF.NS",
    "APLAPOLLO.NS", "TVSMOTOR.NS", "LTIM.NS", "PAGEIND.NS",
    "AUROPHARMA.NS", "JINDALSTEL.NS", "BAJAJHLDNG.NS", "BATAINDIA.NS",
    "BHEL.NS", "CANBK.NS", "CHOLAFIN.NS", "CUB.NS", "DALMIASUG.NS",
    "ESCORTS.NS", "FEDERALBNK.NS", "FORTIS.NS", "GICRE.NS",
    "GMRINFRA.NS", "GNFC.NS", "GODREJAGRO.NS", "GRASIM.NS", "HAVELLS.NS",
    "HINDPETRO.NS", "INDHOTEL.NS", "JUBLFOOD.NS", "LICHSGFIN.NS",
    "M&MFIN.NS", "MANAPPURAM.NS", "MRF.NS", "NATCOPHARM.NS",
    "NCC.NS", "NMDC.NS", "OBEROIRLTY.NS", "PERSISTENT.NS", "PETRONET.NS",
    "RAMCOCEM.NS", "RBLBANK.NS", "SAIL.NS", "SUNTV.NS", "TATACOMM.NS",
    "TATAPOWER.NS", "THYROCARE.NS", "TORNTPHARM.NS", "TRENT.NS", "VOLTAS.NS",
    "WHIRLPOOL.NS", "YESBANK.NS", "ZEEL.NS", "ZYDUSWELL.NS",
    "ABBOTINDIA.NS", "ASHOKLEY.NS", "BALKRISIND.NS", "BEL.NS", "CONCOR.NS",
    "CROMPTON.NS", "DEEPAKNTR.NS", "DIXON.NS", "EMAMILTD.NS",
    "INDIAMART.NS", "IRCTC.NS", "JUBLPHARMA.NS", "LTTS.NS", "MFSL.NS",
    "METROPOLIS.NS", "PIIND.NS", "POLYCAB.NS", "RECLTD.NS",
    "SUPREMEIND.NS", "TATACONSUM.NS", "TV18BRDCST.NS", "VGUARD.NS",
    "VBL.NS", "VINATIORGA.NS", "ZENSARTECH.NS", "IDFCFIRSTB.NS",
    "SONACOMS.NS", "AMBUJACEM.NS", "GAIL.NS", "TATAELXSI.NS", "MAXHEALTH.NS",
    "LALPATHLAB.NS", "JSWENERGY.NS", "AARTIIND.NS", "ADANIGREEN.NS",
    "ABFRL.NS", "BANDHANBNK.NS", "BANKINDIA.NS", "BERGEPAINT.NS", "BOSCHLTD.NS",
    "CUMMINSIND.NS", "DMART.NS", "GLENMARK.NS", "GUJGASLTD.NS",
    "HAL.NS", "IIFLWAM.NS", "LICI.NS", "LUXIND.NS",
    "NAUKRI.NS", "PHOENIXLTD.NS", "RAJESHEXPO.NS", "SHREECEM.NS",
    "TATACHEM.NS", "THERMAX.NS", "TTKPRESTIG.NS", "UJJIVANSFB.NS", "VAKRANGEE.NS"
]

# Display name for every symbol in midcap_stocks, one entry per symbol.
ticker_to_company_name = {
    "MSUMI.NS": "Motherson Sumi Systems Ltd.",
    "TORNTPOWER.NS": "Torrent Power Ltd.",
    "GODREJPROP.NS": "Godrej Properties Ltd.",
    "SRF.NS": "SRF Ltd.",
    "APLAPOLLO.NS": "APL Apollo Tubes Ltd.",
    "TVSMOTOR.NS": "TVS Motor Company Ltd.",
    "LTIM.NS": "LTIMindtree Ltd.",
    "PAGEIND.NS": "Page Industries Ltd.",
    "AUROPHARMA.NS": "Aurobindo Pharma Ltd.",
    "JINDALSTEL.NS": "Jindal Steel & Power Ltd.",
    "BAJAJHLDNG.NS": "Bajaj Holdings & Investment Ltd.",
    "BATAINDIA.NS": "Bata India Ltd.",
    "BHEL.NS": "Bharat Heavy Electricals Ltd.",
    "CANBK.NS": "Canara Bank",
    "CHOLAFIN.NS": "Cholamandalam Investment and Finance Company Ltd.",
    "CUB.NS": "City Union Bank Ltd.",
    "DALMIASUG.NS": "Dalmia Bharat Sugar and Industries Ltd.",
    "ESCORTS.NS": "Escorts Ltd.",
    "FEDERALBNK.NS": "The Federal Bank Ltd.",
    "FORTIS.NS": "Fortis Healthcare Ltd.",
    "GICRE.NS": "General Insurance Corporation of India",
    "GMRINFRA.NS": "GMR Infrastructure Ltd.",
    "GNFC.NS": "Gujarat Narmada Valley Fertilizers & Chemicals Ltd.",
    "GODREJAGRO.NS": "Godrej Agrovet Ltd.",
    "GRASIM.NS": "Grasim Industries Ltd.",
    "HAVELLS.NS": "Havells India Ltd.",
    "HINDPETRO.NS": "Hindustan Petroleum Corporation Ltd.",
    "INDHOTEL.NS": "The Indian Hotels Company Ltd.",
    "JUBLFOOD.NS": "Jubilant FoodWorks Ltd.",
    "LICHSGFIN.NS": "LIC Housing Finance Ltd.",
    "M&MFIN.NS": "Mahindra & Mahindra Financial Services Ltd.",
    "MANAPPURAM.NS": "Manappuram Finance Ltd.",
    "MRF.NS": "MRF Ltd.",
    "NATCOPHARM.NS": "Natco Pharma Ltd.",
    "NCC.NS": "NCC Ltd.",
    "NMDC.NS": "NMDC Ltd.",
    "OBEROIRLTY.NS": "Oberoi Realty Ltd.",
    "PERSISTENT.NS": "Persistent Systems Ltd.",
    "PETRONET.NS": "Petronet LNG Ltd.",
    "RAMCOCEM.NS": "The Ramco Cements Ltd.",
    "RBLBANK.NS": "RBL Bank Ltd.",
    "SAIL.NS": "Steel Authority of India Ltd.",
    "SUNTV.NS": "Sun TV Network Ltd.",
    "TATACOMM.NS": "Tata Communications Ltd.",
    "TATAPOWER.NS": "Tata Power Company Ltd.",
    "THYROCARE.NS": "Thyrocare Technologies Ltd.",
    "TORNTPHARM.NS": "Torrent Pharmaceuticals Ltd.",
    "TRENT.NS": "Trent Ltd.",
    "VOLTAS.NS": "Voltas Ltd.",
    "WHIRLPOOL.NS": "Whirlpool of India Ltd.",
    "YESBANK.NS": "Yes Bank Ltd.",
    "ZEEL.NS": "Zee Entertainment Enterprises Ltd.",
    "ZYDUSWELL.NS": "Zydus Wellness Ltd.",
    "ABBOTINDIA.NS": "Abbott India Ltd.",
    "ASHOKLEY.NS": "Ashok Leyland Ltd.",
    "BALKRISIND.NS": "Balkrishna Industries Ltd.",
    "BEL.NS": "Bharat Electronics Ltd.",
    "CONCOR.NS": "Container Corporation of India Ltd.",
    "CROMPTON.NS": "Crompton Greaves Consumer Electricals Ltd.",
    "DEEPAKNTR.NS": "Deepak Nitrite Ltd.",
    "DIXON.NS": "Dixon Technologies (India) Ltd.",
    "EMAMILTD.NS": "Emami Ltd.",
    "INDIAMART.NS": "IndiaMART InterMESH Ltd.",
    "IRCTC.NS": "Indian Railway Catering and Tourism Corporation Ltd.",
    "JUBLPHARMA.NS": "Jubilant Pharmova Ltd.",
    "LTTS.NS": "L&T Technology Services Ltd.",
    "MFSL.NS": "Max Financial Services Ltd.",
    "METROPOLIS.NS": "Metropolis Healthcare Ltd.",
    "PIIND.NS": "PI Industries Ltd.",
    "POLYCAB.NS": "Polycab India Ltd.",
    "RECLTD.NS": "REC Ltd.",
    "SUPREMEIND.NS": "Supreme Industries Ltd.",
    "TATACONSUM.NS": "Tata Consumer Products Ltd.",
    "TV18BRDCST.NS": "TV18 Broadcast Ltd.",
    "VGUARD.NS": "V-Guard Industries Ltd.",
    "VBL.NS": "Varun Beverages Ltd.",
    "VINATIORGA.NS": "Vinati Organics Ltd.",
    "ZENSARTECH.NS": "Zensar Technologies Ltd.",
    "IDFCFIRSTB.NS": "IDFC First Bank Ltd.",
    "SONACOMS.NS": "Sona BLW Precision Forgings Ltd.",
    "AMBUJACEM.NS": "Ambuja Cements Ltd.",
    "GAIL.NS": "GAIL (India) Ltd.",
    "TATAELXSI.NS": "Tata Elxsi Ltd.",
    "MAXHEALTH.NS": "Max Healthcare Institute Ltd.",
    "LALPATHLAB.NS": "Dr. Lal PathLabs Ltd.",
    "JSWENERGY.NS": "JSW Energy Ltd.",
    "AARTIIND.NS": "Aarti Industries Ltd.",
    "ADANIGREEN.NS": "Adani Green Energy Ltd.",
    "ABFRL.NS": "Aditya Birla Fashion and Retail Ltd.",
    "BANDHANBNK.NS": "Bandhan Bank Ltd.",
    "BANKINDIA.NS": "Bank of India",
    "BERGEPAINT.NS": "Berger Paints India Ltd.",
    "BOSCHLTD.NS": "Bosch Ltd.",
    "CUMMINSIND.NS": "Cummins India Ltd.",
    "DMART.NS": "Avenue Supermarts Ltd.",
    "GLENMARK.NS": "Glenmark Pharmaceuticals Ltd.",
    "GUJGASLTD.NS": "Gujarat Gas Ltd.",
    "HAL.NS": "Hindustan Aeronautics Ltd.",
    "IIFLWAM.NS": "IIFL Wealth Management Ltd.",
    "LICI.NS": "Life Insurance Corporation of India",
    "LUXIND.NS": "Lux Industries Ltd.",
    "NAUKRI.NS": "Info Edge (India) Ltd.",
    "PHOENIXLTD.NS": "The Phoenix Mills Ltd.",
    "RAJESHEXPO.NS": "Rajesh Exports Ltd.",
    "SHREECEM.NS": "Shree Cement Ltd.",
    "TATACHEM.NS": "Tata Chemicals Ltd.",
    "THERMAX.NS": "Thermax Ltd.",
    "TTKPRESTIG.NS": "TTK Prestige Ltd.",
    "UJJIVANSFB.NS": "Ujjivan Small Finance Bank Ltd.",
    "VAKRANGEE.NS": "Vakrangee Ltd."
}


In [None]:
# Create the directory that process_stock_data writes its CSV files into;
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs('data/processed', exist_ok=True)

In [None]:
def clean_data(df):
    """Fill missing values by carrying the previous value forward, then backward.

    The forward fill closes gaps using the last known value; the backward fill
    then covers any leading rows that had no earlier value to copy from.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with the gaps filled. Columns that are entirely missing
        stay missing.
    """
    # fillna(method=...) is deprecated in current pandas; ffill()/bfill() are
    # the supported equivalents and give the same result.
    return df.ffill().bfill()

In [None]:
def add_technical_indicators(df, min_rows=100):
    """Append momentum, trend, volatility and volume indicator columns.

    Args:
        df: Price frame with 'Close', 'High', 'Low' and 'Volume' columns.
        min_rows: Minimum number of rows required before any indicator is
            computed (default 100).

    Returns:
        ``df`` itself, unchanged, when it has fewer than ``min_rows`` rows.
        Otherwise a frame filtered through ``ta.utils.dropna`` with the
        indicator columns added. Indicator warm-up rows contain NaN; callers
        drop them afterwards.
    """
    if len(df) < min_rows:
        print(f"Insufficient data to calculate all indicators for this stock. Data length: {len(df)}")
        return df

    df = ta.utils.dropna(df)
    close, high, low, volume = df['Close'], df['High'], df['Low'], df['Volume']

    # Momentum Indicators
    df['SMA_50'] = ta.trend.SMAIndicator(close=close, window=50).sma_indicator()
    df['EMA_50'] = ta.trend.EMAIndicator(close=close, window=50).ema_indicator()
    df['RSI'] = ta.momentum.RSIIndicator(close=close).rsi()
    df['Stoch_RSI'] = ta.momentum.StochRSIIndicator(close=close).stochrsi()
    df['Williams_R'] = ta.momentum.WilliamsRIndicator(high=high, low=low, close=close).williams_r()
    df['Awesome_Oscillator'] = ta.momentum.AwesomeOscillatorIndicator(high=high, low=low).awesome_oscillator()
    # Each multi-output indicator is built once and reused for all its columns.
    macd = ta.trend.MACD(close=close)
    df['MACD'] = macd.macd()
    df['MACD_Signal'] = macd.macd_signal()
    df['MACD_Diff'] = macd.macd_diff()
    df['TSI'] = ta.momentum.TSIIndicator(close=close).tsi()
    df['KAMA'] = ta.momentum.KAMAIndicator(close=close).kama()
    df['ROC'] = ta.momentum.ROCIndicator(close=close).roc()

    # Trend Indicators
    vortex = ta.trend.VortexIndicator(high=high, low=low, close=close)
    df['Vortex_Diff'] = vortex.vortex_indicator_diff()
    df['TRIX'] = ta.trend.TRIXIndicator(close=close).trix()
    df['Mass_Index'] = ta.trend.MassIndex(high=high, low=low).mass_index()
    df['CCI'] = ta.trend.CCIIndicator(high=high, low=low, close=close).cci()
    df['DPO'] = ta.trend.DPOIndicator(close=close).dpo()
    ichimoku = ta.trend.IchimokuIndicator(high=high, low=low)
    df['Ichimoku_A'] = ichimoku.ichimoku_a()
    df['Ichimoku_B'] = ichimoku.ichimoku_b()

    # Aroon-style columns: position of the highest / lowest Close inside each
    # rolling window, expressed as a percentage of the window length.
    window = 25
    rolling_close = close.rolling(window=window)
    df['Aroon_Up'] = 100 * rolling_close.apply(lambda x: (x.argmax() + 1) / window, raw=True)
    df['Aroon_Down'] = 100 * rolling_close.apply(lambda x: (x.argmin() + 1) / window, raw=True)
    df['Aroon_Indicator'] = df['Aroon_Up'] - df['Aroon_Down']

    # Volatility Indicators
    bollinger = ta.volatility.BollingerBands(close=close)
    df['Bollinger_Mid'] = bollinger.bollinger_mavg()
    df['Bollinger_Upper'] = bollinger.bollinger_hband()
    df['Bollinger_Lower'] = bollinger.bollinger_lband()
    df['Bollinger_PBand'] = bollinger.bollinger_pband()
    df['Bollinger_WBand'] = bollinger.bollinger_wband()
    keltner = ta.volatility.KeltnerChannel(high=high, low=low, close=close)
    df['Keltner_Channel_Center'] = keltner.keltner_channel_mband()
    df['Keltner_Channel_Upper'] = keltner.keltner_channel_hband()
    df['Keltner_Channel_Lower'] = keltner.keltner_channel_lband()
    donchian = ta.volatility.DonchianChannel(high=high, low=low, close=close)
    df['Donchian_Channel_Upper'] = donchian.donchian_channel_hband()
    df['Donchian_Channel_Lower'] = donchian.donchian_channel_lband()
    df['ATR'] = ta.volatility.AverageTrueRange(high=high, low=low, close=close).average_true_range()

    # Volume Indicators
    df['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=close, volume=volume).on_balance_volume()
    df['Chaikin_MF'] = ta.volume.ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume).chaikin_money_flow()
    df['Force_Index'] = ta.volume.ForceIndexIndicator(close=close, volume=volume).force_index()
    df['Ease_of_Movement'] = ta.volume.EaseOfMovementIndicator(high=high, low=low, volume=volume).ease_of_movement()
    df['Volume_Price_Trend'] = ta.volume.VolumePriceTrendIndicator(close=close, volume=volume).volume_price_trend()
    df['VWAP'] = ta.volume.VolumeWeightedAveragePrice(high=high, low=low, close=close, volume=volume).volume_weighted_average_price()

    # Long-window indicators are best-effort: a ValueError here is reported
    # and the columns computed so far are still returned.
    try:
        df['SMA_200'] = ta.trend.SMAIndicator(close=close, window=200).sma_indicator()
        df['EMA_200'] = ta.trend.EMAIndicator(close=close, window=200).ema_indicator()
        df['ADX'] = ta.trend.ADXIndicator(high=high, low=low, close=close).adx()
        df['Vortex_Pos'] = vortex.vortex_indicator_pos()
        df['Vortex_Neg'] = vortex.vortex_indicator_neg()
    except ValueError as e:
        print(f"Could not calculate some indicators due to insufficient data: {e}")

    return df

In [None]:
def apply_pca(df, n_components=0.95, standardize=True):
    """Replace the non-critical feature columns with principal components.

    Args:
        df: Feature frame. Every column except 'Close' is fed to PCA.
        n_components: Forwarded to ``PCA`` (default 0.95).
        standardize: When True (default) the imputed features are passed
            through ``StandardScaler`` before PCA, so that columns with large
            magnitudes do not dominate the components. Pass False to run PCA
            on the raw imputed values.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the 'Close' column (when
        present in ``df``) followed by columns 'PC1', 'PC2', ...
    """
    # Debugging: Print the columns before PCA
    print(f"Columns before PCA: {df.columns.tolist()}")

    # Keep 'Close' out of PCA, but only if it is actually there, so the later
    # re-attach step cannot fail on a frame without it.
    critical_columns = [col for col in ['Close'] if col in df.columns]
    non_critical_features = df.drop(columns=critical_columns)

    # Handle missing values by imputing with column means
    imputer = SimpleImputer(strategy='mean')
    features = imputer.fit_transform(non_critical_features)

    if standardize:
        features = StandardScaler().fit_transform(features)

    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(features)

    pca_df = pd.DataFrame(
        pca_features,
        columns=[f'PC{i+1}' for i in range(pca_features.shape[1])],
    )

    # Both frames use a 0..n-1 index so they line up row by row.
    return pd.concat([df[critical_columns].reset_index(drop=True), pca_df], axis=1)


In [None]:
def create_lagged_features(df, columns, lags=(1, 3, 5, 10)):
    """Add shifted copies of the given columns to ``df``.

    For each column and each lag a new column named ``<column>_lag_<lag>`` is
    added, holding the column's values shifted down by ``lag`` rows.

    Args:
        df: Frame to extend. It is modified in place.
        columns: Names of the columns to lag; each must exist in ``df``.
        lags: Row offsets to shift by (default 1, 3, 5 and 10). The default is
            a tuple so it cannot be altered between calls.

    Returns:
        The same ``df`` object with the lag columns added. The first ``lag``
        rows of each new column are NaN.
    """
    for column in columns:
        source = df[column]
        for lag in lags:
            df[f'{column}_lag_{lag}'] = source.shift(lag)
    return df

In [None]:
# The NewsAPI key is read from the NEWS_API_KEY environment variable so that
# the credential is never stored in the notebook itself.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '')
if not NEWS_API_KEY:
    print("NEWS_API_KEY environment variable is not set; news sentiment requests are expected to fail.")

# Sentiment classifier shared by the news-sentiment cells below.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
# NOT WORKING

def fetch_news_sentiment(ticker, company_name, start_date, end_date):
    """
    Fetches Indian business top headlines from NewsAPI and scores each title
    with the shared ``sentiment_analyzer`` pipeline.

    NOTE(review): the request is not filtered by company or by date.
    ``company_name`` is only used in log messages, and ``start_date`` /
    ``end_date`` are accepted but not used. Confirm whether per-company,
    per-period headlines are required before relying on the result.

    Args:
        ticker: Symbol written into the 'Ticker' column of the result.
        company_name: Name used in log messages.
        start_date: Currently unused.
        end_date: Currently unused.

    Returns:
        DataFrame with columns 'Date' (YYYY-MM-DD string), 'Sentiment' (mean
        signed score per date) and 'Ticker'. An empty DataFrame is returned
        when the request fails or yields no usable headlines.
    """
    url = "https://newsapi.org/v2/top-headlines"
    # Query values are passed separately so requests encodes them and the key
    # is not spliced into the URL by hand.
    params = {"country": "in", "category": "business", "apiKey": NEWS_API_KEY}

    # Retry strategy: up to 5 retries with exponential backoff on throttling
    # and server-side errors.
    retry_options = dict(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retry_methods = ["HEAD", "GET", "OPTIONS"]
    try:
        # Current urllib3 name for the option.
        retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
    except TypeError:
        # Older urllib3 releases only know the previous name.
        retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)

    try:
        # The context manager closes the session's connections when done.
        with requests.Session() as http:
            http.mount("https://", HTTPAdapter(max_retries=retry_strategy))
            # Certificate verification stays enabled; the timeout keeps a
            # stalled connection from blocking the notebook indefinitely.
            response = http.get(url, params=params, timeout=10)

        if response.status_code != 200:
            print(f"Failed to fetch news for {company_name}. Status Code: {response.status_code}")
            return pd.DataFrame()

        articles = response.json().get("articles", [])
        if not articles:
            print(f"No articles found for {company_name}.")
            return pd.DataFrame()

        # Score each headline; negative labels get a negative score.
        sentiment_data = []
        for article in articles:
            title = article.get("title")
            published_at = article.get("publishedAt")
            if not title or not published_at:
                # Entries without a title or timestamp cannot be scored/dated.
                continue
            sentiment = sentiment_analyzer(title)[0]
            sentiment_score = sentiment["score"] if sentiment["label"] == "POSITIVE" else -sentiment["score"]
            sentiment_data.append({"Date": published_at[:10], "Sentiment": sentiment_score})

        if not sentiment_data:
            print(f"No articles found for {company_name}.")
            return pd.DataFrame()

        # Aggregate by date
        sentiment_df = pd.DataFrame(sentiment_data)
        daily_sentiment = sentiment_df.groupby("Date", as_index=False)["Sentiment"].mean()
        daily_sentiment["Ticker"] = ticker

        return daily_sentiment

    except requests.exceptions.SSLError as e:
        print(f"SSL Error occurred: {e}")
        return pd.DataFrame()

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return pd.DataFrame()

    except ValueError as e:
        # Raised when the response body is not valid JSON.
        print(f"Request failed: {e}")
        return pd.DataFrame()

In [None]:
def analyze_sentiment(news_df):
    """Score every headline and store the result in a 'Sentiment' column.

    Uses the notebook's shared ``sentiment_analyzer`` pipeline with the same
    sign convention as ``fetch_news_sentiment``: the classifier score is kept
    for "POSITIVE" labels and negated otherwise. Missing or blank headlines
    are scored 0.0.

    Args:
        news_df: Frame with a 'Headline' column. It is modified in place.

    Returns:
        The same ``news_df`` object with the 'Sentiment' column added.
    """
    def _signed_score(headline):
        # Nothing to classify for NaN / non-text / blank entries.
        if not isinstance(headline, str) or not headline.strip():
            return 0.0
        result = sentiment_analyzer(headline)[0]
        return result["score"] if result["label"] == "POSITIVE" else -result["score"]

    news_df['Sentiment'] = news_df['Headline'].apply(_signed_score)
    return news_df

In [None]:
def update_data():
    """Re-run the historical processing pipeline for every symbol in midcap_stocks.

    The company name comes from ``ticker_to_company_name``. Symbols without an
    entry there fall back to a name derived from the ticker itself (exchange
    suffix removed, underscores turned into spaces).
    """
    for ticker in midcap_stocks:
        derived_name = ' '.join(ticker.split('.')[0].split('_'))
        company_name = ticker_to_company_name.get(ticker, derived_name)
        process_stock_data(ticker, company_name)

In [None]:
def process_stock_data(ticker, company_name, start="2014-01-01", end="2024-01-01",
                       interval="1d", output_dir="data/processed"):
    """Download, enrich and save historical data for one ticker.

    Steps: download with ``yf.download``, fill gaps with ``clean_data``, add
    indicator columns with ``add_technical_indicators``, add lagged copies of
    the key indicators, drop incomplete rows and write the result to
    ``<output_dir>/<ticker>_final.csv``.

    Args:
        ticker: Symbol to download.
        company_name: Accepted for interface compatibility; not used here.
        start: First date of the download window.
        end: End date of the download window.
        interval: Bar size. Defaults to daily bars because intraday intervals
            are only served for a short recent window, so a multi-year request
            at "1m" comes back empty.
        output_dir: Directory the CSV is written to; created if missing.

    Returns:
        None. The stock is skipped (with a message) when the download fails,
        the indicator columns could not be computed, or no complete rows remain.
    """
    print(f"Processing data for {ticker}...")

    # Attempt to download historical data
    try:
        data = yf.download(ticker, start=start, end=end, interval=interval)
    except Exception as e:
        print(f"Failed to download data for {ticker}: {e}")
        return  # Skip processing this stock

    if data is None or data.empty:
        print(f"Data download failed for {ticker}. Moving to the next stock.")
        return  # Skip processing this stock

    # Some yfinance versions return two-level (field, ticker) columns even for
    # a single symbol; keep only the field level so data['Close'] is a Series.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    # Proceed with data processing if download was successful
    data = clean_data(data)
    data = add_technical_indicators(data)

    # add_technical_indicators returns its input untouched when there are too
    # few rows, in which case the indicator columns below do not exist.
    important_indicators = ['Close', 'RSI', 'MACD', 'Bollinger_Mid']
    missing = [name for name in important_indicators if name not in data.columns]
    if missing:
        print(f"Skipping {ticker}: indicator columns missing after processing: {missing}")
        return

    # Create lagged features, then drop rows made incomplete by indicator
    # warm-up periods and by lagging.
    data = create_lagged_features(data, important_indicators)
    data = data.dropna()
    if data.empty:
        print(f"Skipping {ticker}: no complete rows left after dropping NaN values.")
        return

    # Save the processed data
    # NOTE(review): index=False discards the date index — confirm downstream
    # readers do not need the timestamps.
    os.makedirs(output_dir, exist_ok=True)
    data.to_csv(os.path.join(output_dir, f'{ticker}_final.csv'), index=False)

    print(f"Finished processing data for {ticker}.")


In [None]:
def process_and_save_intraday_data(ticker, output_dir):
    """Download intraday bars for ``ticker``, enrich them and write a CSV.

    Nothing is written when ``fetch_intraday_data`` returns None. Otherwise the
    bars go through ``add_technical_indicators``, rows containing NaN are
    removed, the index is replaced by a 0..n-1 counter, and the result is saved
    as ``<output_dir>/<ticker>_intraday_processed.csv`` with that counter as
    the first column.
    """
    raw_bars = fetch_intraday_data(ticker)
    if raw_bars is None:
        return

    enriched = add_technical_indicators(raw_bars)
    enriched = enriched.dropna().reset_index(drop=True)

    # Make sure the destination folder is there before writing.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    save_path = os.path.join(output_dir, f"{ticker}_intraday_processed.csv")
    enriched.to_csv(save_path, index=True)
    print(f"Processed intraday data for {ticker} saved to {save_path}")


In [None]:
def process_all_intraday_data(tickers, output_dir="data/processed_intra_day"):
    """Run the intraday fetch/process/save step once per ticker, in order.

    Each symbol in ``tickers`` is handed to ``process_and_save_intraday_data``
    together with ``output_dir``.
    """
    for symbol in tickers:
        process_and_save_intraday_data(symbol, output_dir)


In [None]:
# Run the historical pipeline once for every (ticker, company name) pair in
# ticker_to_company_name. Symbols without an entry in that mapping are not
# processed by this cell.
for ticker, company_name in ticker_to_company_name.items():
    process_stock_data(ticker, company_name)

print("Data processing complete!")


In [None]:
# Fetch, process and save intraday data for every symbol in midcap_stocks,
# using the function's default output directory "data/processed_intra_day".
process_all_intraday_data(midcap_stocks)

In [None]:
# Register the daily refresh at 18:00. Clearing the tag first keeps this cell
# idempotent: re-running it replaces the job instead of registering
# update_data a second time.
schedule.clear("daily-update")
schedule.every().day.at("18:00").do(update_data).tag("daily-update")

In [None]:
# Keep the scheduler alive, checking once per second for due jobs. This cell
# blocks the kernel by design; interrupting the kernel ends the loop with a
# short message instead of a traceback.
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    print("Scheduler stopped.")