In [1]:
import pandas as pd

def calculate_high_vix_by_ticker(df):
    """
    Calculate the 'high_vix' feature for each ticker.
    VIX is compared to its 20-day rolling mean for each ticker.
    Assumes 'vix', 'tic', and 'date' columns exist in the DataFrame.
    """
    # Sort the data by ticker and date (ensure proper chronological order within each ticker)
    df = df.sort_values(by=['tic', 'date'])

    # Function to calculate high_vix for each group (per ticker)
    def calc_high_vix_for_ticker(group):
        # Calculate the 20-day rolling mean of VIX for each ticker
        group['vix_rolling_mean_20'] = group['vix'].rolling(window=20).mean()

        # Create 'high_vix' (1 if current VIX > 98% of rolling mean, else 0)
        group['high_vix'] = (group['vix'] > 0.98 * group['vix_rolling_mean_20']).astype(int)

        return group

    # Apply the function for each ticker (grouped by 'tic')
    df = df.groupby('tic').apply(calc_high_vix_for_ticker)

    # Optional: Remove rows where rolling mean is NaN (first 19 rows for each ticker)
    df = df.dropna(subset=['vix_rolling_mean_20'])

    return df

In [14]:
import pandas as pd
from stable_baselines3.common.logger import configure
import numpy as np
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

# Create necessary directories
check_and_make_directories([TRAINED_MODEL_DIR])

# Load data
train = pd.read_csv('/home/group3/train.csv')

# Filter for one stock (AAPL)
train = train[train['tic'] == 'AAPL']

# Debug: Check indicators and dimensions
print("Indicators used:", INDICATORS)
print("Number of indicators:", len(INDICATORS))

stock_dimension = 1  # only AAPL
state_space = 1 + 2 * stock_dimension + len(INDICATORS) * stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space (calculated): {state_space}")

buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,  # required in newer FinRL versions
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

# Create training environment
train = train.reset_index(drop=True)
e_train_gym = StockTradingEnv(df=train, user_defined_feature=INDICATORS, **env_kwargs)

# Optional: validate observation shape
obs,_ = e_train_gym.reset()
obs = np.array(obs)
print("âœ… Observation shape from environment:", obs.shape)

# Convert to Stable-Baselines compatible env
env_train, _ = e_train_gym.get_sb_env()
print("âœ… Stable-Baselines environment type:", type(env_train))

# Initialize DRL Agent
agent = DRLAgent(env=env_train)

# Choose algorithms to train
if_using_a2c = True
if_using_ddpg = True
if_using_ppo = True
if_using_td3 = True
if_using_sac = True

# Train A2C
if if_using_a2c:
    model_a2c = agent.get_model("a2c")

    # Set up logger
    tmp_path = RESULTS_DIR + '/a2c'
    new_logger_a2c = configure(tmp_path, ["stdout", "csv", "tensorboard"])
    model_a2c.set_logger(new_logger_a2c)

    trained_a2c = agent.train_model(
        model=model_a2c,
        tb_log_name='a2c',
        total_timesteps=50000
    )

    trained_a2c.save(TRAINED_MODEL_DIR + "/agent_a2c")
    print("âœ… A2C model training complete and saved.")


Indicators used: ['macd', 'boll_ub', 'boll_lb', 'rsi_30', 'cci_30', 'dx_30', 'close_30_sma', 'close_60_sma']
Number of indicators: 8
Stock Dimension: 1, State Space (calculated): 11
state: [1000000, np.float64(2.7309935092926025), 0, np.float64(0.0), np.float64(2.951623720329186), np.float64(2.6256222303696664), np.float64(100.0), np.float64(66.66666666666667), np.float64(100.0), np.float64(2.7309935092926025), np.float64(2.7309935092926025), np.float64(0.0), np.float64(2.951623720329186), np.float64(2.6256222303696664), np.float64(100.0), np.float64(66.66666666666667), np.float64(100.0), np.float64(2.7309935092926025), np.float64(2.7309935092926025)]
np.array(state).shape: (19,)
âœ… Observation shape from environment: (19,)
state: [1000000, np.float64(2.7309935092926025), 0, np.float64(0.0), np.float64(2.951623720329186), np.float64(2.6256222303696664), np.float64(100.0), np.float64(66.66666666666667), np.float64(100.0), np.float64(2.7309935092926025), np.float64(2.7309935092926025), 

ValueError: could not broadcast input array from shape (19,) into shape (11,)

In [2]:
from newsapi import NewsApiClient
import nltk
nltk.download('vader_lexicon')

# Replace with your actual API key
newsapi = NewsApiClient(api_key='YOUR_NEWSAPI_KEY')
def fetch_market_news(query="stock market", language="en", page_size=5):
    articles = newsapi.get_everything(
        q=query,
        language=language,
        sort_by='publishedAt',
        page_size=page_size
    )
    headlines = [article['title'] + ". " + article['description'] for article in articles['articles']]
    return headlines
from transformers import pipeline
nltk.download('vader_lexicon')
newsapi = NewsApiClient(api_key='db0eb7201950435b993447c710308800')

# Load zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def detect_market_regime(news_summary):
    labels = ["bullish", "bearish", "volatile", "neutral"]
    result = classifier(news_summary, candidate_labels=labels)
    return result["labels"][0]  # Highest score
headlines = fetch_market_news()

for i, news in enumerate(headlines):
    regime = detect_market_regime(news)
    print(f"\nðŸ“° News {i+1}: {news}")
    print(f"ðŸ“ˆ Detected Regime: {regime}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/group3/nltk_data...
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/group3/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0



ðŸ“° News 1: Share market update: Most active stocks of the day in terms of traded value. The NSE Nifty index closed 140.5 points  down  at 24971.9
ðŸ“ˆ Detected Regime: volatile

ðŸ“° News 2: Unilever to acquire men's personal care brand Dr Squatch. Unilever has been divesting and selectively acquiring personal care brands in recent years.
ðŸ“ˆ Detected Regime: volatile

ðŸ“° News 3: ORG Partners LLC Has $69,000 Stock Holdings in Howmet Aerospace Inc. (NYSE:HWM). ORG Partners LLC lifted its position in shares of Howmet Aerospace Inc. (NYSE:HWM â€“ Free Report) by 31.5% during the 1st quarter, Holdings Channel reports. The firm owned 526 shares of the companyâ€™s stock after purchasing an additional 126 shares during the qâ€¦
ðŸ“ˆ Detected Regime: bullish

ðŸ“° News 4: Stonegate Investment Group LLC Acquires Shares of 684 Synopsys, Inc. (NASDAQ:SNPS). Stonegate Investment Group LLC acquired a new stake in Synopsys, Inc. (NASDAQ:SNPS â€“ Free Report) during the first quarter, Holdings 

In [16]:

#Train Model
import pandas as pd
from stable_baselines3.common.logger import configure
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

check_and_make_directories([TRAINED_MODEL_DIR])
train = pd.read_csv('/home/group3/train.csv')
# If you are not using the data generated from part 1 of this tutorial, make sure 
# it has the columns and index in the form that could be make into the environment. 
# Then you can comment and skip the following two lines.
train = train[train['tic'] == 'AAPL']

stock_dimension = 1
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")


buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}
train = train.reset_index(drop=True)
e_train_gym = StockTradingEnv(df = train,user_defined_feature=INDICATORS,**env_kwargs)

env_train, _ = e_train_gym.get_sb_env()
print(type(env_train))

agent = DRLAgent(env = env_train)

# Set the corresponding values to 'True' for the algorithms that you want to use
if_using_a2c = True
if_using_ddpg = True
if_using_ppo = True
if_using_td3 = True
if_using_sac = True
agent = DRLAgent(env = env_train)
PPO_PARAMS = {
    "n_steps": 2048,
    "ent_coef": 0.01,
    "learning_rate": 0.00025,
    "batch_size": 128,
}

model_ppo = agent.get_model("ppo",model_kwargs = PPO_PARAMS)

if if_using_ppo:
  # set up logger
  tmp_path = RESULTS_DIR + '/ppo'
  new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_ppo.set_logger(new_logger_ppo)

trained_ppo = agent.train_model(model=model_ppo, 
                             tb_log_name='ppo',
                             total_timesteps=200000) if if_using_ppo else None

df_account_value_ppo, df_actions_ppo = DRLAgent.DRL_prediction(
    model=trained_ppo, 
    environment = e_trade_gym) if if_using_ppo else (None, None)

def process_df_for_mvo(df):
  return df.pivot(index="date", columns="tic", values="close")

!https://www.kaggle.com/code/vijipai/lesson-5-mean-variance-optimization-of-portfolios/notebook

def StockReturnsComputing(StockPrice, Rows, Columns): 
  import numpy as np 
  StockReturn = np.zeros([Rows-1, Columns]) 
  for j in range(Columns):        # j: Assets 
    for i in range(Rows-1):     # i: Daily Prices 
      StockReturn[i,j]=((StockPrice[i+1, j]-StockPrice[i,j])/StockPrice[i,j])* 100 
      
  return StockReturn

StockData = process_df_for_mvo(train)
TradeData = process_df_for_mvo(trade)

TradeData.to_numpy()

#compute asset returns
arStockPrices = np.asarray(StockData)
[Rows, Cols]=arStockPrices.shape
arReturns = StockReturnsComputing(arStockPrices, Rows, Cols)

#compute mean returns and variance covariance matrix of returns
meanReturns = np.mean(arReturns, axis = 0)
covReturns = np.cov(arReturns, rowvar=False)
 
#set precision for printing results
np.set_printoptions(precision=3, suppress = True)

#display mean returns and variance-covariance matrix of returns
print('Mean returns of assets in k-portfolio 1\n', meanReturns)
print('Variance-Covariance matrix of returns\n', covReturns)

Stock Dimension: 1, State Space: 11
state: [1000000, np.float64(2.7309935092926025), 0, np.float64(0.0), np.float64(2.951623720329186), np.float64(2.6256222303696664), np.float64(100.0), np.float64(66.66666666666667), np.float64(100.0), np.float64(2.7309935092926025), np.float64(2.7309935092926025), np.float64(0.0), np.float64(2.951623720329186), np.float64(2.6256222303696664), np.float64(100.0), np.float64(66.66666666666667), np.float64(100.0), np.float64(2.7309935092926025), np.float64(2.7309935092926025)]
np.array(state).shape: (19,)


ValueError: could not broadcast input array from shape (19,) into shape (11,)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from stable_baselines3 import A2C, DDPG, PPO, SAC, TD3
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
%matplotlib inline
train = pd.read_csv('train_data.csv')
trade = pd.read_csv('trade_data.csv')
train = train.set_index(train.columns[0])
train.index.names = ['']
trade = trade.set_index(trade.columns[0])
trade.index.names = ['']
if_using_a2c = True
if_using_ddpg = True
if_using_ppo = True
if_using_td3 = True
if_using_sac = True


trained_ppo = PPO.load(TRAINED_MODEL_DIR + "/agent_ppo") if if_using_ppo else None



stock_dimension = len(trade.tic.unique())
state_space = 1 + 2 * stock_dimension + len(INDICATORS) * stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")
buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension
env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4}
e_trade_gym = StockTradingEnv(df = trade, turbulence_threshold = 70,risk_indicator_col='vix', **env_kwargs)
# env_trade, obs_trade = e_trade_gym.get_sb_env()




df_account_value_ppo, df_actions_ppo = DRLAgent.DRL_prediction(
    model=trained_ppo, 
    environment = e_trade_gym) if if_using_ppo else (None, None)
def process_df_for_mvo(df):
  return df.pivot(index="date", columns="tic", values="close")




StockData = process_df_for_mvo(train)
TradeData = process_df_for_mvo(trade)
TradeData.to_numpy()
from pypfopt.efficient_frontier import EfficientFrontier
ef_mean = EfficientFrontier(meanReturns, covReturns, weight_bounds=(0, 0.5))
raw_weights_mean = ef_mean.max_sharpe()
cleaned_weights_mean = ef_mean.clean_weights()
# Wenn die Reihenfolge wichtig ist
asset_keys = list(cleaned_weights_mean.keys())
mvo_weights = np.array([1000000 * cleaned_weights_mean[asset] for asset in asset_keys])
LastPrice = np.array([1/p for p in StockData.tail(1).to_numpy()[0]])
Initial_Portfolio = np.multiply(mvo_weights, LastPrice)
Initial_Portfolio
Portfolio_Assets = TradeData @ Initial_Portfolio
MVO_result = pd.DataFrame(Portfolio_Assets, columns=["Mean Var"])




df_result_ppo = (
    df_account_value_ppo.set_index(df_account_value_ppo.columns[0])
    if if_using_ppo
    else None)

result = pd.DataFrame(
    {
        "ppo": df_result_ppo["account_value"] if if_using_ppo else None,
    })
print(result)
plt.rcParams["figure.figsize"] = (15,5)
plt.figure()
result.plot()

In [None]:
import pandas as pd

# Load your CSV file
df = pd.read_csv('apple_news_30_days_daily.csv')

# Drop the 'url' column
df = df.drop(columns=['url'])

# Save the result back to CSV (overwrite or new file)
df.to_csv('your_file_no_url.csv', index=False)

print("Removed 'url' column and saved the new file.")


In [None]:
import pandas as pd
from transformers import pipeline

# Load your CSV
df = pd.read_csv('your_file_no_url.csv')

# Initialize sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Function to convert model output to a sentiment score
def get_sentiment_score(text):
    result = sentiment_pipeline(text)[0]
    label = result['label']
    score = result['score']
    # Convert to numeric score: positive = +score, negative = -score
    sentiment = score if label == 'POSITIVE' else -score
    return round(sentiment, 5)

# Apply to your title column
df['sentiment_score'] = df['title'].astype(str).apply(get_sentiment_score)

# Save results
df.to_csv('your_file_with_sentiment.csv', index=False)

print(df[['date','title', 'sentiment_score']].head())

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

API_KEY = '3037f06ec44841cab9703c8de7b5ac0d'

end_date = datetime.today()
start_date = end_date - timedelta(days=30)



all_records = []

# Loop through each day
current_date = start_date
while current_date < end_date:
    next_date = current_date + timedelta(days=1)

    url = (
        'https://newsapi.org/v2/everything?'
        f'q=Apple&'
        f'from={current_date.strftime("%Y-%m-%d")}&'
        f'to={next_date.strftime("%Y-%m-%d")}&'
        'sortBy=publishedAt&'
        'language=en&'
        'pageSize=100&'
        f'apiKey={API_KEY}'
    )

    response = requests.get(url)
    data = response.json()

    if data['status'] == 'ok':
        articles = data['articles']
        for article in articles:
            all_records.append({
                'date': article['publishedAt'],
                'title': article['title'],
                'url': article['url']
            })
        print(f"Fetched {len(articles)} articles for {current_date.strftime('%Y-%m-%d')}")
    else:
        print(f"Error fetching data for {current_date.strftime('%Y-%m-%d')}: {data.get('message')}")

    current_date = next_date

    # Be polite to API server, avoid rate limits
    time.sleep(1)  

# Save all data to CSV
df = pd.DataFrame(all_records)
df.to_csv('apple_news_30_days_daily.csv', index=False)
print(f"Saved total {len(df)} articles to apple_news_30_days_daily.csv")

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

API_KEY = '3037f06ec44841cab9703c8de7b5ac0d'

end_date = datetime.today().replace(minute=0, second=0, microsecond=0)
start_date = end_date - timedelta(days=30)

all_records = []

current_hour = start_date
while current_hour < end_date:
    next_hour = current_hour + timedelta(hours=1)
    
    url = (
        'https://newsapi.org/v2/everything?'
        f'q=Apple&'
        f'from={current_hour.isoformat()}&'
        f'to={next_hour.isoformat()}&'
        'sortBy=publishedAt&'
        'language=en&'
        'pageSize=100&'
        f'apiKey={API_KEY}'
    )

    response = requests.get(url)
    data = response.json()

    if data['status'] == 'ok':
        articles = data['articles'][:1]  # Keep only first 4 articles per hour
        for article in articles:
            all_records.append({
                'date': article['publishedAt'],
                'title': article['title'],
                'url': article['url']
            })
        print(f"Fetched {len(articles)} articles for hour starting {current_hour}")
    else:
        print(f"Error fetching data for hour {current_hour}: {data.get('message')}")
    
    current_hour = next_hour
    time.sleep(1)  # avoid rate limits

df = pd.DataFrame(all_records)
df.to_csv('apple_news_4per_hour.csv', index=False)
print(f"Saved total {len(df)} articles to apple_news_4per_hour.csv")


In [None]:
import pandas as pd

# Load original data
df = pd.read_csv("train_data.csv")

# Filter only Apple (AAPL) data
df_apple = df[df['tic'] == 'AAPL'].copy()

# Reset index (optional but recommended)
df_apple.reset_index(drop=True, inplace=True)

# Save to new CSV file
df_apple.to_csv("apple_stock_data.csv", index=False)


In [None]:
import pandas as pd

# Load data
df = pd.read_csv('apple_stock_data.csv')

# Feature engineering
df['daily_return'] = df['close'].pct_change()
df.to_csv('apple_stock_data.csv', index=False)


In [None]:
df['volatility'] = df['daily_return'].rolling(window=10).std()
df.to_csv('apple_stock_data.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load the data
df = pd.read_csv('apple_stock_data.csv')  # Replace with your actual file name

# Step 2: Define columns to scale
scaled_columns = [
    'close', 'high', 'low', 'open', 'volume',
    'macd', 'rsi_30', 'cci_30', 'dx_30',
    'close_30_sma', 'close_60_sma', 'vix', 'turbulence'
]

# Step 3: Initialize the scaler and fit_transform
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[scaled_columns])

# Step 4: Add new columns to df with '_scaled' suffix
for i, col in enumerate(scaled_columns):
    df[f'{col}_scaled'] = scaled_values[:, i]

# Step 5: Save the updated dataframe to a new CSV file
df.to_csv('apple_stock_scaled.csv', index=False)