In [2]:
import sys
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime
from yahooquery import Ticker
from transformers import pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from deep_translator import GoogleTranslator, exceptions

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Data Collection and News Scraping ---
def get_news_data(query, num_pages=5):
    headlines, dates = [], []

    def convert_date(date_str):
        month_map = {"Jan":"Jan","Feb":"Feb","Mar":"Mar","Apr":"Apr",
                     "Mei":"May","Jun":"Jun","Jul":"Jul","Agu":"Aug",
                     "Sep":"Sep","Okt":"Oct","Nov":"Nov","Des":"Dec"}
        try:
            parts = date_str.split(',', 1)
            if len(parts) < 2:
                return None
            tokens = parts[1].strip().split()
            if len(tokens) < 3:
                return None
            day, mon, yr = tokens[0], month_map.get(tokens[1], tokens[1]), tokens[2]
            return datetime.strptime(f"{day} {mon} {yr}", "%d %b %Y").date()
        except Exception:
            return None

    def fetch_page(page):
        url = f"https://www.detik.com/search/searchnews?query={query}&page={page}&result_type=latest&siteid=29&fromdatex=01/01/2015&todatex=20/03/2025"
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        page_head, page_dates = [], []
        for div in soup.select("div.media__text"):
            title_el = div.select_one("h3.media__title")
            date_el = div.select_one("div.media__date span[title]")
            if not title_el or not date_el:
                continue
            raw_date = date_el["title"]
            date = convert_date(raw_date)
            if date:
                page_head.append(title_el.get_text(strip=True))
                page_dates.append(date)
        return page_head, page_dates

    for p in range(1, num_pages + 1):
        hp, dp = fetch_page(p)
        headlines.extend(hp)
        dates.extend(dp)
        print(f"Fetched page {p}, total headlines: {len(headlines)}")
        time.sleep(1)

    df = pd.DataFrame({"date": dates, "headline": headlines})
    df.dropna(subset=["date"], inplace=True)
    return df

# --- Sentiment Analysis ---
sentiment_pipe = pipeline("text-classification", model="ProsusAI/finbert")
def get_sentiment(text):
    try:
        res = sentiment_pipe(text)
        label = res[0]["label"].lower()
        score = res[0]["score"]
        return -score if label == "negative" else 0 if label == "neutral" else score
    except Exception:
        return 0

# --- Translation for sentiment analysis ---
translator = GoogleTranslator(source='id', target='en')
def safe_translate(text, retries=3, delay=2):
    if pd.isna(text):
        return text
    for _ in range(retries):
        try:
            t = translator.translate(text)
            print(t)
            return t
        except exceptions.TranslationNotFound:
            time.sleep(delay)
    return text

# --- Technical Indicators ---
def calculate_metrics(df):
    df['21_MA'] = df['adjclose'].rolling(21).mean()
    df['Volatility'] = df['adjclose'].pct_change().rolling(21).std()
    rm = df['adjclose'].rolling(21).mean()
    rs = df['adjclose'].rolling(21).std()
    df['BB_Upper'] = rm + 2 * rs
    df['BB_Lower'] = rm - 2 * rs
    df['BB_Top_Hit'] = (df['adjclose'] >= df['BB_Upper']).astype(int)
    df['BB_Bottom_Hit'] = (df['adjclose'] <= df['BB_Lower']).astype(int)
    df['Percent_Change'] = df['adjclose'].pct_change() * 100
    return df.drop(columns=['BB_Upper', 'BB_Lower'])

# --- Model Definition ---
def custom_loss(y_true, y_pred):
    mse1 = tf.reduce_mean(tf.square(y_true[:,0] - y_pred[:,0]))
    mse30 = tf.reduce_mean(tf.square(y_true[:,1] - y_pred[:,1]))
    return mse1 + 5.0 * mse30

def create_model(window_size, num_features):
    model = Sequential([
        LSTM(30, activation='tanh', return_sequences=False,
             input_shape=(window_size, num_features), kernel_regularizer=l2(0.001)),
        Dropout(0.2),
        Dense(10, activation='relu'),
        Dense(2)
    ])
    model.compile(optimizer=Adam(0.01), loss=custom_loss)
    return model

Device set to use mps:0


In [None]:
stock = "BBCA.JK"
news_df = get_news_data("BBCA", num_pages=100)

Fetched page 1, total headlines: 10
Fetched page 2, total headlines: 20
Fetched page 3, total headlines: 30
Fetched page 4, total headlines: 40
Fetched page 5, total headlines: 50
Fetched page 6, total headlines: 60
Fetched page 7, total headlines: 70
Fetched page 8, total headlines: 80
Fetched page 9, total headlines: 90
Fetched page 10, total headlines: 100
Fetched page 11, total headlines: 110
Fetched page 12, total headlines: 120
Fetched page 13, total headlines: 130
Fetched page 14, total headlines: 140
Fetched page 15, total headlines: 150
Fetched page 16, total headlines: 160
Fetched page 17, total headlines: 170
Fetched page 18, total headlines: 180
Fetched page 19, total headlines: 190
Fetched page 20, total headlines: 200
Fetched page 21, total headlines: 210
Fetched page 22, total headlines: 220
Fetched page 23, total headlines: 230
Fetched page 24, total headlines: 240
Fetched page 25, total headlines: 250
Fetched page 26, total headlines: 254
Fetched page 27, total headlin

In [11]:
news_df['headline_eng'] = news_df['headline'].apply(safe_translate)
news_df['sentiment'] = news_df['headline_eng'].apply(get_sentiment)
daily_sent = news_df.groupby('date')['sentiment'].mean().reset_index()
daily_sent.rename(columns={'sentiment':'avg_sentiment'}, inplace=True)

JCI is estimated to be under 7,000, check the recommendations of shares here
Hit by layoffs, this man rose the pempek business with Rp. 10 million/month
IHSG and Asian Exchange are compact in the Red Zone
JCI Monday morning moves in the red zone to the level of 7,242
JCI is predicted to strengthen, see domestic and global sentiment
JCI Tuesday morning opened to 7,099
Compact with the Asian stock exchange, JCI started the week with reinforcement
List of goods and services that are not subject to 12% VAT next year
List of 5 conglomerates in the happiest country on earth
The story of strawberry farmers at the foot of Mount Slamet rap a turnover of Rp. 180 million/month
Batik Air passengers damaged aircraft window coverings can be fined Rp 2.5 billion!
JCI this morning opened red down 0.12%
Cheap car auction country starting at Rp. 49 million, this is the list
MNC shares to Sampoerna were kicked out of LQ45
Not closed, the dawn cake market wants to move locations
Peek at the turnover of th

In [None]:
ticker_obj = Ticker(stock)
stock_data = ticker_obj.history(period='10y', interval='1d')
if stock_data.empty:
    print("No data found for stock")
    sys.exit(1)

stock_data = stock_data.reset_index()
date_col = 'date' if 'date' in stock_data.columns else 'timestamp'

# Datetime conversion
raw_dates = stock_data[date_col]
def _to_naive(x):
    if hasattr(x, 'tzinfo') and x.tzinfo is not None:
        try:
            return x.tz_convert(None)
        except (AttributeError, TypeError):
            return x.replace(tzinfo=None)
    return x
naive_dates = raw_dates.map(_to_naive)
parsed = pd.to_datetime(naive_dates, errors='coerce')
stock_data['date'] = parsed.dt.date
stock_data.dropna(subset=['date'], inplace=True)

# Merge sentiment and prices
merged = pd.merge(stock_data, daily_sent, on='date', how='left')
merged['avg_sentiment'].fillna(0, inplace=True)
merged['Sentiment_MA'] = merged['avg_sentiment'].rolling(21).mean().shift(1)

df = calculate_metrics(merged)
df.dropna(inplace=True)

# Prepare features and targets
feature_cols = ['21_MA', 'volume', 'adjclose', 'Sentiment_MA']
X_all = df[feature_cols].values
prices = df['adjclose'].values

  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["dividends"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["splits"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because 

In [13]:
window_size, out1, out30 = 30, 1, 30
X, y, base_prices = [], [], []
for i in range(window_size, len(df) - out30):
    X.append(X_all[i-window_size:i])
    base = prices[i-1]
    base_prices.append(base)
    y1 = np.log(prices[i+out1] / base)
    y30 = np.log(prices[i+out30] / base)
    y.append([y1, y30])
X, y, base_prices = map(np.array, (X, y, base_prices))

if X.size == 0:
    raise ValueError("Not enough data for sliding windows.")

# Split
n = len(X)
tr = int(n*0.7)
vl = int(n*0.15)
X_train, X_val, X_test = X[:tr], X[tr:tr+vl], X[tr+vl:]
y_train, y_val, y_test = y[:tr], y[tr:tr+vl], y[tr+vl:]
base_test = base_prices[tr+vl:]

# Scale
num_feat = X_train.shape[2]
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train.reshape(-1, num_feat)).reshape(X_train.shape)
X_val_s = scaler.transform(X_val.reshape(-1, num_feat)).reshape(X_val.shape)
X_test_s = scaler.transform(X_test.reshape(-1, num_feat)).reshape(X_test.shape)

target_scaler = MinMaxScaler()
y_train_s = target_scaler.fit_transform(y_train)
y_val_s   = target_scaler.transform(y_val)
y_test_s  = target_scaler.transform(y_test)


In [14]:
model = create_model(window_size, num_feat)
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
rl = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)
model.fit(X_train_s, y_train_s, epochs=50, batch_size=32,
            validation_data=(X_val_s, y_val_s), callbacks=[es, rl], verbose=1)

Epoch 1/50


  super().__init__(**kwargs)


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.2910 - val_loss: 0.1699 - learning_rate: 0.0100
Epoch 2/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1291 - val_loss: 0.1332 - learning_rate: 0.0100
Epoch 3/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1289 - val_loss: 0.0689 - learning_rate: 0.0100
Epoch 4/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1344 - val_loss: 0.1549 - learning_rate: 0.0100
Epoch 5/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1162 - val_loss: 0.2301 - learning_rate: 0.0100
Epoch 6/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1254 - val_loss: 0.1631 - learning_rate: 0.0100
Epoch 7/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1267 - val_loss: 0.1113 - learning_rate: 0.0100
Epoch 8/50
[1m52

<keras.src.callbacks.history.History at 0x1112ba2e0>

In [15]:
y_pred_s = model.predict(X_test_s)
y_pred = target_scaler.inverse_transform(y_pred_s)
y_act  = target_scaler.inverse_transform(y_test_s)
p1d = base_test * np.exp(y_pred[:,0]); a1d = base_test * np.exp(y_act[:,0])
p30 = base_test * np.exp(y_pred[:,1]); a30 = base_test * np.exp(y_act[:,1])

for label, pred, act in [("1-Day", p1d, a1d), ("30-Day", p30, a30)]:
    rmse = np.sqrt(mean_squared_error(act, pred))
    mape = mean_absolute_percentage_error(act, pred)*100
    r2   = r2_score(act, pred)
    print(f"{label} -> RMSE: {rmse:.4f}, MAPE: {mape:.2f}%, R^2: {r2:.4f}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
1-Day -> RMSE: 76.5576, MAPE: 2.20%, R^2: 0.8774
30-Day -> RMSE: 191.4028, MAPE: 5.33%, R^2: 0.4009
