<a href="https://colab.research.google.com/github/PARTSI14/LAB-STELIOS/blob/main/%CE%91%CE%BD%CF%84%CE%AF%CE%B3%CF%81%CE%B1%CF%86%CE%BF_nasdaq_breakout_colab_autoupdate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# -*- coding: utf-8 -*-
# Google Colab Notebook — Nasdaq Breakout Finder (με auto-update βιβλιοθηκών)

# Ενημέρωση / εγκατάσταση βιβλιοθηκών
!pip install --quiet yfinance ta lightgbm pandas numpy tqdm
!pip install --quiet scikit-learn==1.2.2 scipy==1.10.1

import io
import os
import time
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import yfinance as yf
from tqdm.notebook import tqdm
import ta
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# Παράμετροι
DAYS_HISTORY = 365 * 2
LOOKAHEAD_DAYS = 5
DONCHIAN_WINDOW = 20
BOLL_WINDOW = 20
RSI_WINDOW = 14
MIN_AVG_VOLUME = 10000
TOP_N = 100

# Λήψη λίστας NASDAQ tickers
nasdaq_list_url = 'https://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt'
txt = pd.read_csv(nasdaq_list_url, sep='|', dtype=str)
candidate_tickers = txt[txt['ETF'] != 'Y']['Symbol'].values.tolist()
MAX_TICKERS = 600
candidate_tickers = candidate_tickers[:MAX_TICKERS]

# Ιστορικά δεδομένα
end = datetime.utcnow().date()
start = end - timedelta(days=DAYS_HISTORY)

def download_batch(tickers):
    data = {}
    raw = yf.download(
        tickers,
        start=start.isoformat(),
        end=(end + timedelta(days=1)).isoformat(),
        group_by='ticker',
        threads=True,
        auto_adjust=True,
        progress=False
    )
    if isinstance(raw.columns, pd.MultiIndex):
        for t in tickers:
            df = raw[t].copy()
            if df.dropna(how='all').shape[0] > 10:
                df.reset_index(inplace=True)
                data[t] = df
    else:
        df = raw.copy().reset_index()
        data[tickers[0]] = df
    return data

all_data = {}
BATCH = 50
for i in tqdm(range(0, len(candidate_tickers), BATCH)):
    batch = candidate_tickers[i:i+BATCH]
    d = download_batch(batch)
    all_data.update(d)
    time.sleep(1)

# Features & labels
features_list = []
meta_list = []
BREAKOUT_THRESHOLD = 0.00

for ticker, df in all_data.items():
    if 'Date' not in df.columns:
        continue
    df = df.set_index('Date').asfreq('B').ffill().dropna()

    if df.shape[0] < DONCHIAN_WINDOW + LOOKAHEAD_DAYS:
        continue
    if df['Volume'].mean() < MIN_AVG_VOLUME:
        continue

    don_high = df['High'].rolling(window=DONCHIAN_WINDOW).max()
    don_low = df['Low'].rolling(window=DONCHIAN_WINDOW).min()
    df['donchian_upper'] = don_high

    bb_mean = df['Close'].rolling(BOLL_WINDOW).mean()
    bb_std = df['Close'].rolling(BOLL_WINDOW).std()
    df['bb_width'] = (bb_mean + 2 * bb_std) - (bb_mean - 2 * bb_std)

    df['rsi'] = ta.momentum.rsi(df['Close'], window=RSI_WINDOW)
    df['atr'] = ta.volatility.average_true_range(df['High'], df['Low'], df['Close'], window=14)
    df['ma50'] = df['Close'].rolling(window=50).mean()
    df['ma200'] = df['Close'].rolling(window=200).mean()
    df['mom_5'] = df['Close'].pct_change(5)
    df['vol_avg_20'] = df['Volume'].rolling(window=20).mean()
    df['vol_spike'] = df['Volume'] / (df['vol_avg_20'] + 1)

    df['future_max_close'] = df['Close'].shift(-1).rolling(window=LOOKAHEAD_DAYS, min_periods=1).max().shift(-(LOOKAHEAD_DAYS - 1))
    df['label_breakout'] = (df['future_max_close'] > df['donchian_upper'] * (1 + BREAKOUT_THRESHOLD)).astype(int)

    hist = df.dropna()[['Close','donchian_upper','bb_width','rsi','atr','ma50','ma200','mom_5','vol_spike']].copy()
    hist['label'] = df['label_breakout'].astype(int)
    features_list.append(hist.reset_index())

    today = df.index.max()
    row = df.iloc[-1]
    # **ΠΡΟΣΘΗΚΗ ΣΤΟ meta_list (είχε λείψει)**
    meta_list.append({
        'ticker': ticker,
        'date': today,
        'Close': row['Close'],
        'donchian_upper': row['donchian_upper'],
        'bb_width': row['bb_width'],
        'rsi': row['rsi'],
        'atr': row['atr'],
        'ma50': row['ma50'],
        'ma200': row['ma200'],
        'mom_5': row['mom_5'],
        'vol_spike': row['vol_spike']
    })

feature_cols = ['Close','donchian_upper','bb_width','rsi','atr','ma50','ma200','mom_5','vol_spike']

big_df = pd.concat(features_list, ignore_index=True).dropna()
X = big_df[feature_cols].astype(float)
y = big_df['label'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=200,
    valid_sets=[valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=0)
    ]
)

preds = model.predict(X_test)
print("ROC AUC:", roc_auc_score(y_test, preds))

meta_df = pd.DataFrame(meta_list).dropna()
meta_X = meta_df[feature_cols]
meta_df['breakout_prob'] = model.predict(meta_X)
meta_df = meta_df.sort_values('breakout_prob', ascending=False)

print(meta_df.head(TOP_N))
meta_df.to_csv('nasdaq_breakout_rankings.csv', index=False)

# --- Προβολή των 3 κορυφαίων για πιθανό breakout ---
top3 = meta_df.head(3)
print("\n🔝 Οι 3 κορυφαίες μετοχές με την υψηλότερη πιθανότητα breakout:")
for i, row in top3.iterrows():
    print(f"{row['ticker']}: Πιθανότητα {row['breakout_prob']:.2%}")

# **ΑΦΑΙΡΩ ΤΗ ΓΡΑΜΜΗ ΕΓΚΑΤΑΣΤΑΣΗΣ lightgbm ΠΟΥ ΕΙΧΕΣ ΣΤΟ ΤΕΛΟΣ**
# !pip install --upgrade lightgbm
# Δεν χρειάζεται να κάνεις ξανά training μετά το upgrade μέσα στο ίδιο script.


  0%|          | 0/12 [00:00<?, ?it/s]

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.851607
ROC AUC: 0.8516074887884832
    ticker       date       Close  donchian_upper    bb_width        rsi  \
151   ALNY 2025-08-08  438.929993      439.359985  190.729602  88.786443   
294   ATMV 2025-08-08   11.912000       11.920000    0.040864  79.583985   
99   AGNCM 2025-08-08   25.129999       25.200001    0.332896  63.642753   
47     ACT 2025-08-08   37.119999       37.299999    2.527528  63.219232   
9     AAPL 2025-08-08  229.350006      231.000000   24.120290  73.900589   
..     ...        ...         ...             ...         ...        ...   
317   AVDX 2025-08-08    9.860000        9.880000    0.046566  60.556921   
415  BKHAU 2025-08-08   10.510000       10.510000    0.000000  39.065427   
242   ARHS 2025-08-08   11.220000       12.980000    3.853574  66.294680   
299   ATPC 2025-08-08    1.365000        1.500000    0.204144  48.908510 