In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the engineered stock table that every later cell reads and extends.
# NOTE(review): this is an absolute, machine-specific path (note the space after
# "CMPT"); it has to be edited before the notebook can run on another machine.
csv_path = "/Users/saamsani/Desktop/CMPT /stock_prediction_project/data/engineered_stock_data.csv"
df = pd.read_csv(csv_path)




In [None]:
# Strategy menu. The letter the user picks decides which label column
# (Target_A / Target_B / Target_C) the rest of the notebook trains on.
menu_lines = (
    "What kind of stocks would you like to see today?",
    "Please choose a strategy:",
    "\nA → Safe short-term picks (short-term trading)",
    "    → These are stocks the model believes will go up tomorrow.",
    "    → Great for day trades or quick flips.\n",
    "B → Breakout stocks (medium risk, high reward)",
    "    → These are stocks expected to jump 10%+ in the next 3 days.",
    "    → Ideal for catching explosive momentum moves.\n",
    "C → Swing trades (steadier growth picks)",
    "    → These are stocks that should close higher within 5 days.",
    "    → Good for slightly longer holds and less stress.\n",
)
for menu_line in menu_lines:
    print(menu_line)

# Normalise the answer so " a " and "A" are treated the same.
target_option = input("Enter A, B, or C: ").strip().upper()

# Menu letter -> name of the label column in df.
target_map = dict(A="Target_A", B="Target_B", C="Target_C")

if target_option in target_map:
    selected_target = target_map[target_option]
    print(f"\n Great! We'll predict using: {selected_target}")
else:
    # Anything other than A/B/C stops the cell before a target is chosen.
    raise ValueError(" Invalid option. Please enter A, B, or C.")


In [None]:
# BASIC DAILY MOVEMENT FEATURES
# All of these are row-wise: each value depends only on columns of the same row.

# Intraday trading range (high minus low)
df['Range'] = df['High'].sub(df['Low'])

# Open-to-close move as a fraction of the open
df['Change'] = df['Close'].sub(df['Open']).div(df['Open'])

# Close relative to its 10-day simple moving average (>1 means above the average)
df['SMA_ratio'] = df['Close'].div(df['SMA_10'])

# 14-day ATR expressed as a fraction of the close
df['Volatility'] = df['ATR_14'].div(df['Close'])

# Log-compressed volume; the +1 keeps a zero-volume row finite
df['LogVolume'] = np.log(df['Volume'].add(1))


# BOLLINGER BANDS (volatility envelope)

# 20-row rolling standard deviation of Close, computed separately per ticker.
# df stacks many tickers (later cells select several 'Ticker' rows per 'Date'),
# so a plain df['Close'].rolling(...) would mix prices of different stocks
# inside one window. Assumes rows are in chronological order within each
# ticker — TODO confirm against how the CSV was written.
rolling_std = df.groupby('Ticker')['Close'].transform(
    lambda close: close.rolling(window=20).std()
)

# Upper and lower bands around the moving average.
# NOTE(review): the bands are centred on SMA_10 while the deviation uses a
# 20-row window; left unchanged on purpose, but confirm that pairing is intended.
df['BB_upper'] = df['SMA_10'] + (2 * rolling_std)
df['BB_lower'] = df['SMA_10'] - (2 * rolling_std)

# Width between bands — wider = more volatile
df['BB_width'] = df['BB_upper'] - df['BB_lower']


# MACD (momentum indicator)

# Short- and long-span exponential averages of Close, computed per ticker so the
# recursive average of one stock is never seeded with another stock's prices
# (df holds many tickers stacked in one frame).
ema_12 = df.groupby('Ticker')['Close'].transform(
    lambda close: close.ewm(span=12, adjust=False).mean()
)
ema_26 = df.groupby('Ticker')['Close'].transform(
    lambda close: close.ewm(span=26, adjust=False).mean()
)

df['MACD_line'] = ema_12 - ema_26  # Main momentum line

# 9-span EMA of the MACD line, again restarted for every ticker
df['MACD_signal'] = df.groupby('Ticker')['MACD_line'].transform(
    lambda line: line.ewm(span=9, adjust=False).mean()
)
df['MACD_hist'] = df['MACD_line'] - df['MACD_signal']  # Line minus its signal



# STOCHASTIC OSCILLATOR (momentum speed)

# Lowest low / highest high over the last 14 rows of the *same ticker*.
# Grouping by 'Ticker' keeps the window from reaching into a different stock's
# rows, which a plain rolling() over the stacked frame would do.
low_14 = df.groupby('Ticker')['Low'].transform(
    lambda low: low.rolling(window=14).min()
)
high_14 = df.groupby('Ticker')['High'].transform(
    lambda high: high.rolling(window=14).max()
)

# Position of the close inside that 14-row range, scaled to 0-100
df['Stoch_%K'] = 100 * ((df['Close'] - low_14) / (high_14 - low_14))

# 3-row mean of %K, also per ticker
df['Stoch_%D'] = df.groupby('Ticker')['Stoch_%K'].transform(
    lambda k: k.rolling(window=3).mean()
)


# OBV (On-Balance Volume — volume trend)

# Day-over-day change in Close within each ticker. The first row of every ticker
# has no previous close, so its change is NaN and it contributes 0, which makes
# each ticker's OBV start at 0 instead of continuing the previous stock's total.
close_change = df.groupby('Ticker')['Close'].diff()

# +1 when the close rose, -1 when it fell, 0 when unchanged or unknown
direction = np.sign(close_change).fillna(0)

# Add volume on up days, subtract it on down days, then accumulate per ticker.
# This replaces a Python loop over every row with vectorised operations.
signed_volume = direction * df['Volume']
df['OBV'] = signed_volume.groupby(df['Ticker']).cumsum()


# CMF (Chaikin Money Flow — price + volume pressure)

# Where the close sits inside the day's range, from -1 (at the low) to +1 (at
# the high). The 1e-6 keeps the division defined when High == Low.
mf_multiplier = ((df['Close'] - df['Low']) - (df['High'] - df['Close'])) / (df['High'] - df['Low'] + 1e-6)
mf_volume = mf_multiplier * df['Volume']

# 20-row sums taken per ticker so the window never spans two different stocks
# (df holds many tickers stacked in one frame).
mf_volume_20 = mf_volume.groupby(df['Ticker']).transform(
    lambda flow: flow.rolling(window=20).sum()
)
volume_20 = df.groupby('Ticker')['Volume'].transform(
    lambda volume: volume.rolling(window=20).sum()
)
df['CMF'] = mf_volume_20 / volume_20


# ROC (Rate of Change — speed of price movement)

# Percentage change of Close versus 10 rows earlier *within the same ticker*.
# Without the groupby, the first 10 rows of each ticker would be compared with
# the closing prices of whichever stock happens to sit above it in the frame.
df['ROC'] = df.groupby('Ticker')['Close'].pct_change(periods=10) * 100


# Model inputs: indicator columns read from the CSV plus the ones derived above.
feature_cols = (
    ['SMA_10', 'EMA_10', 'RSI_14', 'ATR_14']
    + ['Close', 'Volume', 'Range', 'Change']
    + ['SMA_ratio', 'Volatility', 'LogVolume']
    + ['BB_upper', 'BB_lower', 'BB_width']
    + ['MACD_line', 'MACD_signal', 'MACD_hist']
    + ['Stoch_%K', 'Stoch_%D']
    + ['OBV', 'CMF', 'ROC']
)

# Drop, in place, every row that is missing a feature or the chosen label.
# NOTE(review): because df itself is modified, rows without a label are also
# gone for the later "latest day" cell — confirm that is intended.
required_cols = [*feature_cols, selected_target]
df.dropna(subset=required_cols, inplace=True)

X = df.loc[:, feature_cols]  # input features
y = df[selected_target]      # label column picked by the user

# Shuffled 80/20 split with a fixed seed so the partition is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

print(f" Training samples: {len(X_train)}")
print(f" Testing samples: {len(X_test)}")


In [None]:
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then learn the scaling from the training rows
# only. Fitting the scaler on all of X (as this cell used to do) lets the mean
# and variance of the test rows leak into training. With the same data, seed
# and test_size the split selects the same rows as before; only the scaling
# statistics change.
# NOTE(review): shuffle=True mixes dates across train and test; for time-ordered
# price data a chronological split may be more appropriate — confirm intent.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit + transform on training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Full feature matrix under the same (train-fitted) scaling; kept so the name
# X_scaled that this cell has always defined remains available.
X_scaled = scaler.transform(X)

print(" Feature scaling applied. Data re-split for training/testing.")


In [None]:
from sklearn.utils import class_weight

# Balanced class weights: one weight per distinct label in y_train, stored in a
# dict keyed by the label's position in the sorted unique-label array.
class_labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)
class_weights = dict(enumerate(balanced_weights))

# Network: input -> Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid)
n_features = X_train.shape[1]
network_layers = [tf.keras.layers.Input(shape=(n_features,))]
network_layers += [
    tf.keras.layers.Dense(units, activation='relu') for units in (64, 32)
]
network_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))
model = tf.keras.Sequential(network_layers)

# Binary cross-entropy with Adam; accuracy is reported during training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit for 15 epochs; the held-out split doubles as the validation set and the
# class weights are applied to the loss.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32,
    class_weight=class_weights,
    verbose=1,
)


In [None]:
# Score the held-out rows and turn the sigmoid outputs into hard 0/1 labels
# (a row is labelled 1 only when its probability is strictly above 0.5).
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs.flatten() > 0.5).astype(int)

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1, printed with four decimals
print(" Classification Report:")
print(classification_report(y_test, y_pred, digits=4))


In [None]:
# Ask user for their trading strategy preference
print("What kind of stocks would you like to see today?")
print("Please choose a strategy:")

print("\nA → Safe short-term picks (short-term trading)")
print("    → These are stocks the model believes will go up tomorrow.")
print("    → Great for day trades or quick flips.\n")

print("B → Breakout stocks (medium risk, high reward)")
print("    → These are stocks expected to jump 10%+ in the next 3 days.")
print("    → Ideal for catching explosive momentum moves.\n")

print("C → Swing trades (steadier growth picks)")
print("    → These are stocks that should close higher within 5 days.")
print("    → Good for slightly longer holds and less stress.\n")

# Get user input and map to the right target
target_option = input("Enter A, B, or C: ").strip().upper()

target_map = {
    "A": "Target_A",
    "B": "Target_B",
    "C": "Target_C"
}

if target_option not in target_map:
    raise ValueError(" Invalid option. Please enter A, B, or C.")

selected_target = target_map[target_option]
print(f"\n Great! Running predictions for: {selected_target}")


In [None]:
# Rows belonging to the most recent date still present in df.
# NOTE(review): 'Date' comes straight from read_csv, so max() compares the raw
# values; that matches calendar order only if the column is ISO-formatted text
# or already a datetime — confirm.
latest_date = df['Date'].max()
latest_df = df[df['Date'] == latest_date].copy()

# Same feature columns and the same fitted scaler as in training
X_latest = latest_df[feature_cols]
X_latest_scaled = scaler.transform(X_latest)

# Model probability per row, and the 0/1 call at a 0.5 cut-off
probs = model.predict(X_latest_scaled).flatten()
preds = (probs >= 0.5).astype(int)

# Attach predictions back to the DataFrame
latest_df['Probability'] = probs
latest_df['Predicted'] = preds

# Keep only the rows predicted as 1. A single .loc selection plus .copy() gives
# an independent frame, so adding columns to `winners` later neither raises
# SettingWithCopyWarning nor depends on view/copy semantics.
winners = latest_df.loc[latest_df['Predicted'] == 1, ['Ticker', 'Probability']].copy()


In [None]:
import yfinance as yf

# Define a function to fetch live stock prices
def fetch_live_price(ticker):
    """Return the live market price for ``ticker``, or ``None`` if unavailable.

    Reads ``regularMarketPrice`` from the ``info`` mapping of
    ``yf.Ticker(ticker)``.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.

    Returns:
        The value stored under ``'regularMarketPrice'``, or ``None`` when the
        lookup fails for any ordinary reason (missing key, failed request, ...),
        so that one bad symbol does not abort a whole batch.

    Only ``Exception`` subclasses are suppressed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the user can still stop a long-running cell.
    """
    try:
        return yf.Ticker(ticker).info['regularMarketPrice']
    except Exception:
        return None

# Rank by confidence and keep the top 20 *before* looking up prices, so only the
# rows that are actually shown trigger a network request. The result is the
# same table as fetching for every winner and truncating afterwards.
winners = winners.sort_values(by='Probability', ascending=False).head(20).reset_index(drop=True)

# Live price per remaining ticker (None / NaN when the lookup failed)
winners['LivePrice'] = winners['Ticker'].apply(fetch_live_price)

# Show the final results
print(f"\n Top 20 stocks the model predicts will go UP based on: {selected_target}")
print("(Confidence shows how sure the model is)\n")

for i, row in winners.iterrows():
    live_price = row['LivePrice']
    # A failed lookup leaves None/NaN here; formatting None with ':.2f' would
    # raise TypeError, so show a placeholder instead.
    price_text = f"${live_price:.2f}" if pd.notna(live_price) else "N/A"
    print(f"{i+1:>2}. {row['Ticker']:>6} — Confidence: {row['Probability']:.2%} — Live Price: {price_text}")


In [None]:
import os

# Write the ranked picks to <save_path>/<target>_top_20_tickers.csv.
# NOTE(review): absolute, machine-specific directory — adjust before running
# on another machine.
save_path = "/Users/saamsani/Desktop/CMPT /stock_prediction_project/data"
output_file = os.path.join(save_path, f"{selected_target.lower()}_top_20_tickers.csv")

# Create the directory if it is missing, then write the table without the index.
os.makedirs(save_path, exist_ok=True)
winners.to_csv(output_file, index=False)

print(f"\n Results saved to: {output_file}")
