# 🤖 Notebook 3 — Model Training & Comparison

**Project:** Stock Trend Predictor  
**Goal:** Train all 5 models, compare performance, pick the best

### Models compared:
1. Logistic Regression (baseline)
2. SVM — RBF Kernel
3. KNN — K-Nearest Neighbors
4. Random Forest
5. XGBoost ⭐

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

import yfinance as yf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                              confusion_matrix, ConfusionMatrixDisplay)
from xgboost import XGBClassifier

from features import add_technical_indicators, create_labels, prepare_feature_matrix
from sentiment import merge_sentiment_with_features

# Select the seaborn "whitegrid" look under whichever name the installed
# matplotlib registers: newer releases call it 'seaborn-v0_8-whitegrid',
# older ones 'seaborn-whitegrid'. If neither is present, fall back to the
# default style instead of aborting the notebook on the very first cell.
_style_name = next(
    (candidate
     for candidate in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if candidate in plt.style.available),
    'default',
)
plt.style.use(_style_name)
print('‚úÖ Imports done!')

## 1. Prepare Dataset

In [None]:
# Download price history for one ticker and turn it into a labelled feature
# matrix (`df`) whose 'Label' column holds integer class labels.
ticker = 'RELIANCE.NS'
df_raw = yf.download(ticker, start='2020-01-01', end='2024-12-31', progress=False)

# A failed or empty download would otherwise surface much later as an obscure
# error inside the feature code; stop here with an explicit message instead.
if df_raw.empty:
    raise ValueError(
        f'No price data returned for {ticker!r}; '
        'check the ticker symbol, date range and network connection.'
    )

# If the columns come back as a MultiIndex, keep only the first level so the
# downstream helpers see plain column names.
if isinstance(df_raw.columns, pd.MultiIndex):
    df_raw.columns = df_raw.columns.get_level_values(0)

# Feature pipeline; work on a copy so the raw download stays untouched.
df = add_technical_indicators(df_raw.copy())
df = create_labels(df, n_days=5)
df = merge_sentiment_with_features(df, ticker)
df = prepare_feature_matrix(df)

# Rows without a label cannot be used for training or evaluation. Taking an
# explicit copy (rather than dropping in place) guarantees the assignment
# below writes to an independent frame, not a view of another one.
df = df.dropna(subset=['Label']).copy()
df['Label'] = df['Label'].astype(int)

print(f'Dataset shape: {df.shape}')
print(f'Label distribution:\n{df["Label"].value_counts().sort_index()}')

## 2. Train / Test Split (Chronological — NO shuffling!)

In [None]:
# Time-ordered 80/20 split: the first 80% of rows are used for training and
# the remaining 20% are held out for testing. Rows are sliced by position and
# never shuffled, so the row order of `df` is what keeps the test period
# after the training period.
# NOTE(review): if the labels look several days ahead, the last few training
# rows may depend on prices inside the test window — confirm against
# create_labels and consider leaving a gap at the boundary.
split_idx = int(0.8 * len(df))

train, test = df.iloc[:split_idx], df.iloc[split_idx:]

X_train, y_train = train.drop(columns=['Label']), train['Label']
X_test,  y_test  = test.drop(columns=['Label']),  test['Label']

# Report the size and date span of each partition.
train_start, train_end = train.index[0].date(), train.index[-1].date()
test_start,  test_end  = test.index[0].date(),  test.index[-1].date()
print(f'Train: {len(X_train)} samples  ({train_start} ‚Üí {train_end})')
print(f'Test:  {len(X_test)}  samples  ({test_start} ‚Üí {test_end})')

# Standardise the features for the models that are flagged as needing scaled
# input. The scaler's statistics are learned from the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

print('\n‚úÖ Train/Test split done ‚Äî chronological order preserved!')

## 3. Train All 5 Models

In [None]:
# Imported here so this cell brings its own metric functions into scope.
from sklearn.metrics import precision_score, recall_score

# Display name -> (estimator, use_scaled). `use_scaled` selects the
# standardised arrays (X_train_s / X_test_s) instead of the raw feature values.
# NOTE(review): the confusion-matrix cell treats the labels as -1/0/1; some
# XGBoost releases expect classes numbered 0..n_classes-1 — confirm this fit
# succeeds on the installed version, or encode the labels first.
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, C=1.0, random_state=42), True),
    'SVM':                 (SVC(kernel='rbf', probability=True, random_state=42),       True),
    'KNN':                 (KNeighborsClassifier(n_neighbors=7, weights='distance'),     True),
    'Random Forest':       (RandomForestClassifier(n_estimators=200, random_state=42),  False),
    'XGBoost':             (XGBClassifier(n_estimators=200, learning_rate=0.05,
                                          random_state=42, verbosity=0, eval_metric='mlogloss'), False),
}

# results[name]        -> test-set metrics plus the raw predictions
# trained_models[name] -> (fitted estimator, use_scaled), reused by later cells
results = {}
trained_models = {}

print(f'\n{"="*65}')
print(f'{"Model":<22} {"Accuracy":>10} {"F1":>8} {"Precision":>10} {"Recall":>8}')
print(f'{"="*65}')

for name, (model, use_scaled) in models.items():
    Xtr = X_train_s if use_scaled else X_train.values
    Xte = X_test_s  if use_scaled else X_test.values

    model.fit(Xtr, y_train)
    preds = model.predict(Xte)

    # All class-wise metrics are support-weighted averages. Precision and
    # recall each use their own scorer; previously both columns were computed
    # with f1_score and therefore just repeated the F1 value.
    acc  = accuracy_score(y_test, preds)
    f1   = f1_score(y_test, preds, average='weighted', zero_division=0)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec  = recall_score(y_test, preds, average='weighted', zero_division=0)

    results[name] = {'accuracy': acc, 'f1': f1, 'precision': prec,
                     'recall': rec, 'preds': preds}
    trained_models[name] = (model, use_scaled)

    print(f'{name:<22} {acc:>10.2%} {f1:>8.4f} {prec:>10.4f} {rec:>8.4f}')

print(f'{"="*65}')

# The winner is the model with the highest weighted F1 on the test set.
best = max(results, key=lambda k: results[k]['f1'])
print(f'\nüèÜ Best model: {best}  (F1 = {results[best]["f1"]:.4f})')

## 4. Model Comparison Chart

In [None]:
# Grouped bar chart: accuracy and weighted F1 (both in percent) per model,
# with the best model's accuracy bar drawn in a darker blue.
names  = list(results.keys())
accs   = [results[n]['accuracy'] * 100 for n in names]
f1s    = [results[n]['f1']       * 100 for n in names]
colors = ['#2563EB' if n == best else '#93C5FD' for n in names]

x = np.arange(len(names))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 5))
bars1 = ax.bar(x - width/2, accs, width, label='Accuracy (%)', color=colors,       alpha=0.85)
bars2 = ax.bar(x + width/2, f1s,  width, label='F1 Score (%)',  color='#059669', alpha=0.75)

# Print each bar's value just above it.
for bar in (*bars1, *bars2):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
            f'{bar.get_height():.1f}%', ha='center', va='bottom', fontsize=9)

ax.set_xticks(x)
ax.set_xticklabels(names, rotation=10)
# Keep the usual 0-85 range, but grow it when a score would otherwise be
# clipped or push the "Best" annotation (drawn 5 points above its bar) off
# the axes.
ax.set_ylim(0, max(85, max(accs + f1s) + 10))
ax.set_ylabel('Score (%)')
ax.set_title('Model Comparison ‚Äî Accuracy vs F1 Score', fontsize=13, fontweight='bold')

# NOTE(review): 50% is the chance level for two balanced classes; the
# confusion-matrix cell uses three labels — confirm this is the intended
# reference line.
ax.axhline(50, color='red', linestyle='--', alpha=0.4, label='Random baseline (50%)')
# Build the legend only after every labelled artist exists; calling it before
# axhline left the baseline out of the legend.
ax.legend()

# Highlight best model
best_idx = names.index(best)
ax.annotate('üèÜ Best', xy=(best_idx - width/2, accs[best_idx]),
            xytext=(best_idx - width/2 + 0.3, accs[best_idx] + 5),
            fontsize=10, color='#1B3A6B', fontweight='bold',
            arrowprops=dict(arrowstyle='->', color='#1B3A6B'))

plt.tight_layout()
plt.savefig('../data/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print('Chart saved to data/model_comparison.png')

## 5. Confusion Matrices — All Models

In [None]:
# One confusion matrix per trained model, laid out three per row.
# `label_names` gives the display names for the classes -1, 0 and 1, in that
# order, and is reused by the classification-report cell.
label_names = ['DOWN', 'NEUTRAL', 'UP']

# Size the grid from the number of models instead of assuming exactly five;
# with five models this is still 2 x 3 at 15 x 9 inches.
n_cols = 3
n_rows = max(1, -(-len(results) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax_i, (name, res) in zip(axes, results.items()):
    # Fix the label order so rows/columns always line up with label_names,
    # even if a class never occurs in the test window.
    cm = confusion_matrix(y_test, res['preds'], labels=[-1, 0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
    disp.plot(ax=ax_i, colorbar=False, cmap='Blues')
    ax_i.set_title(f'{name}\nAcc: {res["accuracy"]:.2%}',
                   fontweight='bold', fontsize=11)

# Hide every subplot that did not receive a model.
for ax_i in axes[len(results):]:
    ax_i.set_visible(False)

plt.suptitle('Confusion Matrices ‚Äî All Models', fontsize=14, fontweight='bold', y=1.01)
plt.tight_layout()
plt.savefig('../data/confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Classification Report — Best Model

In [None]:
# Per-class precision / recall / F1 for the model selected as `best`.
best_model, use_scaled = trained_models[best]
# Feed the model the same feature representation it was trained on.
Xte = X_test_s if use_scaled else X_test.values
best_preds = best_model.predict(Xte)

print(f'üìã Classification Report ‚Äî {best} (Best Model)')
print('='*50)
# Pass the label order explicitly (the same -1/0/1 order the confusion-matrix
# cell uses) so the three target_names always match three classes; without
# `labels`, the report fails when a class is absent from both the test labels
# and the predictions.
print(classification_report(y_test, best_preds,
                             labels=[-1, 0, 1],
                             target_names=label_names,
                             zero_division=0))

## 7. Feature Importance (Random Forest)

In [None]:
# Horizontal bar chart of the 15 largest Random Forest feature importances,
# followed by a text listing of the top five.
rf_model = trained_models['Random Forest'][0]
feature_names = list(X_train.columns)
importances = rf_model.feature_importances_

# Ascending sort + tail(15) keeps the 15 largest, with the biggest last so it
# ends up at the top of the horizontal chart.
feat_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=True)
    .tail(15)
)

# Bars strictly above the median of the displayed importances get the darker blue.
median_importance = feat_df['Importance'].median()
colors = np.where(feat_df['Importance'] > median_importance,
                  '#2563EB', '#93C5FD').tolist()

fig, ax = plt.subplots(figsize=(10, 7))
bars = ax.barh(feat_df['Feature'], feat_df['Importance'], color=colors, edgecolor='white')

# Write the numeric importance just to the right of each bar.
for bar, val in zip(bars, feat_df['Importance']):
    x_pos = bar.get_width() + 0.001
    y_mid = bar.get_y() + bar.get_height()/2
    ax.text(x_pos, y_mid, f'{val:.4f}', va='center', fontsize=9)

ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances (Random Forest)', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('../data/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nüîç Top 5 most important features:')
# Last five rows are the five largest; reverse them to list the largest first.
top5 = feat_df.tail(5)[['Feature', 'Importance']].iloc[::-1]
for feat, imp in zip(top5['Feature'], top5['Importance']):
    print(f'  {feat:<20}  {imp:.4f}')

## 8. Final Results Summary Table

In [None]:
# Final comparison table: one row per model with formatted accuracy, F1 and a
# fixed descriptive note; the row for the best model is prefixed with a marker.
model_notes = {
    'Logistic Regression': 'Baseline ‚Äî simple & fast',
    'SVM':                 'Good boundary ‚Äî needs scaling',
    'KNN':                 'Instance-based ‚Äî needs scaling',
    'Random Forest':       'Best interpretability (feature importance)',
    'XGBoost':             'Best overall ‚Äî gradient boosting',
}

summary_rows = []
for model_name in results:
    marker = 'üèÜ ' if model_name == best else '   '
    summary_rows.append({
        'Model'    : marker + model_name,
        'Accuracy' : f"{results[model_name]['accuracy']:.2%}",
        'F1 Score' : f"{results[model_name]['f1']:.4f}",
        'Notes'    : model_notes.get(model_name, ''),
    })
summary = pd.DataFrame(summary_rows)

print('üìä Final Model Comparison:')
print(summary.to_string(index=False))

# NOTE(review): the lines below only print text; nothing in this cell writes
# models/best_model.pkl — confirm the model is persisted elsewhere.
print(f'\n‚úÖ Best Model Selected: {best}')
print('   ‚Üí Saved as models/best_model.pkl')
print('   ‚Üí Used by FastAPI /predict endpoint')
print('   ‚Üí Displayed in Streamlit dashboard')

## 9. Key Takeaways

In [None]:
# Print the closing summary of the notebook as a fixed block of text framed
# by 55-character rules. Empty strings produce the blank separator lines.
banner = '=' * 55
takeaway_lines = (
    banner,
    '   ü§ñ MODEL TRAINING SUMMARY',
    banner,
    '',
    '  Key Decisions Made:',
    '  ‚úÖ Chronological split ‚Äî no data leakage',
    '  ‚úÖ StandardScaler applied only on train set',
    '  ‚úÖ 5 models trained and compared',
    '  ‚úÖ Best model selected by F1-score (weighted)',
    '',
    '  Why F1 over Accuracy?',
    '  ‚Üí Class imbalance means accuracy can be misleading',
    '  ‚Üí F1 balances precision and recall across all classes',
    '',
    '  Interview Talking Points:',
    '  ‚Üí Compared 5 ML algorithms on same dataset',
    '  ‚Üí Used feature importance to interpret model',
    '  ‚Üí Avoided data leakage with time-ordered split',
    '  ‚Üí Selected best model programmatically by F1',
    '',
    '  ‚û°Ô∏è  Next: Notebook 4 ‚Äî Sentiment Analysis',
    banner,
)
for line in takeaway_lines:
    print(line)