# End-to-End ML Pipeline: Smarket Dataset

**Author:** Your Name

This notebook demonstrates a complete ML workflow for the **Smarket** dataset (daily S&P 500 percentage returns, 2001–2005). It includes data loading, EDA, PCA/t-SNE, pipelines, time-series cross-validation, grid search (including KNN), model evaluation, interpretation, calibration, and model saving.


## 1) Setup & imports

In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn.calibration import calibration_curve
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import (accuracy_score, brier_score_loss, confusion_matrix,
                             classification_report, roc_curve, auc)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
# Several estimators further down also pass random_state=42 explicitly.
np.random.seed(42)
# Simple marker that the setup cell ran to completion.
print('imports done')


## 2) Load dataset

In [None]:
# Location of the Smarket CSV inside the Rdatasets GitHub repository.
url = (
    "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/"
    "master/csv/ISLR/Smarket.csv"
)

# index_col=0: the first CSV column becomes the DataFrame index.
df = pd.read_csv(url, index_col=0)

# Quick sanity check: (rows, columns), followed by a preview of the first rows.
print(df.shape)
df.head()


## 3) Inspect & Preprocess

In [None]:
# Structural overview followed by summary statistics.
# A bare expression is only rendered when it is the LAST statement of a cell, so
# the describe() table must go through display() (injected by the Jupyter kernel)
# to actually show up here.
df.info()
display(df.describe().T)

# Binary target: 1 when the market direction is 'Up', 0 otherwise.
df['Direction_bin'] = (df['Direction'] == 'Up').astype(int)

# Chronological hold-out split.
# The ISLR Smarket data covers 2001-2005, so a `Year <= 2000` filter selects no
# rows at all and every downstream fit would fail on an empty training set.
# Derive the split from the data instead: the most recent year is the test set
# and every earlier year is used for training.
TEST_YEAR = int(df['Year'].max())
train = df[df['Year'] < TEST_YEAR].copy()
test = df[df['Year'] == TEST_YEAR].copy()
assert not train.empty and not test.empty, (
    f'Year-based split produced an empty partition (test year = {TEST_YEAR})'
)

# Predictors: the five lagged returns plus trading volume.
features = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X_train = train[features].copy()
y_train = train['Direction_bin'].copy()
X_test = test[features].copy()
y_test = test['Direction_bin'].copy()
print('Train/Test shapes:', X_train.shape, X_test.shape)


## 4) EDA

In [None]:
# Pairwise scatter plots of the predictors (lower triangle only), coloured by
# the original 'Direction' label.
pair_columns = features + ['Direction']
sns.pairplot(
    train[pair_columns],
    hue='Direction',
    corner=True,
    plot_kws={'s': 20},
)
plt.show()

# Correlation matrix of the predictors, annotated with the coefficient values.
feature_corr = train[features].corr()
plt.figure(figsize=(6, 4))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm')
plt.show()


## 5) PCA & t-SNE

In [None]:
def plot_embedding(embedding, labels, title):
    """Scatter-plot a 2-D embedding, colouring each point by its class label.

    Parameters
    ----------
    embedding : array-like of shape (n_samples, 2)
        Coordinates to plot; column 0 goes on the x-axis, column 1 on the y-axis.
    labels : array-like of shape (n_samples,)
        Values mapped to colours through the 'coolwarm' colormap.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='coolwarm')
    ax.set(title=title, xlabel='Component 1', ylabel='Component 2')
    plt.show()


# Both projections are distance/variance based, so standardise the features first.
# This scaler is fitted on the training data only and is used for visualisation;
# the modelling pipelines below carry their own scaler step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Linear projection onto the first two principal components.
pca2 = PCA(n_components=2, random_state=42).fit_transform(X_train_scaled)
plot_embedding(pca2, y_train, 'PCA')

# Non-linear 2-D embedding (TSNE is already imported in the setup cell).
tsne2 = TSNE(n_components=2, random_state=42, perplexity=30).fit_transform(X_train_scaled)
plot_embedding(tsne2, y_train, 't-SNE')


## 6) Baseline models: Logistic & KNN

In [None]:
# Two baseline pipelines; each standardises the features before the classifier so
# the scaler is always fitted on exactly the data the classifier is fitted on.
pipe_lr = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=1000, random_state=42))])
pipe_knn = Pipeline([('scaler', StandardScaler()), ('clf', KNeighborsClassifier())])

pipe_lr.fit(X_train, y_train)
pipe_knn.fit(X_train, y_train)

# Predict once per model and reuse the result for every metric below, instead of
# re-running predict() for each report.
y_pred_lr = pipe_lr.predict(X_test)
y_pred_knn = pipe_knn.predict(X_test)

print('Logistic test acc:', accuracy_score(y_test, y_pred_lr))
print('KNN test acc (k=5):', accuracy_score(y_test, y_pred_knn))

print('\nLogistic classification report:\n', classification_report(y_test, y_pred_lr))
print('\nKNN classification report:\n', classification_report(y_test, y_pred_knn))


## 7) TimeSeriesSplit CV (demonstrate leakage concerns)

In [None]:
# TimeSeriesSplit is already imported in the setup cell; no local re-import needed.
# Each split validates on a block of rows that comes after its training rows, so
# a fold is never trained on observations that follow its validation block.
tscv = TimeSeriesSplit(n_splits=5)

# Show the positional index range covered by each fold's train/validation part.
for i, (tr, val) in enumerate(tscv.split(X_train)):
    print('Split', i, 'train indices', tr.min(), tr.max(), 'val indices', val.min(), val.max())

# The pipelines are re-fitted inside every fold, so the scaler only ever sees that
# fold's training rows.
lr_cv_scores = cross_val_score(pipe_lr, X_train, y_train, cv=tscv)
knn_cv_scores = cross_val_score(pipe_knn, X_train, y_train, cv=tscv)

print('Logistic CV acc:', lr_cv_scores.mean())
print('KNN CV acc:', knn_cv_scores.mean())


## 8) Time-aware GridSearch (Logistic, KNN, RF, GB)

In [None]:
# Generic scaler -> classifier pipeline. The 'clf' step is a placeholder that the
# grid below swaps out for each candidate model family.
pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])

# One dict per model family: the 'clf' entry selects the estimator and the
# 'clf__*' entries list the hyper-parameter values tried for that estimator.
param_grid = [
    {'clf': [LogisticRegression(max_iter=1000)], 'clf__C': [0.01, 0.1, 1, 10]},
    {'clf': [KNeighborsClassifier()], 'clf__n_neighbors': [3, 5, 7], 'clf__weights': ['uniform', 'distance']},
    {'clf': [RandomForestClassifier(random_state=42)], 'clf__n_estimators': [50, 200], 'clf__max_depth': [3, 5, None]},
    {'clf': [GradientBoostingClassifier(random_state=42)], 'clf__n_estimators': [50, 200], 'clf__learning_rate': [0.01, 0.1]},
]

# Forward-chaining folds keep every validation block after its training rows.
# GridSearchCV is already imported in the setup cell; no local re-import needed.
ts = TimeSeriesSplit(n_splits=5)

grid = GridSearchCV(pipe, param_grid=param_grid, cv=ts, scoring='accuracy', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)


## 9) Evaluate best model on the held-out test set

In [None]:
# Pipeline refitted on the full training set with the winning grid configuration.
best = grid.best_estimator_
print('Best estimator:', best)

y_pred_test = best.predict(X_test)
print('Test accuracy:', accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

# Only check the object whose predict_proba is actually called. The previous
# `or hasattr(clf, ...)` alternative could enter this branch while
# best.predict_proba itself was unavailable.
if hasattr(best, 'predict_proba'):
    proba = best.predict_proba(X_test)[:, 1]  # probability of class 1 ('Up')
    fpr, tpr, _ = roc_curve(y_test, proba)
    roc_auc = auc(fpr, tpr)
    print('Test AUC:', roc_auc)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC={roc_auc:.3f}')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve (test)')
    ax.legend()
    plt.show()
else:
    print('No predict_proba available for test ROC')

# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted) even
# if one class never appears, so the Down/Up tick labels always line up.
class_names = ['Down', 'Up']
fig, ax = plt.subplots()
sns.heatmap(
    confusion_matrix(y_test, y_pred_test, labels=[0, 1]),
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion matrix (test)')
plt.show()


## 10) Feature importance / coefficients

In [None]:
# Final estimator of the winning pipeline. Its inputs are the standardised
# features, because the 'scaler' step precedes 'clf' in the pipeline.
clf = grid.best_estimator_.named_steps['clf']

# Pick whichever per-feature weight vector the estimator exposes, preferring
# linear coefficients over tree-based importances.
if hasattr(clf, 'coef_'):
    print('Coefficients:')
    weights = clf.coef_.ravel()
    chart_title = 'Logistic coefficients'
elif hasattr(clf, 'feature_importances_'):
    weights = clf.feature_importances_
    chart_title = 'Feature importances'
else:
    weights = None
    chart_title = None
    print('No coef_ or feature_importances_ available for this classifier.')

# Shared reporting path: one line per feature, then a bar chart.
if weights is not None:
    for feature_name, weight in zip(features, weights):
        print(feature_name, weight)
    plt.bar(features, weights)
    plt.title(chart_title)
    plt.show()


## 11) Calibration (Brier score & curve)

In [None]:
# calibration_curve and brier_score_loss are imported in the setup cell.
if hasattr(best, 'predict_proba'):
    probs = best.predict_proba(X_test)[:, 1]  # predicted probability of class 1 ('Up')

    # Brier score = mean squared difference between predicted probability and the
    # 0/1 outcome; use the library implementation instead of a hand-rolled formula.
    brier = brier_score_loss(y_test, probs)
    print('Brier score:', brier)

    # Observed fraction of positives vs. mean predicted probability per bin.
    prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=10)

    fig, ax = plt.subplots()
    ax.plot(prob_pred, prob_true, marker='o')
    ax.plot([0, 1], [0, 1], 'k--')  # perfect-calibration diagonal
    ax.set(
        xlabel='Mean predicted probability',
        ylabel='Fraction of positives',
        title='Calibration curve',
    )
    plt.show()
else:
    print('No predict_proba for calibration')


## 12) Save model and example predict script

In [None]:
# Single source of truth for the artifact location (previously repeated three times).
MODEL_PATH = Path('/mnt/data/smarket_best_pipeline.joblib')

# joblib.dump does not create missing directories, so make sure the target exists.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(grid.best_estimator_, MODEL_PATH)
print(f'Saved pipeline to {MODEL_PATH}')

# Example usage
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
model = joblib.load(MODEL_PATH)
print('Loaded model:', model)
print('Example pred:', model.predict(X_test.head()))
