In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, ridge_regression, LassoCV, RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor


In [None]:
# Load the daily take-home data, parse `day`, and index the frame chronologically by it.
df = (
    pd.read_csv("/sphenix/user/shuhangli/ppg12/FunWithxgboost/sig_takehome_daily.csv")
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .sort_values('day')
    .set_index('day')
    .sort_index()
)
# Quick sanity check: 2x2 correlation matrix between the 1-day lag feature and the target.
print(np.corrcoef(df['rv_lag1'], df['target_var_20d']))

In [None]:
# Preview the first rows of the date-indexed frame.
df.head()

In [None]:
# Regression target, kept as a one-element list so it can be concatenated with
# feature lists and used for frame-style column selection.
target_col = ['target_var_20d']

# Explicit matplotlib figure of the target over time.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df.index, df[target_col[0]], label='vol')
plt.show()

# The same series again, this time through the pandas plotting wrapper.
ax = df[target_col].plot(figsize=(9, 4))



In [None]:
# Candidate predictors used by the first walk-forward experiment.
feat_cols = [
    'rv_lag1',
    'rv_lag5',
    'rv_lag22',
    'realized_var',
    'avg_spread_bps',
    'avg_volume',
    'avg_mid',
]
# Lag-only alternative:
#feat_cols = ['rv_lag1', 'rv_lag5', 'rv_lag22']
ax = df[feat_cols].plot(figsize=(9, 4))

In [None]:
# Pairwise correlations among all features plus the target.
corr = df[[*feat_cols, *target_col]].corr()

In [None]:
# Display the feature/target correlation matrix.
corr

In [None]:
# Rolling walk-forward evaluation: fit on the previous `train_len` rows,
# then predict the single row that follows the window.
train_len = 20
rows = []
tscv = TimeSeriesSplit(n_splits=5)
alphas = np.linspace(0.1, 20, 20)
# NOTE(review): the upper bound len(df)-1 means the final row is never used
# as a test point — confirm this is intended.
for i in range(train_len, len(df) - 1):
    # Training window: the `train_len` rows immediately before position i.
    train_df = df.iloc[i - train_len:i]
    train_X = train_df[feat_cols].values
    train_y = train_df[target_col[0]].values

    # Single-row hold-out at position i.
    test_df = df.iloc[i:i + 1]
    test_X = test_df[feat_cols].values
    test_y = test_df[target_col[0]].values

    # Ordinary least squares on standardised features.
    pipe = Pipeline([('scaler', StandardScaler()), ('fitter', LinearRegression())])
    pipe.fit(train_X, train_y)
    pred_y = pipe.predict(test_X)

    # Ridge with alpha chosen by time-series CV inside the training window.
    r_cv = RidgeCV(cv=tscv, alphas=alphas)
    pipe_ridge = Pipeline([('scaler', StandardScaler()), ('fitter', r_cv)])
    pipe_ridge.fit(train_X, train_y)
    pred_ridge_y = pipe_ridge.predict(test_X)
    used_alpha = pipe_ridge.named_steps['fitter'].alpha_

    # One record per test row; the frame is assembled once after the loop.
    record = {
        'day': test_df.index[0],
        'target': test_y[0],
        'pred_ols': pred_y[0],
        'pred_ridge': pred_ridge_y[0],
        'ridge_alpha': used_alpha,
    }
    rows.append(record)

result_df = pd.DataFrame(rows)
    

In [None]:
# Correlation between the realised target and each model's one-step predictions.
result_corr = result_df[['target', 'pred_ols', 'pred_ridge']].corr()
result_corr

In [None]:

# Overlay target and both prediction series (x-axis is the row position in result_df).
ax = result_df[['target', 'pred_ols', 'pred_ridge']].plot()

In [None]:
# Ridge penalty selected by RidgeCV for each walk-forward step.
print(result_df['ridge_alpha'])

In [None]:
def r2(y, yhat):
    """Coefficient of determination: 1 - SS_res / SS_tot.

    SS_tot is taken around the mean of `y`. Inputs are used as given, so
    pandas objects are subtracted with their usual index alignment.
    """
    residual_ss = np.sum((y - yhat) ** 2)
    total_ss = np.sum((y - np.mean(y)) ** 2)
    return 1.0 - residual_ss / total_ss

from sklearn.metrics import r2_score

# Out-of-sample R^2 of each model over the walk-forward predictions.
for label, pred_col in (("R2_ols", "pred_ols"), ("R2_ridge", "pred_ridge")):
    print(label, r2(result_df['target'], result_df[pred_col]))

In [None]:
# Load the VIXCLS series (columns used below: observation_date, VIXCLS).
vol_df = pd.read_csv("/sphenix/user/shuhangli/ppg12/FunWithxgboost/VIXCLS.csv")

In [None]:
# Peek at the raw file before any conversion.
print(vol_df.head())
# Parse the date column, order chronologically, then plot the level.
vol_df = (
    vol_df
    .assign(observation_date=lambda frame: pd.to_datetime(frame['observation_date']))
    .sort_values('observation_date')
)
vol_df.plot(x='observation_date', y='VIXCLS')

In [None]:
# Report missing values before dropping them. Only the last expression of a
# cell is displayed, so the per-column counts have to be printed explicitly.
print(vol_df.isna().sum())
print(vol_df.loc[vol_df['VIXCLS'].isna()])
vol_df = vol_df.dropna()
# Show the most recent remaining observation.
vol_df.iloc[-1:]

In [None]:
# Rolling-window lengths (in rows) for the lagged-mean features.
ks = [1, 5, 22, 30]
# Window length (in rows) of the forward-looking average target.
target_length = 20
# Point-forecast horizons (in rows ahead).
hs = [1, 2, 5]


def build_features(df, vol_col_feat = 'VIXCLS'):
    """Add lagged rolling-mean features and forward-looking targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by `vol_col_feat`, already in
        chronological order. The input is not modified.
    vol_col_feat : str
        Column that features and targets are derived from.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with these extra columns, restricted to rows that have
        no missing value in any column:

        * ``vol_lag_{k}`` for k in `ks`: mean of the k rows strictly before
          the current row.
        * ``target_{target_length}d``: mean of the next `target_length` rows,
          excluding the current row.
        * ``target_h{h}d`` for h in `hs`: the value h rows ahead.

    Raises
    ------
    KeyError
        If `vol_col_feat` is not a column of `df`.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    out = df.copy()
    series = out[vol_col_feat]

    # shift(1) keeps each rolling mean strictly in the past of its row.
    for k in ks:
        out[f"vol_lag_{k}"] = series.rolling(window=k).mean().shift(1)

    # A negative shift pulls the window ending `target_length` rows ahead back to this row.
    out[f"target_{target_length}d"] = (
        series.rolling(window=target_length).mean().shift(-target_length)
    )

    for h in hs:
        out[f"target_h{h}d"] = series.shift(-h)

    return out.dropna()
        


In [None]:
# Build features on a copy so the cleaned vol_df stays untouched.
df_processed = build_features(vol_df.copy())

In [None]:
# Compare the 20-row forward target with the 1-row lag over the last 100 rows.
df_processed.tail(100).plot(x='observation_date', y=['target_20d', 'vol_lag_1'])

In [None]:
def make_prediction(df, test_X=None):
    """Fit OLS and RidgeCV pipelines on `df` and predict one held-out row.

    Relies on the notebook-level names `feat_cols`, `target_col`, `tscv` and
    `alphas` for the feature list, target name, CV splitter and ridge grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Training window containing `feat_cols` and `target_col[0]`.
    test_X : array-like, optional
        Feature rows to predict. When omitted, the notebook-level `test_X`
        is used, which is what the original body implicitly read.

    Returns
    -------
    tuple
        ``(pred_ols, pred_ridge)`` for the first row of `test_X`.

    Raises
    ------
    ValueError
        If `test_X` is not given and no notebook-level `test_X` exists.
    """
    if test_X is None:
        # Legacy call style: fall back to the global left behind by the loop cells.
        try:
            test_X = globals()['test_X']
        except KeyError:
            raise ValueError("make_prediction needs test_X: pass it explicitly") from None

    train_X = df[feat_cols].values
    train_y = df[target_col[0]].values

    # Ordinary least squares on standardised features.
    pipe = Pipeline([('scaler', StandardScaler()), ('fitter', LinearRegression())])
    pipe.fit(train_X, train_y)
    pred_y = pipe.predict(test_X)

    # Ridge with the penalty picked by time-series CV on the training window.
    r_cv = RidgeCV(cv = tscv, alphas=alphas)
    pipe_ridge = Pipeline([('scaler', StandardScaler()), ('fitter', r_cv)])
    pipe_ridge.fit(train_X, train_y)
    pred_ridge_y = pipe_ridge.predict(test_X)

    # The original returned the undefined name `pred_y_ridge`.
    return pred_y[0], pred_ridge_y[0]
    

In [None]:
# Walk-forward evaluation on the VIX features: fit on the previous
# `train_len` rows, predict the row that follows.
df = df_processed.copy()
feat_cols = ["vol_lag_1", "vol_lag_5", "vol_lag_22", "vol_lag_30"]
target_col = ['target_h5d']
train_len = 20
rows = []
tscv = TimeSeriesSplit(n_splits=5)
alphas = np.linspace(0.1, 20, 20)
# NOTE(review): training rows carry targets taken 5 rows ahead, so the latest
# training targets lie after the test row's own date — confirm this look-ahead
# is acceptable. The bound len(df)-1 also leaves the final row unpredicted.
for i in range(train_len, len(df) - 1):
    train_df = df.iloc[i - train_len:i]
    train_X = train_df[feat_cols].values
    train_y = train_df[target_col[0]].values
    test_df = df.iloc[i:i + 1]
    test_X = test_df[feat_cols].values
    test_y = test_df[target_col[0]].values

    # Ordinary least squares on standardised features.
    pipe = Pipeline([('scaler', StandardScaler()), ('fitter', LinearRegression())])
    pipe.fit(train_X, train_y)
    pred_y = pipe.predict(test_X)

    # Ridge with alpha chosen by time-series CV inside the training window.
    r_cv = RidgeCV(cv = tscv, alphas=alphas)
    pipe_ridge = Pipeline([('scaler', StandardScaler()), ('fitter', r_cv)])
    pipe_ridge.fit(train_X, train_y)
    pred_ridge_y = pipe_ridge.predict(test_X)

    used_alpha = pipe_ridge.named_steps['fitter'].alpha_

    rows.append(
        {
            # This frame is not date-indexed, so index[0] would record an
            # integer row label; take the real date from its column instead.
            'day': test_df['observation_date'].iloc[0],
            'target': test_y[0],
            'pred_ols': pred_y[0],
            'pred_ridge': pred_ridge_y[0],
            'ridge_alpha': used_alpha,
        }
    )

result_df = pd.DataFrame(rows)
    

In [None]:
# Correlation between the realised target and each model's predictions.
result_corr = result_df[['target', 'pred_ols', 'pred_ridge']].corr()
result_corr

In [None]:
# Full-history overlay of target and predictions (x-axis is the row position).
ax = result_df[['target', 'pred_ols', 'pred_ridge']].plot(figsize=(8,4))

In [None]:
# Zoom in on the last 100 predictions, plotted against the recorded `day` value.
ax = result_df.iloc[-100:].plot(
    x='day',
    y=['target', 'pred_ols', 'pred_ridge'],
    figsize=(8,4)
)
ax.set_xlabel('day')
ax.set_ylabel('value')

In [None]:
def r2(y, yhat):
    """Return 1 - SS_res / SS_tot for observations `y` and predictions `yhat`.

    Redefinition of the identical helper from an earlier cell.
    """
    centre = np.mean(y)
    unexplained = np.sum((y - yhat) ** 2)
    spread = np.sum((y - centre) ** 2)
    return 1.0 - unexplained / spread

# r2_score is imported but the hand-written r2 helper is what gets used below.
from sklearn.metrics import r2_score
# Out-of-sample R^2 of each model over the walk-forward predictions.
print("R2_ols", r2(result_df['target'], result_df['pred_ols']))
print("R2_ridge", r2(result_df['target'], result_df['pred_ridge']))

In [None]:
# Inspect the final row of the working frame.
df.iloc[-1]

In [None]:
# Synthetic demo: y depends linearly on x (slope 0.7) plus Gaussian noise.
rng = np.random.default_rng(0)
xs = rng.normal(size=200)
noise = rng.normal(scale=0.5, size=200)
ys = xs * 0.7 + noise

fig, ax = plt.subplots()
ax.scatter(xs, ys, alpha=0.7)
ax.set(title='scatter', xlabel="x", ylabel='y')


In [None]:
# Distribution of the realised target values from the walk-forward run.
fig, ax = plt.subplots()
ax.hist(result_df['target'],bins = 30)

In [None]:
import pandas as pd
import numpy as np

# --- Load and basic cleaning ---
df = pd.read_csv("somepath.csv")

# Lower-case the column names and drop exact duplicate rows.
df = df.rename(columns = str.lower).drop_duplicates()

# Share of missing values per column. Not displayed, because it is not the
# last expression of the cell.
df.isna().mean().sort_values(ascending = False)
# Rows without a target are unusable; impute the feature `x` with its median.
df = df.dropna(subset=['target'])
df['x'] = df['x'].fillna(df["x"].median())

# --- Dates and categoricals ---
df['date'] = pd.to_datetime(df['date'])
# The accessor is `dayofweek`; the original `dayofweel` raised AttributeError.
df['dow'] = df['date'].dt.dayofweek

cat = ['sector', 'region']
# The dtype name is 'category'; the original 'catagory' raised TypeError.
df[cat] = df[cat].astype('category')

#groupby

# Per-user aggregates of `amount`, flattened to single-level column names.
agg = df.groupby("user_id").agg({"amount":["mean", "sum", "count"]})

agg.columns = ["_".join(c) for c in agg.columns]

# Attach the per-user aggregates back onto every row.
out = df.merge(agg, left_on = "user_id", right_index = True, how = "left")

# --- Forward target and lagged feature ---
H = 20

# Log return from the current row to H rows ahead.
df["y"] = np.log(df['price'].shift(-H)) - np.log(df['price'])

# 5-row moving average, shifted so the current row is excluded.
df['ma_5'] = df['price'].rolling(5).mean().shift(1)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# Column groups fed to the preprocessing branches.
num = ['age', 'income', 'ma_5']
cat = ['sector', 'region']

# Numeric branch: default imputer then standardise.
# Categorical branch: most-frequent imputation then one-hot encoding that
# ignores categories unseen during fit.
pre = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([
                ("imp", SimpleImputer()),
                ("sc", StandardScaler()),
            ]),
            num,
        ),
        (
            "cat",
            Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("oh", OneHotEncoder(handle_unknown="ignore")),
            ]),
            cat,
        ),
    ],
    remainder='drop',
)

# Preprocessing and ridge regression as a single estimator.
model = Pipeline([("pre", pre), ("est", Ridge(alpha=1.0))])


In [None]:
from sklearn.model_selection import train_test_split, KFold, GroupKFold, TimeSeriesSplit

#iid
# NOTE(review): X, y, y_true and y_pred are not defined in the visible cells above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

#group
# split() returns a one-shot generator of (train_idx, test_idx) pairs.
cv = GroupKFold(n_splits=5).split(X,y, groups = df['user_id'])

#ts split
cs = TimeSeriesSplit(n_splits=5)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, average_precision_score, log_loss
# `squared=False` has been removed from mean_squared_error in recent
# scikit-learn; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Built-in negative-RMSE scorer, the same string the other searches in this
# notebook use. The previous make_scorer(..., squared=False) forwarded a
# keyword that recent scikit-learn no longer accepts in mean_squared_error.
scorer = "neg_root_mean_squared_error"
param_grid = {"est__alpha":[0.01, 0.1, 1, 10]}

# Exhaustive search over the ridge penalty with 5-fold CV.
gs = GridSearchCV(model, param_grid = param_grid, scoring=scorer, cv = 5, n_jobs=1)

gs.fit(X_train, y_train)
best = gs.best_estimator_


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Fit preprocessing + ridge on the training split and score on the hold-out.
pipe = Pipeline([("pre", pre), ("est", Ridge(alpha=1.0)) ] )
pipe.fit(X_train, y_train)
# Explicit square root: `squared=False` is gone from recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_test, pipe.predict(X_test)))


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
# Class-weighted logistic regression behind the shared preprocessor.
clf = Pipeline([
    ("pre", pre),
    ("est", LogisticRegression(max_iter=1000, class_weight="balanced")),
])
clf.fit(X_train, y_train)
# Positive-class probabilities.
# NOTE(review): X_cal is not defined in the visible notebook — confirm where it comes from.
p = clf.predict_proba(X_cal)[:, 1]

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Base booster; the search below only varies tree shape and regularisation.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
)
params = {
    "max_depth": [3, 4, 6, 8],
    "min_child_weight": [1, 3, 5],
    "reg_alpha": [0, 1e-2, 1e-1, 1],
    "reg_lambda": [0.1, 1, 5],
}
rs = RandomizedSearchCV(
    xgb,
    params,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    random_state=0,
)

# Extra keyword arguments are forwarded to the estimator's fit().
# NOTE(review): X_val / y_val are not defined in the visible notebook.
rs.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


# Use the GPU when one is visible to torch.
DEVICE = 'cuda' if torch.cuda.is_available() else "cpu"

# Seed both torch and the legacy numpy global RNG.
torch.manual_seed(0)
np.random.seed(0)

num_cols = ['age', 'income', 'ma_5']
cat_cols = ['sector', 'region']
target_col = 'y'

X = df[num_cols + cat_cols]
# The original `y = df[target_col] = "y"` was a chained assignment that
# overwrote the whole target column with the string "y".
y = df[target_col]

# Third name was the typo `y_re`.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

pre = ColumnTransformer(
    transformers=[
        # Numeric branch: median imputation, then standardise.
        ('num', Pipeline(
            [
                ('imp', SimpleImputer(strategy="median")),
                ('sc', StandardScaler())
            ]
        ), num_cols),  # this comma was missing, making the tuple a call target
        # Categorical branch: most-frequent imputation, then one-hot encoding.
        ('cat', Pipeline(
            [
                ('imp', SimpleImputer(strategy="most_frequent")),
                ('oh', OneHotEncoder(handle_unknown='ignore'))
            ]
        ), cat_cols),
    ],
    remainder='drop'
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# helper functions
def basic_info(df):
    """Print the frame's shape, column dtypes, and its first and last three rows."""
    sections = (
        ("Shape: ", df.shape),
        ("\nDtypes:\n", df.dtypes),
        ("\n Head: \n", df.head(3)),
        ("\n Tail: \n", df.tail(3)),
    )
    for heading, payload in sections:
        print(heading, payload)

def missing_report(df):
    """Tabulate missing-value counts and rates for columns with at least one gap.

    Returns a frame indexed by column name with `missing` (count) and
    `missing_rate` (fraction of rows), each sorted in descending order.
    """
    null_mask = df.isna()
    report = pd.DataFrame({
        'missing': null_mask.sum().sort_values(ascending=False),
        'missing_rate': null_mask.mean().sort_values(ascending=False),
    })
    return report.loc[report['missing'] > 0]
def numeric_summary(df, num_cols=None):
    """Describe numeric columns with extended percentiles, skewness and kurtosis.

    Parameters
    ----------
    df : pandas.DataFrame
    num_cols : str or sequence of str, optional
        Columns to summarise. When omitted or empty, every numeric column of
        `df` is used.

    Returns
    -------
    pandas.DataFrame
        One row per column: the `describe()` statistics at the
        1/5/25/50/75/95/99 percentiles plus `skew` and `kurt`.
    """
    # `not num_cols` is ambiguous for Index/ndarray inputs, so test explicitly.
    if num_cols is None or len(num_cols) == 0:
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    elif isinstance(num_cols, str):
        num_cols = [num_cols]
    else:
        num_cols = list(num_cols)

    numeric = df[num_cols]
    desc = numeric.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]).T
    desc['skew'] = numeric.skew(numeric_only=True)
    desc['kurt'] = numeric.kurt(numeric_only=True)
    return desc


def categorical_summary(df, topn = 10):
    """Return the `topn` most frequent values of every object/category column.

    Missing values are counted as their own level.

    Returns
    -------
    dict
        Column name mapped to a value-counts Series of at most `topn` entries.
    """
    # Fixes two typos that made the original unusable: `select_dypes` and
    # iterating over the undefined name `cars`.
    cats = df.select_dtypes(include=['object', 'category']).columns
    return {c: df[c].value_counts(dropna=False).head(topn) for c in cats}



def corr_top_pairs(df):
    """Return the correlation matrix of the numeric columns of `df`.

    Despite the name, no ranking or pair selection happens here: the full
    square matrix is returned.
    """
    return df.select_dtypes(include=[np.number]).corr()


def outlier_bounds(s, q=0.01):
    """Return the symmetric quantile pair `(s.quantile(q), s.quantile(1 - q))`."""
    lower = s.quantile(q)
    upper = s.quantile(1 - q)
    return lower, upper

def ensure_datetime_index(df, data_col = "DATE"):
    """Return a copy of `df` sorted by, and indexed on, a parsed date column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    data_col : str
        Name of the column holding the dates. The parameter name is kept
        as-is so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        Copy with `data_col` converted via `pd.to_datetime`, sorted
        ascending, and moved into the index.
    """
    g = df.copy()

    # The original body referred to an undefined `date_col` and had no return.
    g[data_col] = pd.to_datetime(g[data_col])
    g = g.sort_values(data_col)
    g = g.set_index(data_col)
    return g

#plotting helpers

def plot_timeseries(df, ycols, title = None):
    """Plot each column in `ycols` against the index of `df` on one axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the x-axis values.
    ycols : iterable of str
        Columns to draw, one line each, labelled by column name.
    title : str, optional
        Axes title; omitted when None.
    """
    fig, ax = plt.subplots(figsize = (8,4))

    for c in ycols:
        # The original read from an undefined `df_ts`.
        ax.plot(df.index, df[c], label = c)
    ax.set(xlabel = "date", ylabel = "value")
    # The original passed the misspelt keyword `tital`, which ax.set rejects.
    if title is not None:
        ax.set_title(title)
    ax.legend()
    fig.tight_layout()

def plot_hist(df, cols, bins = 40):
    """Overlay semi-transparent histograms of the given columns, ignoring NaNs."""
    fig, ax = plt.subplots(figsize=(7, 4))

    for name in cols:
        values = df[name].dropna().values
        ax.hist(values, bins=bins, alpha=0.5, label=name)

    ax.set(xlabel="val", ylabel="counts")
    ax.legend()
    fig.tight_layout()

def plot_scatter(df, x, y):
    """Scatter-plot column `y` against column `x` of `df`, labelling both axes."""
    # `figsize` is a keyword argument; the original called it as a function.
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(df[x], df[y], alpha = 0.7)
    ax.set(xlabel = x, ylabel = y)
    fig.tight_layout()

def plot_correlation_heatmap(df, topk = 12):
    """Draw a correlation heatmap for the `topk` highest-variance numeric columns."""
    numeric = df.select_dtypes(include=[np.number])

    # Rank columns by variance and keep the names of the top `topk`.
    top_cols = numeric.var().sort_values(ascending=False).head(topk).index
    heat = numeric.corr().loc[top_cols, top_cols]

    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(heat.values, aspect="auto")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()


    
    
    
        

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass

from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LarsCV, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.compose import TransformedTargetRegressor
from xgboost import XGBRegressor

##config

@dataclass
class SplitConfig:
    """Settings for the train/test split performed by `load_and_split`."""
    # Passed to train_test_split as `test_size`.
    test_size:float = 0.2
    # Passed to train_test_split as `random_state`.
    random_state: int = 42
@dataclass
class CVConfig:
    """Settings for the KFold splitter built in `cv_compare` and `randomized_tune`."""
    # Number of folds.
    n_splits: int = 5
    # Seed for the fold shuffling.
    random_state: int = 42
    # Whether rows are shuffled before being split into folds.
    shuffle: bool = True


##load and split


def load_and_split(df, target, ignore = None, split_config = None):
    """Separate features from the target and make a random train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
    target : str
        Target column; it is cast to float.
    ignore : iterable of str, optional
        Extra columns to drop from the features.
    split_config : SplitConfig, optional
        Split settings. A fresh `SplitConfig()` is created per call when
        omitted, rather than sharing one instance built at definition time.

    Returns
    -------
    tuple
        ``(X_tr, X_te, y_tr, y_te, num_cols, cat_cols)``, where the last two
        list the numeric and non-numeric feature column names.
    """
    if split_config is None:
        split_config = SplitConfig()

    # Accept any iterable for `ignore`, not only lists.
    dropped = [*(ignore or []), target]

    X = df.drop(columns=dropped)
    y = df[target].astype(float)

    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=split_config.test_size, random_state=split_config.random_state
    )

    return X_tr, X_te, y_tr, y_te, num_cols, cat_cols

#generic preprocessor builder
def make_preprocessor(num_cols, cat_cols):
    """Build the shared ColumnTransformer for numeric and categorical features.

    Numeric columns are median-imputed and standardised. Categorical columns
    are imputed with the most frequent value and one-hot encoded. Columns in
    neither list are dropped.
    """
    num_pre = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
        ])

    cat_pre = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        # Ignore categories unseen at fit time, matching the other encoders in
        # this notebook; the default raises when a CV fold or the test split
        # contains a level missing from the training fold.
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ])

    return ColumnTransformer(
        transformers=[
            ("num", num_pre, num_cols),
            ("cat", cat_pre, cat_cols),
        ],
        remainder="drop"
    )

def make_models(pre, seed):
    """Return the candidate regressors, each wrapped with its own preprocessor.

    Parameters
    ----------
    pre : transformer
        Unfitted preprocessing step; it is cloned for every pipeline.
    seed : int
        `random_state` for the estimators that accept one.

    Returns
    -------
    dict
        Model name mapped to a Pipeline with steps ``pre`` and ``mdl``.
    """
    # Local import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    def build(estimator):
        # Pipeline does not copy its steps, so sharing one `pre` would let
        # fitting any model overwrite the preprocessing state of the others.
        return Pipeline([('pre', clone(pre)), ('mdl', estimator)])

    return {
        "OLS": build(LinearRegression()),
        "Ridge": build(Ridge(alpha=1.0, random_state=seed)),
        "Lasso": build(Lasso(alpha = 0.01, random_state=seed)),
        "ElasticNet": build(ElasticNet(alpha=0.01, l1_ratio=0.5, random_state = seed)),
        "RF": build(RandomForestRegressor(n_estimators=300, random_state = seed)),
        "XGB": build(XGBRegressor(random_state = seed, learning_rate = 0.05, n_estimators = 100, max_depth = 5)),
    }

def cv_compare(models, X, y, cv_cfg = None):
    """Cross-validate every model and rank them by mean RMSE.

    Parameters
    ----------
    models : dict
        Model name mapped to an unfitted estimator.
    X, y : training features and target.
    cv_cfg : CVConfig, optional
        Fold settings; a default `CVConfig()` is used when omitted. The old
        default was the class object itself, which only worked because the
        dataclass defaults are also class attributes.

    Returns
    -------
    pandas.DataFrame
        One row per model with mean and standard deviation of MAE, RMSE and
        R2 across folds, sorted by ascending `RMSE_CV`.
    """
    if cv_cfg is None:
        cv_cfg = CVConfig()

    # KFold rejects a random_state when shuffle is False.
    cv = KFold(
        n_splits=cv_cfg.n_splits,
        shuffle=cv_cfg.shuffle,
        random_state=cv_cfg.random_state if cv_cfg.shuffle else None,
    )
    scoring = {"MAE": "neg_mean_absolute_error", "RMSE": "neg_root_mean_squared_error", "R2": "r2"}
    rows = []

    for name, pipe in models.items():
        print(f"processing model {name}")
        res = cross_validate(pipe, X, y, cv = cv, scoring = scoring, return_train_score=False, n_jobs=-1)
        rows.append({
            "model": name,
            # The error scorers are negated by scikit-learn; flip them back.
            "MAE_CV": -res["test_MAE"].mean(),
            "RMSE_CV": -res["test_RMSE"].mean(),
            "R2_CV": res["test_R2"].mean(),
            "MAE_CV_std": res["test_MAE"].std(),
            "RMSE_CV_std": res["test_RMSE"].std(),
            "R2_CV_std": res["test_R2"].std(),
        })

    return pd.DataFrame(rows).sort_values("RMSE_CV").reset_index(drop = True)


def eval_models(models, X, y):
    """Score each already-fitted model on (X, y).

    Returns a frame with one row per model: the `regression_metrics` values
    followed by a `model` column holding the model's name.
    """
    return pd.DataFrame([
        {**regression_metrics(y, fitted.predict(X)), 'model': label}
        for label, fitted in models.items()
    ])
    
def make_param_grid(model_name):
    """Return the hyper-parameter search space for `model_name`.

    Keys use the ``mdl__`` prefix of the pipeline's estimator step. A new
    dict is built on every call; names without a search space yield ``{}``.
    """
    spaces = (
        ("Ridge", lambda: {
            "mdl__alpha": np.logspace(-4, 3, 20).tolist(),
        }),
        ("ElasticNet", lambda: {
            "mdl__alpha": np.logspace(-4, 3, 20).tolist(),
            "mdl__l1_ratio": np.linspace(0.1, 0.9, 9).tolist(),
        }),
        ("RF", lambda: {
            "mdl__n_estimators": [200, 300, 500],
            "mdl__max_depth": [None, 10, 20],
            # NOTE(review): [1, 25] may have been meant as [1, 2, 5] — confirm.
            "mdl__min_samples_leaf": [1, 25],
        }),
        ("XGB", lambda: {
            "mdl__n_estimators": [50, 100, 400],
            "mdl__learning_rate": [0.01, 0.03, 0.1],
            "mdl__max_depth": [3, 5, 7],
            "mdl__min_child_weight": [1, 3, 5],
            "mdl__subsample": [0.7, 0.85, 1.0],
            "mdl__reg_alpha": [0.0, 1e-3, 1e-2, 0.1],
            "mdl__reg_lambda": [0.1, 1.0, 5.0],
            "mdl__gamma": [0.0, 0.1, 1.0],
        }),
    )
    for known_name, build in spaces:
        if model_name == known_name:
            return build()
    return {}


def randomized_tune(model_name, model, X, y, cv_cfg = None, n_iter = 20, seed = 42):
    """Tune `model` by randomised search, or simply fit it when no grid exists.

    Parameters
    ----------
    model_name : str
        Key passed to `make_param_grid` to look up the search space.
    model : estimator
        Pipeline whose estimator step is named ``mdl``.
    X, y : training features and target.
    cv_cfg : CVConfig, optional
        Fold settings; a fresh `CVConfig()` is created per call when omitted.
    n_iter : int
        Number of sampled parameter settings.
    seed : int
        `random_state` of the search.

    Returns
    -------
    estimator
        The input model fitted in place (no grid), or the best estimator
        found by the search.
    """
    grid = make_param_grid(model_name)
    if not grid:
        # Nothing to tune: fit as configured.
        model.fit(X,y)
        return model

    if cv_cfg is None:
        cv_cfg = CVConfig()

    # KFold rejects a random_state when shuffle is False.
    cv = KFold(
        n_splits=cv_cfg.n_splits,
        shuffle=cv_cfg.shuffle,
        random_state=cv_cfg.random_state if cv_cfg.shuffle else None,
    )
    rs = RandomizedSearchCV(
        estimator = model,
        param_distributions=grid,
        n_iter = n_iter,
        scoring="neg_root_mean_squared_error",
        cv = cv,
        random_state = seed,
        n_jobs=-1
    )

    rs.fit(X,y)
    print(f"{model_name}: best params:", rs.best_params_)
    return rs.best_estimator_


def regression_metrics(y_true, y_pred):
    """Return MAE, RMSE and R2 for a set of predictions as a dict.

    RMSE is the square root of the mean squared error, computed explicitly
    because `mean_squared_error(..., squared=False)` is not available in
    recent scikit-learn releases.
    """
    return{
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "R2": r2_score(y_true, y_pred),
    }

def residual_plot(y_hat, y_true):
    """Scatter the residuals (`y_true - y_hat`) against the predictions."""
    res = y_true - y_hat
    # `figsize` is a keyword argument; the original called it as a function.
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(y_hat, res, alpha = 0.6)
    ax.set(xlabel = "predicted", ylabel = "residual")
    fig.tight_layout()
    
def permutation_importance_df(fitted_model, X, y, n_repeats = 10, seed = 42):
    """Return permutation importances of `fitted_model` on (X, y), largest first.

    The result has a `feature` column taken from `X.columns` and an
    `importance` column holding the mean importance over `n_repeats` shuffles.
    """
    result = permutation_importance(
        fitted_model, X, y,
        n_repeats=n_repeats,
        n_jobs=-1,
        random_state=seed,
    )
    table = pd.DataFrame({
        'feature': X.columns.to_list(),
        "importance": result.importances_mean,
    })
    return table.sort_values("importance", ascending=False)

    

In [None]:
# Load the wine-quality data; this rebinds `df`, replacing the frame used above.
df = pd.read_csv("/sphenix/user/shuhangli/ppg12/FunWithxgboost/winequality-data.csv")

In [None]:
# Shape, dtypes and head/tail of the wine frame.
basic_info(df)

In [None]:
# Columns with missing values, if any.
missing_report(df)

In [None]:
# List the available column names.
df.columns.to_list()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
 # Hold out a test split; `quality` is the target and `id` is dropped from the features.
 X_tr, X_te, y_tr, y_te, num_cols, cat_cols = load_and_split(df, target='quality', ignore=['id'])

In [None]:
# Baseline comparison: cross-validate every candidate model on the training split.
pre = make_preprocessor(num_cols,cat_cols)
models = make_models(pre, seed = 42)
cv_result = cv_compare(models, X_tr, y_tr)

In [None]:
# Baseline CV table, sorted by RMSE.
print(cv_result)

In [None]:
# Tune each model that has a search grid; models without one are fitted as configured.
best_models = {}
for model_name, model in models.items():
    best_models[model_name] = randomized_tune(model_name, model, X_tr, y_tr)

In [None]:
# Re-run the cross-validated comparison with the tuned estimators.
best_model_cv_result = cv_compare(best_models, X_tr, y_tr)

In [None]:
# CV table for the tuned models, sorted by RMSE.
print(best_model_cv_result)

In [None]:
# Permutation importance of the tuned random forest, measured on the held-out split.
rf_perm_result = permutation_importance_df(best_models['RF'],X_te, y_te) 

In [None]:
# Plot importances by feature, then show the ten largest as a table.
rf_perm_result.plot(x = 'feature', y = 'importance')
rf_perm_result.head(10)

In [None]:
# Distribution of the `alcohol` column.
plot_hist(df, ['alcohol'])

In [None]:
 # Same split call as in the earlier cell; re-running it rebinds the six names.
 X_tr, X_te, y_tr, y_te, num_cols, cat_cols = load_and_split(df, target='quality', ignore=['id'])