In [33]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [45]:
# Read the Stata (.dta) file from the Kaggle input mount into `df`.
stata_path = "/kaggle/input/d/rupsha137/mutual-funds/net_f_2.dta"
df = pd.read_stata(stata_path)

In [46]:
# Drop every row that has a missing value, then keep only rows whose Sortino
# and Sharpe values are at or below the two thresholds (10 and 5.5).
df = df.dropna()
within_limits = (df["ann_net_sortino"] <= 10) & (df["ann_net_sharpe_ratio"] <= 5.5)
df = df[within_limits]

# Cell output: number of distinct `a1` values left after filtering.
df["a1"].nunique()

2906

In [47]:
# Columns removed from `df` before modelling.
columns_to_drop = [
    'net_st_info_ratio',
    'net_st_tracking_error',
    'gross_st_tracking_error',
    'gross_mk_tracking_error',
    'gross_st_info_ratio',
    'gross_mk_info_ratio',
    'ann_gross_sharpe_ratio',
    'ann_gross_sortino',
    'obj1',
]

# Only drop the columns that are still present so the cell can be re-run on an
# already-trimmed `df` without raising KeyError. On a first run this drops
# exactly the columns listed above.
present_to_drop = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=present_to_drop)

In [48]:
# Standardise the predictor columns in place.
# Every column except the fund id (`a1`), the year (`y`) and the target
# (`ann_net_rf_ret`) is treated as a feature.
# NOTE(review): the scaler is fitted on all rows of `df`, i.e. on every year at
# once, not per training window — confirm this is intended for how the
# coefficients are interpreted.
non_feature_cols = {"a1", "y", "ann_net_rf_ret"}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

scaler = StandardScaler()
scaler.fit(df[feature_cols])
df[feature_cols] = scaler.transform(df[feature_cols])

In [49]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# ---- Rolling-window OLS ----
# For every test year 2005..2021, fit a LinearRegression on the rows whose `y`
# lies in the ten preceding years and predict `ann_net_rf_ret` for the rows of
# the test year. Collected per year:
#   * ols_preds      - one record per test row (fund id, actual, predicted)
#   * performance    - R^2, RMSE and MAE on the test year
#   * ols_importance - the fitted coefficient of every feature
# The three collections are written as CSV files to /kaggle/working.
ols_preds = []
performance = []
ols_importance = []

for target_year in range(2005, 2022):
    train_years = list(range(target_year - 10, target_year))
    test_year = target_year

    train_data = df[df['y'].isin(train_years)]
    test_data = df[df['y'] == test_year]

    # Skip years for which either side of the split has no rows.
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data[feature_cols]
    y_train = train_data["ann_net_rf_ret"]
    X_test = test_data[feature_cols]
    y_test = test_data["ann_net_rf_ret"]
    fund_test = test_data["a1"]

    # Fit model
    ols = LinearRegression()
    ols.fit(X_train, y_train)
    preds = ols.predict(X_test)

    # Store predictions (one record per test row, in row order)
    ols_preds.extend(
        {
            'a1': fund_id,
            'year': test_year,
            'actual_value': actual,
            'predicted_value': predicted,
            'model': 'OLS'
        }
        for fund_id, actual, predicted in zip(
            fund_test.tolist(), y_test.tolist(), preds.tolist()
        )
    )

    # Store performance.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form works on every version.
    performance.append({
        'model': 'OLS',
        'year': test_year,
        'r2': r2_score(y_test, preds),
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae': mean_absolute_error(y_test, preds)
    })

    # Store coefficient importance (signed coefficient, aligned with feature_cols)
    ols_importance.extend(
        {
            'model': 'OLS',
            'predictor': col,
            'importance': coef,
            'year': test_year
        }
        for col, coef in zip(feature_cols, ols.coef_)
    )

# ---- Convert to DataFrames ----
ols_preds_df = pd.DataFrame(ols_preds)
ols_perf_df = pd.DataFrame(performance)
ols_imp_df = pd.DataFrame(ols_importance)

# ---- Save to CSV ----
ols_preds_df.to_csv("/kaggle/working/ols_predictions.csv", index=False)
ols_perf_df.to_csv("/kaggle/working/ols_performance.csv", index=False)
ols_imp_df.to_csv("/kaggle/working/ols_feature_importance.csv", index=False)


In [None]:
import os

# Delete the OLS output files from the Kaggle working directory.
# A file that is already absent (fresh session, or this cell run twice) is
# skipped instead of raising FileNotFoundError.
for stale_csv in (
    "/kaggle/working/ols_predictions.csv",
    "/kaggle/working/ols_performance.csv",
    "/kaggle/working/ols_feature_importance.csv",
):
    if os.path.exists(stale_csv):
        os.remove(stale_csv)

In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [51]:


# ---- Rolling-window Random Forest (fixed hyperparameters) ----
# Uses the `df` and `feature_cols` prepared in the earlier cells.
# For every test year 2005..2021, fit a 100-tree RandomForestRegressor on the
# rows whose `y` lies in the ten preceding years and predict `ann_net_rf_ret`
# for the rows of the test year. Predictions, per-year metrics and per-year
# feature importances are collected and written to /kaggle/working.
rf_preds = []
performance = []
rf_importance = []

for target_year in range(2005, 2022):
    train_years = list(range(target_year - 10, target_year))
    test_year = target_year

    train_data = df[df['y'].isin(train_years)]
    test_data = df[df['y'] == test_year]

    # Skip years for which either side of the split has no rows.
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data[feature_cols]
    y_train = train_data["ann_net_rf_ret"]
    X_test = test_data[feature_cols]
    y_test = test_data["ann_net_rf_ret"]
    fund_test = test_data["a1"]

    # Fit Random Forest model.
    # n_jobs=-1 builds the trees on all available cores; the fitted forest is
    # still controlled by random_state.
    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)

    # Store predictions (one record per test row, in row order)
    rf_preds.extend(
        {
            'a1': fund_id,
            'year': test_year,
            'actual_value': actual,
            'predicted_value': predicted,
            'model': 'RandomForest'
        }
        for fund_id, actual, predicted in zip(
            fund_test.tolist(), y_test.tolist(), preds.tolist()
        )
    )

    # Store performance.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    performance.append({
        'model': 'RandomForest',
        'year': test_year,
        'r2': r2_score(y_test, preds),
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae': mean_absolute_error(y_test, preds)
    })

    # Store feature importances (aligned with feature_cols)
    rf_importance.extend(
        {
            'model': 'RandomForest',
            'predictor': col,
            'importance': importance,
            'year': test_year
        }
        for col, importance in zip(feature_cols, rf.feature_importances_)
    )

# ---- Convert to DataFrames ----
rf_preds_df = pd.DataFrame(rf_preds)
rf_perf_df = pd.DataFrame(performance)
rf_imp_df = pd.DataFrame(rf_importance)

# ---- Save to CSV ----
rf_preds_df.to_csv("/kaggle/working/rf_predictions.csv", index=False)
rf_perf_df.to_csv("/kaggle/working/rf_performance.csv", index=False)
rf_imp_df.to_csv("/kaggle/working/rf_feature_importance.csv", index=False)


KeyboardInterrupt: 

In [None]:
# ---- Rolling-window Random Forest with randomized hyperparameter search ----
# Same ten-year training window / one-year test split as the other model
# cells, but the forest's hyperparameters are re-tuned for every test year
# with RandomizedSearchCV (20 sampled candidates, 3-fold CV, negative MSE).
# This cell needs `randint` (scipy.stats) and `RandomizedSearchCV`
# (sklearn.model_selection) to be imported before it runs.

# ---- Initialize Lists ----
rf_preds = []
rf_perf = []
rf_importance = []

# ---- Hyperparameter Search Space ----
# scipy's randint(low, high) samples integers from low up to high - 1.
# max_features: 1.0 (use all features) replaces the former 'auto' option, which
# meant the same thing for regressors and is rejected by scikit-learn >= 1.3.
param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': [1.0, 'sqrt', 'log2']
}

# ---- Loop Over Years ----
for target_year in range(2005, 2022):
    train_years = list(range(target_year - 10, target_year))
    test_year = target_year

    train_data = df[df['y'].isin(train_years)]
    test_data = df[df['y'] == test_year]

    # Skip years for which either side of the split has no rows.
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data[feature_cols]
    y_train = train_data["ann_net_rf_ret"]
    X_test = test_data[feature_cols]
    y_test = test_data["ann_net_rf_ret"]
    fund_test = test_data["a1"]

    # ---- Hyperparameter Tuning ----
    base_rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    random_search = RandomizedSearchCV(
        base_rf,
        param_distributions=param_dist,
        n_iter=20,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train, y_train)
    # best_estimator_ is the winning candidate as returned by the search
    # (refit on the whole training window with the default refit=True).
    best_rf = random_search.best_estimator_

    # ---- Predict ----
    preds = best_rf.predict(X_test)

    # Store predictions
    for i in range(len(preds)):
        rf_preds.append({
            'a1': fund_test.iloc[i],
            'year': test_year,
            'actual_value': y_test.iloc[i],
            'predicted_value': preds[i],
            'model': 'Random Forest'
        })

    # Store performance
    rf_perf.append({
        'model': 'Random Forest',
        'year': test_year,
        'r2': r2_score(y_test, preds),
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae': mean_absolute_error(y_test, preds)
    })

    # Store feature importance (aligned with feature_cols)
    for i, col in enumerate(feature_cols):
        rf_importance.append({
            'model': 'Random Forest',
            'predictor': col,
            'importance': best_rf.feature_importances_[i],
            'year': test_year
        })

    # ---- Progress Log ----
    # The check-mark was stored as mis-decoded bytes; restored to the real character.
    print(f"✅ Completed Random Forest for year: {test_year}")

# ---- Convert to DataFrames ----
rf_preds_df = pd.DataFrame(rf_preds)
rf_perf_df = pd.DataFrame(rf_perf)
rf_imp_df = pd.DataFrame(rf_importance)

# ---- Save to CSV ----
rf_preds_df.to_csv("/kaggle/working/rf_predictions.csv", index=False)
rf_perf_df.to_csv("/kaggle/working/rf_performance.csv", index=False)
rf_imp_df.to_csv("/kaggle/working/rf_feature_importance.csv", index=False)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint, uniform

# ---- Rolling-window Gradient Boosting with two-stage tuning ----
# For every test year 2005..2021 the model is trained on the rows whose `y`
# lies in the ten preceding years. Hyperparameters are chosen per year in two
# stages: a broad RandomizedSearchCV, then a narrow GridSearchCV centred on
# the randomized search's best parameters. Predictions, per-year metrics and
# per-year feature importances are collected and written to /kaggle/working.
gb_preds = []
gb_perf = []
gb_importance = []

# ---- Broad Param Distribution for RandomizedSearchCV ----
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so
# learning_rate is drawn from [0.03, 0.13] and subsample from [0.7, 1.0].
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': randint(3, 7),
    'learning_rate': uniform(0.03, 0.1),
    'subsample': uniform(0.7, 0.3)
}

# ---- Loop Over Years ----
for target_year in range(2005, 2022):
    train_years = list(range(target_year - 10, target_year))
    test_year = target_year

    train_data = df[df['y'].isin(train_years)]
    test_data = df[df['y'] == test_year]

    # Skip years for which either side of the split has no rows.
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data[feature_cols]
    y_train = train_data["ann_net_rf_ret"]
    X_test = test_data[feature_cols]
    y_test = test_data["ann_net_rf_ret"]
    fund_test = test_data["a1"]

    # ---- Randomized Search (Broad) ----
    base_gb = GradientBoostingRegressor(random_state=42)
    rand_search = RandomizedSearchCV(
        estimator=base_gb,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42
    )
    rand_search.fit(X_train, y_train)

    # ---- Narrow Grid Search (Fine-Tuning) ----
    best_params = rand_search.best_params_
    best_n = best_params['n_estimators']
    best_depth = best_params['max_depth']
    best_lr = best_params['learning_rate']
    best_sub = best_params['subsample']

    # Three values around each best parameter. Clamping can make neighbouring
    # values collapse (e.g. best_n == 50 gives 50 twice); sorted(set(...))
    # removes such duplicates so the grid never re-fits identical candidates.
    grid_param = {
        'n_estimators': sorted({max(50, best_n - 20), best_n, best_n + 20}),
        'max_depth': sorted({max(1, best_depth - 1), best_depth, best_depth + 1}),
        'learning_rate': sorted({best_lr * 0.8, best_lr, best_lr * 1.2}),
        'subsample': sorted({max(0.6, best_sub - 0.1), best_sub, min(1.0, best_sub + 0.1)})
    }

    grid_search = GridSearchCV(
        estimator=base_gb,
        param_grid=grid_param,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_gb = grid_search.best_estimator_

    # ---- Predict ----
    preds = best_gb.predict(X_test)

    # Store predictions (one record per test row, in row order)
    gb_preds.extend(
        {
            'a1': fund_id,
            'year': test_year,
            'actual_value': actual,
            'predicted_value': predicted,
            'model': 'Gradient Boosting'
        }
        for fund_id, actual, predicted in zip(
            fund_test.tolist(), y_test.tolist(), preds.tolist()
        )
    )

    # Store performance.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    gb_perf.append({
        'model': 'Gradient Boosting',
        'year': test_year,
        'r2': r2_score(y_test, preds),
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae': mean_absolute_error(y_test, preds)
    })

    # Store feature importance (aligned with feature_cols)
    gb_importance.extend(
        {
            'model': 'Gradient Boosting',
            'predictor': col,
            'importance': importance,
            'year': test_year
        }
        for col, importance in zip(feature_cols, best_gb.feature_importances_)
    )

# ---- Convert to DataFrames ----
gb_preds_df = pd.DataFrame(gb_preds)
gb_perf_df = pd.DataFrame(gb_perf)
gb_imp_df = pd.DataFrame(gb_importance)

# ---- Save to CSV ----
gb_preds_df.to_csv("/kaggle/working/gb_predictions.csv", index=False)
gb_perf_df.to_csv("/kaggle/working/gb_performance.csv", index=False)
gb_imp_df.to_csv("/kaggle/working/gb_feature_importance.csv", index=False)
