### Machine Learning in Finance
- Sheida Majidi

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
from joblib import Parallel, delayed

In [3]:
# Directory holding the homework CSVs. Kept in a single constant so the notebook
# can be pointed at another machine's copy of the data by editing one line.
DATA_DIR = '/Users/sheidamajidi/Desktop/Summer2024/FINE695/HW1/data'

# factors_char_list: table whose 'variable' column names the predictor columns used for imputation
factors_char_list = pd.read_csv(f'{DATA_DIR}/factors_char_list.csv')
# sample_big: the raw sample that is checked for, and later imputed of, missing values
sample_big = pd.read_csv(f'{DATA_DIR}/homework_sample_big.csv')

### Data preprocessing

In [5]:
# Tally the NaN entries of the raw sample, one count per column, before any imputation
missing_values = sample_big.isna().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 date              0
ret_eom           0
permno            0
stock_ret         0
mspread           0
               ... 
qmj_prof         17
qmj_growth     7653
qmj_safety        3
rf                0
stock_exret       0
Length: 159, dtype: int64


In [47]:
# handle missing values (approach: bagging imputation with unweighted bootstrap averaging)

def simple_bagging_impute(model, X_train, y_train, X_missing, n_iter=50, random_state=1234):
    """Predict missing targets by averaging models fitted on bootstrap resamples.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every bootstrap iteration.
    X_train : array-like, one row per observed sample
        Predictors for the rows where the target is observed.
    y_train : array-like, one entry per row of ``X_train``
        Observed target values.
    X_missing : array-like, one row per sample to impute
        Predictors for the rows where the target is missing.
    n_iter : int, default 50
        Number of bootstrap resamples; must be at least 1.
    random_state : int, default 1234
        Seed for the resampling generator, so repeated calls give identical output.

    Returns
    -------
    numpy.ndarray
        One value per row of ``X_missing``: the mean prediction across all
        bootstrap fits. Empty when ``X_missing`` has no rows.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1 (averaging zero fits would silently
        produce NaN "imputations").
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Coerce to arrays so positional fancy-indexing below also works for
    # lists and pandas objects, not only for ndarrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_missing = np.asarray(X_missing)

    n_missing = X_missing.shape[0]
    if n_missing == 0:
        # Nothing to impute; skip fitting entirely.
        return np.zeros(0)

    rng = np.random.RandomState(random_state)
    n = X_train.shape[0]
    index = np.arange(n)

    predictions = np.zeros((n_iter, n_missing))

    for i in range(n_iter):
        # Bootstrap: draw n row positions with replacement.
        resample_ind = rng.choice(index, size=n, replace=True)

        # Fit on the resampled rows, then predict the rows to impute.
        model.fit(X_train[resample_ind], y_train[resample_ind])
        predictions[i] = model.predict(X_missing)

    # Bagged estimate: unweighted mean over all bootstrap fits.
    return predictions.mean(axis=0)

# handle missing values - bagging approach
def fill_missing_values_with_bagging(data, columns, factors_char_list, n_iter=50):
    """Return a copy of ``data`` with NaNs in ``columns`` replaced by bagged regression predictions.

    For each column that contains missing values, a ``LinearRegression`` is
    bagged (via ``simple_bagging_impute``) on the rows where the column is
    observed and used to predict the rows where it is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    columns : iterable of column labels
        Columns to impute. Columns without missing values are skipped.
    factors_char_list : pandas.DataFrame
        Must have a ``'variable'`` column naming the predictor columns of ``data``.
    n_iter : int, default 50
        Number of bootstrap resamples passed to ``simple_bagging_impute``.

    Returns
    -------
    pandas.DataFrame
        The imputed copy of ``data``.

    Raises
    ------
    ValueError
        If a column to impute has no observed values to train on.

    Notes
    -----
    Columns are processed in order and written back into the working copy, so
    values imputed for an earlier column are visible as predictors when a later
    column is imputed.
    """
    filled_data = data.copy()
    all_predictors = list(factors_char_list['variable'])

    for column in columns:
        missing_indices = filled_data[column].isnull()
        if not missing_indices.any():
            continue

        observed = ~missing_indices
        if not observed.any():
            raise ValueError(f"Cannot impute column {column!r}: it has no observed values")

        # Never use the target as its own predictor: on the training rows it
        # would explain y exactly, and on the rows to fill it is all-NaN
        # (mean-filled), which collapses the model to plain mean imputation.
        predictors = [name for name in all_predictors if name != column]

        # Mean-fill gaps in the predictors; the means come from the training rows only.
        imputer = SimpleImputer(strategy='mean')

        # Work with the imputer's arrays directly. Re-wrapping them in a
        # DataFrame with the original column labels fails when the imputer
        # drops a predictor that is entirely NaN in the training rows.
        X_train = imputer.fit_transform(filled_data.loc[observed, predictors])
        y_train = filled_data.loc[observed, column].to_numpy()
        X_missing = imputer.transform(filled_data.loc[missing_indices, predictors])

        y_pred = simple_bagging_impute(LinearRegression(), X_train, y_train, X_missing, n_iter=n_iter)
        filled_data.loc[missing_indices, column] = y_pred

    return filled_data

In [48]:
# Impute every column of the raw sample, then print the per-column NaN counts that remain
sample_big_filled = fill_missing_values_with_bagging(
    data=sample_big,
    columns=sample_big.columns,
    factors_char_list=factors_char_list,
)
remaining_nulls = sample_big_filled.isna().sum()
print(remaining_nulls)

date           0
ret_eom        0
permno         0
stock_ret      0
mspread        0
              ..
qmj_prof       0
qmj_growth     0
qmj_safety     0
rf             0
stock_exret    0
Length: 159, dtype: int64
