In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
import pickle

In [2]:
def subset_stock_data(data, start_date, end_date, verbose=False):
    """
    Subsets the given dataframe based on a specified (inclusive) date range.

    The input dataframe is left untouched: dates are parsed on a separate
    Series and only the returned subset carries the parsed 'date' column.

    Args:
        data (pandas.DataFrame): The dataframe containing the stock data. Must have a 'date' column.
        start_date (str, pandas.Timestamp or None): Inclusive lower bound. None means no lower bound.
        end_date (str, pandas.Timestamp or None): Inclusive upper bound. None means no upper bound.
        verbose (bool, optional): If True, prints a success message. Defaults to False.

    Returns:
        pandas.DataFrame: A new dataframe holding the rows inside the date range,
        with its 'date' column converted to datetime.

    Raises:
        ValueError: If the dataframe does not contain a 'date' column.

    """
    # Check if 'date' column exists in the dataframe
    if 'date' not in data.columns:
        raise ValueError("DataFrame does not contain a 'date' column.")

    # Parse the dates without writing back into the caller's dataframe
    dates = pd.to_datetime(data['date'])

    # Start from "keep everything" and narrow down with each bound that is given
    mask = np.ones(len(data), dtype=bool)

    if start_date is not None:
        if not isinstance(start_date, pd.Timestamp):
            start_date = pd.to_datetime(start_date)
        mask &= (dates >= start_date).to_numpy()

    if end_date is not None:
        if not isinstance(end_date, pd.Timestamp):
            end_date = pd.to_datetime(end_date)
        mask &= (dates <= end_date).to_numpy()

    # Copy the selected rows so that replacing the 'date' column is safe
    subset = data[mask].copy()
    subset['date'] = dates[mask]

    if verbose:
        print(f'Successfully subsetted data from {start_date} to {end_date}.')
    return subset

In [3]:
def remove_non_numerical_columns(data, verbose=False):
    """
    Return a copy of a dataframe that keeps only its numerical columns.

    A column named 'date' is always removed, even when its dtype is numerical.

    Parameters:
    - data: pandas DataFrame
        The input dataframe; it is not modified.
    - verbose: bool, optional
        If True, print the names of the non-numerical columns that were removed
        (nothing is printed when there were none). Default is False.

    Returns:
    - pandas DataFrame
        A new dataframe without the non-numerical columns and without 'date'.
    """
    # The dtype is inspected on a 10-row preview of the frame
    preview = data.head(10)

    # Collect every column whose dtype is not numerical
    non_numerical_columns = [
        name for name in data.columns
        if not pd.api.types.is_numeric_dtype(preview[name])
    ]

    # drop() returns a new frame, so the caller's dataframe stays intact
    cleaned = data.drop(columns=non_numerical_columns)

    # A numerically encoded 'date' column is not a feature either
    if 'date' in cleaned.columns:
        cleaned = cleaned.drop(columns=['date'])

    if verbose and non_numerical_columns:
        print("Successfully removed columns with non-numerical values:", non_numerical_columns)

    return cleaned


In [4]:
def preprocess_data(data, interpolate = False, start_date=None, end_date=None, verbose=False):
    """
    Preprocesses the input data by performing the following steps:
    1. Subset the data based on the specified start and end dates.
    2. Remove non-numerical columns (and the 'date' column) from the subsetted data.
    3. Optionally fill missing values by forward interpolation.
    4. Drop the 'permno' identifier column if it is present.
    5. Scale the remaining columns using StandardScaler.

    Args:
        data (pd.DataFrame): The input data to be preprocessed.
        interpolate (bool, optional): If True, interpolate the numerical columns with
            limit_direction='forward' before scaling. Defaults to False.
        start_date (str, optional): The start date for subsetting the data. Defaults to None.
        end_date (str, optional): The end date for subsetting the data. Defaults to None.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.

    Returns:
        pd.DataFrame: The scaled data, with the same column names as the retained
        numerical columns. The frame is rebuilt from the scaler output, so it has
        a fresh default index rather than the index of the input rows.

    Raises:
        TypeError: If data is not a pandas DataFrame.
    """
    # isinstance also accepts DataFrame subclasses; TypeError is still an Exception
    # for callers that catch the broad type.
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data must be a pandas dataframe')

    subset_data = subset_stock_data(data, start_date, end_date, verbose=verbose)
    subset_numerical_data = remove_non_numerical_columns(subset_data, verbose=verbose)

    if interpolate:
        subset_numerical_data = subset_numerical_data.interpolate(limit_direction='forward')

    # 'permno' is numerical but is excluded from the scaled features
    if 'permno' in subset_numerical_data.columns:
        subset_numerical_data = subset_numerical_data.drop(columns=['permno'])

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(subset_numerical_data)

    # make a DataFrame with the scaled data so the column names stay available
    scaled_data = pd.DataFrame(scaled_values, columns=subset_numerical_data.columns)

    if verbose:
        print('Successfully scaled data.')

    return scaled_data

In [5]:
def find_q(explained_variance, required_explained_var = 0.95):
    """
    Finds the minimum number of principal components (q) required to explain a given amount of variance.

    Parameters:
    explained_variance (list): A list of explained variances for each principal component.
    required_explained_var (float): The required amount of variance to be explained (default is 0.95).

    Returns:
    int: The minimum number of principal components required to explain the given amount of variance.
         If the cumulative variance never reaches the requested amount (for example
         because of floating-point rounding just below 1.0), all components are
         needed and their total count is returned.

    Raises:
    ValueError: If explained_variance is empty.
    """
    cumulative_expl_var = np.cumsum(explained_variance)
    if len(cumulative_expl_var) == 0:
        raise ValueError('explained_variance must contain at least one value.')

    # First position whose running total reaches the threshold; the fallback
    # covers the case where even the full sum stays below it.
    return next(
        (position + 1
         for position, running_total in enumerate(cumulative_expl_var)
         if running_total >= required_explained_var),
        len(cumulative_expl_var),
    )

In [6]:
def fit_pca(data):
    """
    Fit a PCA model with default settings on the given data.

    Args:
        data (array-like): The data to fit; passed unchanged to PCA.fit.

    Returns:
        list: A two-element list [explained_variance_ratio_, components_] taken
        from the fitted PCA model.
    """
    model = PCA().fit(data)
    variance_ratios = model.explained_variance_ratio_
    components = model.components_
    return [variance_ratios, components]

In [7]:
def fit_pfa(data, principal_components, q, diff_n_features, random_state=None):
    """
    Perform feature selection using Principal Feature Analysis (PFA).

    The rows of the transposed, truncated component matrix (one row per original
    feature) are clustered with KMeans; for every cluster the feature whose row
    lies closest to the cluster centre is selected.

    Parameters:
    - data: array-like (numpy array or pandas DataFrame)
        The input data matrix. It is converted with np.asarray, so a DataFrame
        is accepted as well.
    - principal_components: array-like
        The principal components obtained from PCA.
    - q: int
        The number of principal components to consider.
    - diff_n_features: int
        The difference between the number of features to select and the number of principal components.
    - random_state: int, RandomState instance or None, optional
        Forwarded to KMeans. Pass an int to make the selection reproducible.
        Default is None, which keeps KMeans' own default seeding.

    Returns:
    - indices: list
        The indices of the selected features, one per non-empty cluster, in the
        order in which the clusters are first encountered.
    - features: numpy array
        The selected columns of the input data matrix.
    """
    # Positional column indexing below needs a plain array, not a DataFrame
    data = np.asarray(data)
    A_q = np.asarray(principal_components).T[:, :q]

    # Never ask for more clusters than there are features
    clusternumber = min(q + diff_n_features, data.shape[1])

    kmeans = KMeans(n_clusters=clusternumber, random_state=random_state).fit(A_q)
    clusters = kmeans.predict(A_q)
    cluster_centers = kmeans.cluster_centers_

    # cluster label -> list of (feature index, distance to that cluster's centre)
    dists = defaultdict(list)
    for i, c in enumerate(clusters):
        dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
        dists[c].append((i, dist))

    # Keep the closest feature of each cluster; min() returns the first minimum,
    # matching a stable sort followed by taking the first element.
    indices = [min(members, key=lambda pair: pair[1])[0] for members in dists.values()]
    features = data[:, indices]
    return indices, features

In [8]:
def transform_pca(data, fitted, principal_components, q, preprocess_data=None):
    """
    Project the input data onto the first q principal components.

    Args:
        data (array-like): The input data to be transformed.
        fitted (bool): Whether the PCA model has been fitted; a falsy value aborts the call.
        principal_components (array-like): The principal components obtained from the PCA model.
        q (int): How many leading components to keep in the result.
        preprocess_data (function, optional): If given, it is applied to data first and
            its result is projected instead of the raw input.

    Returns:
        array-like: The data multiplied by the transposed component matrix,
        restricted to the first q columns.

    Raises:
        Exception: If the model has not been fitted to the data.
    """
    # Preprocessing runs first, before the fitted check
    scaled_data = data if preprocess_data is None else preprocess_data(data)

    if fitted:
        projection = np.array(scaled_data) @ np.transpose(principal_components)
        return projection[:, :q]

    raise Exception('The model has not been fitted to the data.')

def transform_pfa(data, fitted, features, preprocess_data=None):
    """
    Return the features selected by Principal Feature Analysis (PFA).

    NOTE(review): the function hands back `features` exactly as received; the
    (optionally preprocessed) `data` does not influence the result. Confirm
    whether selecting columns from `data` was intended.

    Args:
        data (array-like): The input data. Only passed to preprocess_data, if given.
        fitted (bool): Whether the model has been fitted; a falsy value aborts the call.
        features (array-like): The already selected features; returned unchanged.
        preprocess_data (function, optional): If given, it is called with data.
            Its return value is not used.

    Returns:
        array-like: The `features` argument, unchanged.

    Raises:
        Exception: If the model has not been fitted to the data.
    """
    # Kept so that a supplied preprocessing callable is still invoked
    if preprocess_data is not None:
        preprocess_data(data)

    # Same truthiness check as transform_pca
    if not fitted:
        raise Exception('The model has not been fitted to the data.')
    return features

In [9]:
def fit_transform(data, method, start_date=None, end_date=None):
    """
    Applies feature selection to the input data using the specified method.

    The data is preprocessed, a PCA is fitted, and the number of components q
    is chosen with find_q's default variance threshold. For 'pca' the data is
    projected onto the first q components; for 'pfa' q features are selected
    with Principal Feature Analysis.

    Args:
        data (pandas.DataFrame): The input data to be transformed.
        method (str): The feature selection method to be used. Must be either
            'pca' or 'pfa' (case-insensitive).
        start_date (str, optional): Forwarded to preprocess_data. Defaults to None.
        end_date (str, optional): Forwarded to preprocess_data. Defaults to None.

    Returns:
        numpy.ndarray: The transformed data after applying feature selection.

    Raises:
        Exception: If the method is not 'pca' or 'pfa'.
    """
    # Normalise the case once so that validation and dispatch agree
    method_key = method.lower() if isinstance(method, str) else method
    if method_key not in ('pca', 'pfa'):
        raise Exception("Method must be either 'pca' or 'pfa'")

    scaled_data = preprocess_data(data, start_date=start_date, end_date=end_date)

    # Both methods start from the same PCA fit and component count
    explained_variance, principal_components = fit_pca(scaled_data)
    q = find_q(explained_variance)

    if method_key == 'pca':
        return transform_pca(scaled_data, True, principal_components, q)

    # 'pfa': select exactly q features; fit_pfa indexes columns positionally,
    # so hand it a plain array instead of the DataFrame
    diff_n_features = 0
    indices, features = fit_pfa(np.asarray(scaled_data), principal_components, q, diff_n_features)
    return transform_pfa(scaled_data, True, features)

In [10]:
# Load the labelled dataset; the path is relative to the notebook's working directory.
stationary_data = pd.read_csv(filepath_or_buffer='../../data/datasetlabel.csv')

In [13]:
# Remove the 'target' column from the loaded data.
stationary_data = stationary_data.drop(labels='target', axis='columns')

In [14]:
nan_threshold = 0.07  # maximum tolerated fraction of missing values per column

# find the number of nan values in each column
nan_values = stationary_data.isna().sum()

# find the columns whose nan count exceeds the threshold share of all rows
columns_to_drop = nan_values[nan_values > (nan_threshold * stationary_data.shape[0])].index

# report how many columns are dropped: len() counts them, whereas np.sum on an
# index of names would concatenate the strings
print(f'dropping {len(columns_to_drop)} columns:\n', columns_to_drop)
stationary_data = stationary_data.drop(columns=columns_to_drop)

dropping naics_processedret_industry_totret_industry_relative columns:
 Index(['naics_processed', 'ret_industry_tot', 'ret_industry_relative'], dtype='object')


In [15]:
# allow up to 1000 rows to be displayed so the per-column report is not truncated
pd.options.display.max_rows = 1000

# number of missing values in each column
print('Missing values per column: \n', stationary_data.isna().sum())
print('\n')
# grand total of missing cells in the whole frame
print('Total number of missing values: ', stationary_data.isna().sum().sum())

# rows that contain at least one missing value
print('Number of rows with missing values: ', stationary_data.isna().any(axis='columns').sum())

Missing values per column: 
 permno                 0
CAPEI              17900
bm                103417
evm                13604
pe_op_basic        52880
pe_op_dil          53402
pe_exi             53502
pe_inc             53320
ps                  2406
pcf                 5529
npm                 2413
opmbd               2411
opmad               2410
gpm                 2596
ptpm                2414
cfm                18029
roa                 5816
roe               107309
roce               18305
aftret_eq           5611
aftret_invcapx     42299
aftret_equity       5613
GProf               2269
equity_invcap       3608
debt_invcap        13331
totdebt_invcap     14395
capital_ratio      11613
cash_lt             4600
debt_at            12683
debt_ebitda        24396
short_debt        176512
lt_debt            11611
cash_debt          24631
fcf_ocf           149034
lt_ppent           38389
dltt_be           107077
debt_assets         4604
debt_capital       29791
de_ratio            4

In [16]:
# Keep only the rows that have no missing value in any column.
stationary_data_full = stationary_data.dropna(axis=0, how='any')

In [17]:
def find_features(data, start_date, period_duration, periods, explained_variance_threshold=0.95, diff_n_features=2):
    """
    Select features with PCA + PFA for a sequence of consecutive time periods.

    For each period the data is subsetted and scaled, a PCA is fitted, the
    number of components needed for the variance threshold is determined, and
    fit_pfa picks the features. Progress messages are printed along the way.

    Args:
        data (pandas.DataFrame): Input data without missing values.
        start_date (str or pandas.Timestamp): First day of the first period.
        period_duration (int): Length of every period in years.
        periods (int): Number of consecutive periods to process.
        explained_variance_threshold (float, optional): Variance share passed to find_q.
            Defaults to 0.95.
        diff_n_features (int, optional): Passed to fit_pfa. Defaults to 2.

    Returns:
        dict: Maps each period's start date (pandas.Timestamp) to the list of
        selected column names.

    Raises:
        AssertionError: If data contains missing values.
    """
    start_date = pd.to_datetime(start_date)

    total_missing = data.isnull().sum().sum()
    assert total_missing == 0, 'Data contains missing values.'

    # period start -> names of the selected features
    features_dict = {}

    for period_index in range(periods):
        # each period starts where the previous one ended and ends one day
        # before the next one begins
        years_elapsed = period_duration * period_index
        new_start_date = start_date + pd.DateOffset(years=years_elapsed)
        end_date = new_start_date + pd.DateOffset(years=period_duration) - pd.DateOffset(days=1)

        # work on a copy so the caller's frame is never touched by preprocessing
        scaled_data = preprocess_data(
            data.copy(),
            start_date=new_start_date,
            end_date=end_date,
            verbose=True,
        )

        # PCA fit, then the component count for the requested variance share
        explained_variance, principal_components = fit_pca(scaled_data)
        q = find_q(explained_variance, explained_variance_threshold)
        print(f'Number of principal components to explain {explained_variance_threshold*100}% of the variance: {q}')

        # PFA returns column positions; translate them back into column names
        indices, _selected_values = fit_pfa(np.array(scaled_data), principal_components, q, diff_n_features)
        features_dict[new_start_date] = list(scaled_data.columns[indices])

        print(f'Succesfully extracted features for period starting in {new_start_date}.\n')

    return features_dict

In [18]:
# Settings for the period-by-period feature selection
start_date = '2008-01-01'           # first day of the first period
period_duration = 2                 # length of each period in years
periods = 6                         # number of consecutive periods
explained_variance_threshold = 0.8  # variance share the kept components must explain
diff_n_features = 2                 # features to select beyond the number of principal components

features = find_features(
    stationary_data_full,
    start_date,
    period_duration,
    periods,
    explained_variance_threshold=explained_variance_threshold,
    diff_n_features=diff_n_features,
)

# persist the period -> features dictionary with pickle
with open('../../data/selected_features.pkl', 'wb') as f:
    pickle.dump(features, f)

Successfully subsetted data from 2008-01-01 00:00:00 to 2009-12-31 00:00:00.
Successfully removed columns with non-numerical values: ['date']
Successfully scaled data.
Number of principal components to explain 80.0% of the variance: 24
Succesfully extracted features for period starting in 2008-01-01 00:00:00.

Successfully subsetted data from 2010-01-01 00:00:00 to 2011-12-31 00:00:00.
Successfully removed columns with non-numerical values: ['date']
Successfully scaled data.
Number of principal components to explain 80.0% of the variance: 24
Succesfully extracted features for period starting in 2010-01-01 00:00:00.

Successfully subsetted data from 2012-01-01 00:00:00 to 2013-12-31 00:00:00.
Successfully removed columns with non-numerical values: ['date']
Successfully scaled data.
Number of principal components to explain 80.0% of the variance: 23
Succesfully extracted features for period starting in 2012-01-01 00:00:00.

Successfully subsetted data from 2014-01-01 00:00:00 to 2015-12-

In [19]:
# Read back the period -> features dictionary from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../../data/selected_features.pkl', 'rb') as f:
    features_dict = pickle.load(f)

# show the selected features period by period
for key in features_dict:
    value = features_dict[key]
    print(f'Selected features for period starting in: {key}:')
    print(value)
    print('')

Selected features for period starting in: 2008-01-01 00:00:00:
['CAPEI', 'equity_invcap', 'evm', 'pe_op_basic', 'pe_inc', 'ps', 'ptpm', 'roa', 'roe', 'aftret_equity', 'aftret_invcapx', 'GProf', 'capital_ratio', 'short_debt', 'fcf_ocf', 'lt_ppent', 'at_turn', 'rect_turn', 'pay_turn', 'adv_sale', 'stat_divyeld', 'prc', 'vol', 'retx', 'mktcap', 'MACD_index']

Selected features for period starting in: 2010-01-01 00:00:00:
['CAPEI', 'npm', 'debt_ebitda', 'pe_op_dil', 'pe_exi', 'rd_sale', 'pcf', 'opmbd', 'roce', 'aftret_equity', 'adv_sale', 'cash_lt', 'debt_at', 'debt_assets', 'short_debt', 'fcf_ocf', 'dltt_be', 'de_ratio', 'sale_invcap', 'rect_turn', 'pay_turn', 'stat_divyeld', 'prc', 'vol', 'retx', 'prc_adj']

Selected features for period starting in: 2012-01-01 00:00:00:
['CAPEI', 'opmbd', 'debt_ebitda', 'pe_op_basic', 'pe_inc', 'ps', 'pcf', 'ptpm', 'roa', 'totdebt_invcap', 'aftret_equity', 'cash_debt', 'debt_at', 'fcf_ocf', 'sale_equity', 'at_turn', 'rect_turn', 'pay_turn', 'adv_sale', '

In [20]:
# find features that are in all periods
# An empty running result must stay empty, so the intersection is computed
# explicitly instead of using "list is empty" as the first-iteration marker.
period_feature_lists = list(features_dict.values())

if period_feature_lists:
    common_features = set(period_feature_lists[0]).intersection(*period_feature_lists[1:])
    # keep the order in which the features appear in the first period
    features_in_all_periods = [name for name in period_feature_lists[0] if name in common_features]
else:
    features_in_all_periods = []

print(f'There are {len(features_in_all_periods)} features that are selected in all periods.')
print('Features that are selected in all periods:')
print(features_in_all_periods)

There are 6 features that are selected in all periods.
Features that are selected in all periods:
['CAPEI', 'fcf_ocf', 'rect_turn', 'pay_turn', 'adv_sale', 'vol']
