In [30]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

In [49]:
def subset_stock_data(data, start_date, end_date, verbose=False):
    """
    Subsets the given dataframe based on a specified date range.

    Args:
        data (pandas.DataFrame): The dataframe containing the stock data.
        start_date (str or pandas.Timestamp): The start date of the desired date range.
        end_date (str or pandas.Timestamp): The end date of the desired date range.
        verbose (bool, optional): If True, prints a success message. Defaults to False.

    Returns:
        pandas.DataFrame: The subset of the dataframe based on the specified date range.
        
    Raises:
        ValueError: If the dataframe does not contain a 'Date' column.

    """
    # Check if 'Date' column exists in the dataframe
    if 'date' not in data.columns:
        raise ValueError("DataFrame does not contain a 'Date' column.")
     
    # Convert date columns to datetime if they are not already datetime objects
    if not isinstance(data['date'], pd.DatetimeIndex):
        data['date'] = pd.to_datetime(data['date'])
    
    if not isinstance(start_date, pd.Timestamp):
        start_date = pd.to_datetime(start_date)

    if not isinstance(end_date, pd.Timestamp):
        end_date = pd.to_datetime(end_date)

    # Subset the dataframe based on date range
    subset = data[(data['date'] >= start_date) & (data['date'] <=end_date)]
    if verbose:
        print(f'Successfully subsetted data from {start_date} to {end_date}.')
    return subset

In [32]:
def remove_non_numerical_columns(data, verbose=False):
    """
    Remove non-numerical columns from a dataframe.

    Parameters:
    - data: pandas DataFrame
        The input dataframe from which non-numerical columns will be removed.
    - verbose: bool, optional
        If True, print a message with the deleted columns. Default is False.

    Returns:
    - pandas DataFrame
        The dataframe with non-numerical columns removed.
    """

    # Check first 10 rows for numerical columns
    first_10_rows = data.head(10)
    non_numerical_columns = []

    # Iterate through columns
    for column in data.columns:
        # Check if the column contains numerical data
        if pd.api.types.is_numeric_dtype(first_10_rows[column]):
            continue
        else:
            non_numerical_columns.append(column)

    # Remove non-numerical columns from the dataframe
    data = data.copy()
    data.drop(columns=non_numerical_columns, inplace=True)

    # Print message with deleted columns
    if verbose:
        if non_numerical_columns:
            print("Successfully removed columns with non-numerical values:", non_numerical_columns)

    return data


In [104]:
def preprocess_data(data, interpolate = False, start_date=None, end_date=None, verbose=False):
    """
    Preprocesses the input data by performing the following steps:
    1. Subset the data based on the specified start and end dates.
    2. Remove non-numerical columns from the subsetted data.
    3. Scale the numerical data using StandardScaler.

    Args:
        data (pd.DataFrame): The input data to be preprocessed.
        start_date (str, optional): The start date for subsetting the data. Defaults to None.
        end_date (str, optional): The end date for subsetting the data. Defaults to None.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.

    Returns:
        np.ndarray: The preprocessed and scaled data.
    """
    if type(data) != pd.DataFrame:
        raise Exception('data must be a pandas dataframe')
    
    subset_data = subset_stock_data(data, start_date, end_date, verbose=verbose)
    subset_numerical_data = remove_non_numerical_columns(subset_data, verbose=verbose)

    if interpolate:
        subset_numerical_data = subset_numerical_data.interpolate()

    subset_numerical_data = subset_numerical_data.drop(columns=['permno'])
        
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(subset_numerical_data)
        
    if verbose:
        print('Successfully scaled data.')
        
    return scaled_data

In [65]:
def find_q(explained_variance, required_explained_var = 0.95):
    """
    Finds the minimum number of principal components (q) required to explain a given amount of variance.

    Parameters:
    explained_variance (list): A list of explained variances for each principal component.
    required_explained_var (float): The required amount of variance to be explained (default is 0.95).

    Returns:
    int: The minimum number of principal components required to explain the given amount of variance.
    """
    cumulative_expl_var = np.cumsum(explained_variance)
    for i,j in enumerate(cumulative_expl_var):
        if j >= required_explained_var:
            q = i+1
            break 
    return q

In [35]:
def fit_pca(data):
    pca = PCA()
    pca.fit(data)
    return [pca.explained_variance_ratio_, pca.components_]

In [59]:
def fit_pfa(data, principal_components, q, diff_n_features):
    """
    Perform feature selection using Principal Feature Analysis (PFA).

    Parameters:
    - data: numpy array
        The input data matrix.
    - principal_components: numpy array
        The principal components obtained from PCA.
    - q: int
        The number of principal components to consider.
    - diff_n_features: int
        The difference between the number of features to select and the number of principal components.

    Returns:
    - indices: list
        The indices of the selected features.
    - features: numpy array
        The selected features from the input data matrix.
    """
    A_q = principal_components.T[:,:q]
    clusternumber = min([q + diff_n_features, data.shape[1]])
        
    kmeans = KMeans(n_clusters = clusternumber).fit(A_q)
    clusters = kmeans.predict(A_q)
    cluster_centers = kmeans.cluster_centers_

    dists = defaultdict(list)
    for i, c in enumerate(clusters):
        dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
        dists[c].append((i, dist))

    indices = [sorted(f, key=lambda x: x[1])[0][0] for f in dists.values()]
    features = data[:, indices]
    return indices, features

In [37]:
def transform_pca(data, fitted, principal_components, q, preprocess_data=None):
    """
    Transforms the input data using Principal Component Analysis (PCA).

    Args:
        data (array-like): The input data to be transformed.
        fitted (bool): Indicates whether the PCA model has been fitted to the data.
        principal_components (array-like): The principal components obtained from the PCA model.
        q (int): The number of principal components to keep in the transformed data.
        preprocess_data (function, optional): A function to preprocess the data before transformation.

    Returns:
        array-like: The transformed data with reduced dimensions.

    Raises:
        Exception: If the model has not been fitted to the data.
    """
    if preprocess_data is not None:
        scaled_data = preprocess_data(data)
    else:
        scaled_data = data

    if not fitted:
        raise Exception('The model has not been fitted to the data.')

    print('shape of scaled data: ', np.shape(scaled_data))
    print('shape of transpose of principal components: ', np.shape(np.transpose(principal_components)))
    reduced_data = np.matmul(np.array(scaled_data), np.transpose(principal_components))[:, :q]
    print('shape of reduced data: ', np.shape(reduced_data))
    return reduced_data

def transform_pfa(data, fitted, features, preprocess_data=None):
    if preprocess_data != None:
        scaled_data = preprocess_data(data)
    else:
        scaled_data = data

    if fitted != True:
        raise Exception('The model has not been fitted to the data.')
    return features

In [38]:
def fit_transform(data, method):
    """
    Applies feature selection to the input data using the specified method.

    Args:
        data (numpy.ndarray): The input data to be transformed.
        method (str): The feature selection method to be used. Must be either 'pca' or 'pfa'.

    Returns:
        numpy.ndarray: The transformed data after applying feature selection.

    Raises:
        Exception: If the method is not 'pca' or 'pfa'.
    """
    if method not in ['pca', 'pfa']:
        raise Exception("Method must be either 'pca' or 'pfa'")
    scaled_data = preprocess_data(data)
    if method == 'PCA':
        explained_variance, principal_components = fit_pca(scaled_data)
        q = find_q(explained_variance)
        output = transform_pca(scaled_data, True, principal_components, q)
    elif method == 'PFA':
        explained_variance, principal_components = fit_pca(scaled_data)
        q = find_q(explained_variance)
        diff_n_features = 0
        indices, features = fit_pfa(scaled_data, principal_components, q, diff_n_features)
        output = transform_pfa(scaled_data, True, features)
    return output

In [39]:
# open the data
data = pd.read_csv('../../data/merged_fin_with_rets.csv')

data.head()

Unnamed: 0,date,permno,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,...,retx,mktcap,prc_adj,naics_processed,ret_industry_tot,ret_industry_relative,MACD_index,rsi,12_month_return,3_month_return
0,2000-02-01,10078,189.489,0.071,18.262,97.593,103.372,109.115,109.115,9.857,...,0.027844,126059300.0,70.759885,,,,1.928924,50.010735,,
1,2000-02-01,85072,3.754,0.408,6.949,10.938,10.938,14.583,15.179,0.262,...,-0.016807,482976.0,7.758214,,,,-10.117663,15.602803,,
2,2000-02-01,70536,22.901,0.411,10.796,10.806,10.806,10.948,9.795,3.133,...,0.04291,3524984.0,3.170199,,,,-6.217865,39.333548,,
3,2000-02-01,16432,6.427,0.489,15.225,12.701,12.701,11.418,11.418,0.288,...,0.010526,3751944.0,18.070428,,,,-15.730315,16.878554,,
4,2000-02-01,85035,2138.2,0.07,37.731,316.098,342.857,342.857,342.857,28.234,...,-0.00463,6434681.0,145.769114,,,,33.705124,54.950281,,


In [40]:
print(data.isna().sum())

date                        633
permno                        0
CAPEI                     32101
bm                       105895
evm                       15009
                          ...  
ret_industry_relative    601763
MACD_index                  633
rsi                         633
12_month_return          181930
3_month_return            46242
Length: 81, dtype: int64


In [41]:
featurelist =['CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil',
       'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm',
       'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq',
       'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat',
       'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap',
       'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act',
       'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt',
       'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent',
       'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov',
       'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'inv_turn',
       'at_turn', 'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'divyield',
       'prc', 'vol', 'ret', 'retx', 'mktcap', 'prc_adj', 'naics_processed',
       'ret_industry_tot', 'ret_industry_relative', 'MACD_index', 'rsi',
       '12_month_return', '3_month_return']

# make list of unique values in permno column
permno_list = data['permno'].unique()

In [42]:
stationary_data = data.copy()

for i, permno in enumerate(permno_list):

    # subset the data by permno, and featurelist
    subset = stationary_data[stationary_data['permno'] == permno][featurelist]

    # stationarize the data by differencing
    subset_diff = subset.diff()

    # insert the data back into the original dataframe
    stationary_data.loc[stationary_data['permno'] == permno, featurelist] = subset_diff

    # drop the first row of the subsetted data
    stationary_data.drop(stationary_data[stationary_data['permno'] == permno].index[0], inplace=True)

    print(f'{i+1}/{len(permno_list)} permnos processed.')

stationary_data.head()

1/727 permnos processed.
2/727 permnos processed.
3/727 permnos processed.
4/727 permnos processed.
5/727 permnos processed.
6/727 permnos processed.
7/727 permnos processed.
8/727 permnos processed.
9/727 permnos processed.
10/727 permnos processed.
11/727 permnos processed.
12/727 permnos processed.
13/727 permnos processed.
14/727 permnos processed.
15/727 permnos processed.
16/727 permnos processed.
17/727 permnos processed.
18/727 permnos processed.
19/727 permnos processed.
20/727 permnos processed.
21/727 permnos processed.
22/727 permnos processed.
23/727 permnos processed.
24/727 permnos processed.
25/727 permnos processed.
26/727 permnos processed.
27/727 permnos processed.
28/727 permnos processed.
29/727 permnos processed.
30/727 permnos processed.
31/727 permnos processed.
32/727 permnos processed.
33/727 permnos processed.
34/727 permnos processed.
35/727 permnos processed.
36/727 permnos processed.
37/727 permnos processed.
38/727 permnos processed.
39/727 permnos proces

Unnamed: 0,date,permno,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,...,retx,mktcap,prc_adj,naics_processed,ret_industry_tot,ret_industry_relative,MACD_index,rsi,12_month_return,3_month_return
523,2000-02-02,28804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005891,1650.188,0.017656,,,,1.174333,0.174136,,
524,2000-02-02,85792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014297,44043.56,0.856897,,,,1.701794,5.109275,,
525,2000-02-02,15202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.001477,-6253.438,-0.021657,,,,-0.175282,-0.891143,,
526,2000-02-02,44644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.070285,-1640908.0,1.293851,,,,1.202515,-2.069857,,
527,2000-02-02,71175,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.002103,-90010.5,-0.473719,,,,1.336858,-0.341246,,


In [66]:
print(stationary_data.isna().sum())

date                        631
permno                        0
CAPEI                     32111
bm                       106000
evm                       15037
                          ...  
ret_industry_relative    601758
MACD_index                  631
rsi                         631
12_month_return          181918
3_month_return            46235
Length: 81, dtype: int64


In [67]:
# reset the index
stationary_data.reset_index(drop=True, inplace=True)

# save the stationary data
stationary_data.to_csv('../../data/stationary_data.csv', index=False)

KeyboardInterrupt: 

In [118]:
# define a start date and an end date
start_date = '2008-01-01'
start_date = pd.to_datetime(start_date)
number_of_days = 365
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=number_of_days)

# preprocess the data
scaled_data = preprocess_data(stationary_data.copy(), interpolate=True, start_date=start_date, end_date=end_date, verbose=True)

Successfully subsetted data from 2008-01-01 00:00:00 to 2008-12-31 00:00:00.
Successfully removed columns with non-numerical values: ['date']
Successfully scaled data.


In [119]:
print('subset_numerical shape: ', scaled_data.shape)

subset_numerical shape:  (156033, 79)


In [120]:
# perform PFA
explained_variance, principal_components = fit_pca(scaled_data)

q = find_q(explained_variance, required_explained_var=0.7)
print('q: ', q)
diff_n_features = 3
indices, features = fit_pfa(subset_interpolated, principal_components, q, diff_n_features)

# transform the data
transformed_data = transform_pfa(subset_interpolated, True, features)

print('shape of transformed data: ', np.shape(transformed_data))

q:  27
q is:  27


InvalidIndexError: (slice(None, None, None), [0, 65, 36, 3, 6, 64, 9, 12, 25, 61, 19, 20, 28, 31, 51, 35, 37, 41, 44, 59, 50, 57, 62, 63, 66, 67, 68, 71, 73, 74])