In [64]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

In [65]:
def subset_stock_data(data, start_date, end_date, verbose=False):
    """
    Subsets the given dataframe based on a specified (inclusive) date range.

    The input dataframe is left untouched: the 'date' column is parsed into a
    separate Series, and only the returned subset carries the parsed dates.

    Args:
        data (pandas.DataFrame): The dataframe containing the stock data. Must have a 'date' column.
        start_date (str, pandas.Timestamp or None): Inclusive lower bound of the date range.
            None means "no lower bound".
        end_date (str, pandas.Timestamp or None): Inclusive upper bound of the date range.
            None means "no upper bound".
        verbose (bool, optional): If True, prints a success message. Defaults to False.

    Returns:
        pandas.DataFrame: A new dataframe with the rows whose 'date' lies in the requested
        range; its 'date' column is of datetime dtype.

    Raises:
        ValueError: If the dataframe does not contain a 'date' column.

    """
    # Check if the 'date' column exists in the dataframe
    if 'date' not in data.columns:
        raise ValueError("DataFrame does not contain a 'date' column.")

    # Parse the dates into a local Series instead of writing back into `data`,
    # so the caller's dataframe is never modified.
    dates = data['date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates)

    # Build a positional boolean mask; a bound that is None is simply skipped
    # (converting None would give NaT, which compares False against every date).
    mask = np.ones(len(data), dtype=bool)
    if start_date is not None:
        start_date = pd.to_datetime(start_date)
        mask &= (dates >= start_date).to_numpy()
    if end_date is not None:
        end_date = pd.to_datetime(end_date)
        mask &= (dates <= end_date).to_numpy()

    # Subset the dataframe and attach the parsed dates to the (new) subset only
    subset = data.loc[mask].assign(date=dates.loc[mask])
    if verbose:
        print(f'Successfully subsetted data from {start_date} to {end_date}.')
    return subset

In [66]:
def remove_non_numerical_columns(data, verbose=False):
    """
    Return a copy of a dataframe that keeps only its numerical columns.

    A column named 'date' is dropped as well, even when its dtype is numerical.
    The input dataframe itself is not modified.

    Parameters:
    - data: pandas DataFrame
        The input dataframe from which non-numerical columns will be removed.
    - verbose: bool, optional
        If True, print a message with the deleted columns (only when at least
        one non-numerical column was found). Default is False.

    Returns:
    - pandas DataFrame
        The dataframe with non-numerical columns removed.
    """
    # The dtype check is done on a 10-row sample of each column
    sample = data.head(10)
    dropped = [
        name
        for name in data.columns
        if not pd.api.types.is_numeric_dtype(sample[name])
    ]

    # `drop` without inplace returns a new frame, leaving `data` untouched
    numeric_only = data.drop(columns=dropped)

    # A numerically typed 'date' column survives the dtype filter; remove it too
    if 'date' in numeric_only.columns:
        numeric_only = numeric_only.drop(columns=['date'])

    if verbose and dropped:
        print("Successfully removed columns with non-numerical values:", dropped)

    return numeric_only


In [67]:
def preprocess_data(data, interpolate = False, start_date=None, end_date=None, verbose=False):
    """
    Preprocesses the input data by performing the following steps:
    1. Subset the data based on the specified start and end dates.
    2. Remove non-numerical columns from the subsetted data.
    3. Optionally interpolate missing values (forward direction only).
    4. Drop the 'permno' identifier column, if present.
    5. Scale the numerical data using StandardScaler.

    Args:
        data (pd.DataFrame): The input data to be preprocessed.
        interpolate (bool, optional): If True, fill missing values with
            DataFrame.interpolate(limit_direction='forward') before scaling. The
            interpolation runs over the rows of the whole subset (it is not grouped
            by 'permno'). Defaults to False.
        start_date (str, optional): The start date for subsetting the data. Defaults to None.
        end_date (str, optional): The end date for subsetting the data. Defaults to None.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.

    Returns:
        np.ndarray: The preprocessed and scaled data.

    Raises:
        TypeError: If data is not a pandas DataFrame.
        ValueError: If nothing is left to scale after subsetting and column removal.
    """
    # isinstance also accepts DataFrame subclasses, which an exact type comparison rejects
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data must be a pandas dataframe')

    subset_data = subset_stock_data(data, start_date, end_date, verbose=verbose)
    subset_numerical_data = remove_non_numerical_columns(subset_data, verbose=verbose)

    if interpolate:
        subset_numerical_data = subset_numerical_data.interpolate(limit_direction='forward')

    # 'permno' is not a feature; tolerate inputs that do not carry it
    subset_numerical_data = subset_numerical_data.drop(columns=['permno'], errors='ignore')

    # Fail with an actionable message instead of an opaque scaler error
    if subset_numerical_data.empty:
        raise ValueError(
            f'No data left to scale for the date range {start_date} to {end_date} '
            f'(shape after preprocessing: {subset_numerical_data.shape}).'
        )

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(subset_numerical_data)

    if verbose:
        print('Successfully scaled data.')

    return scaled_data

In [68]:
def find_q(explained_variance, required_explained_var = 0.95):
    """
    Finds the minimum number of principal components (q) required to explain a given amount of variance.

    If the cumulative explained variance never reaches the requested level (for
    example because of floating-point round-off, or because only some of the
    components were passed in), all components are kept and their count is returned.

    Parameters:
    explained_variance (list): A list of explained variances for each principal component.
    required_explained_var (float): The required amount of variance to be explained (default is 0.95).

    Returns:
    int: The minimum number of principal components required to explain the given amount of variance.

    Raises:
    ValueError: If explained_variance is empty.
    """
    cumulative_expl_var = np.cumsum(explained_variance)
    if cumulative_expl_var.size == 0:
        raise ValueError('explained_variance must contain at least one value')

    reached = cumulative_expl_var >= required_explained_var
    if reached.any():
        # argmax on a boolean array gives the position of the first True
        return int(np.argmax(reached)) + 1

    # Threshold never reached: keep every component rather than failing
    return int(cumulative_expl_var.size)

In [69]:
def fit_pca(data):
    """
    Fit a PCA with default settings on the given data.

    Args:
        data (array-like): The data the PCA is fitted on.

    Returns:
        list: A two-element list [explained_variance_ratio_, components_] taken
        from the fitted PCA object.
    """
    model = PCA()
    model.fit(data)
    variance_ratios = model.explained_variance_ratio_
    components = model.components_
    return [variance_ratios, components]

In [70]:
def fit_pfa(data, principal_components, q, diff_n_features, random_state=None):
    """
    Perform feature selection using Principal Feature Analysis (PFA).

    The rows of the truncated loading matrix (one row per feature, first q
    components) are clustered with KMeans; for each cluster the feature whose
    row lies closest to the cluster centre is selected.

    Parameters:
    - data: numpy array
        The input data matrix.
    - principal_components: numpy array
        The principal components obtained from PCA.
    - q: int
        The number of principal components to consider.
    - diff_n_features: int
        The difference between the number of features to select and the number of principal components.
    - random_state: int, RandomState instance or None, optional
        Passed to KMeans. Set it to an int to make the selection reproducible
        between runs. Default is None (same behaviour as before: unseeded).

    Returns:
    - indices: list
        The indices of the selected features, ordered by the first appearance
        of their cluster when scanning the features in column order.
    - features: numpy array
        The selected features from the input data matrix.
    """
    A_q = principal_components.T[:, :q]
    # Never ask for more clusters than there are features
    clusternumber = min(q + diff_n_features, data.shape[1])

    kmeans = KMeans(n_clusters=clusternumber, random_state=random_state).fit(A_q)
    clusters = kmeans.predict(A_q)
    cluster_centers = kmeans.cluster_centers_

    # Distance of every feature row to the centre of its own cluster, in one
    # vectorised call instead of one pairwise-distance call per feature
    dists = np.linalg.norm(A_q - cluster_centers[clusters], axis=1)

    # Keep, per cluster, the feature with the smallest distance. The strict "<"
    # keeps the lowest feature index on ties; dict insertion order preserves the
    # order in which clusters are first seen.
    nearest = {}
    for i, c in enumerate(clusters):
        if c not in nearest or dists[i] < dists[nearest[c]]:
            nearest[c] = i

    indices = list(nearest.values())
    features = data[:, indices]
    return indices, features

In [71]:
def transform_pca(data, fitted, principal_components, q, preprocess_data=None):
    """
    Project the input data onto the first q principal components.

    The shapes of the intermediate arrays are printed along the way.

    Args:
        data (array-like): The input data to be transformed.
        fitted (bool): Indicates whether the PCA model has been fitted to the data.
        principal_components (array-like): The principal components obtained from the PCA model.
        q (int): The number of principal components to keep in the transformed data.
        preprocess_data (function, optional): A function applied to `data` before the
            projection. When omitted, `data` is used as is.

    Returns:
        array-like: The transformed data with reduced dimensions (first q columns
        of the product of the data and the transposed components).

    Raises:
        Exception: If the model has not been fitted to the data.
    """
    scaled_data = data if preprocess_data is None else preprocess_data(data)

    # The fitted check deliberately comes after the optional preprocessing step
    if not fitted:
        raise Exception('The model has not been fitted to the data.')

    loadings = np.transpose(principal_components)
    print('shape of scaled data: ', np.shape(scaled_data))
    print('shape of transpose of principal components: ', np.shape(loadings))

    projected = np.array(scaled_data) @ loadings
    reduced_data = projected[:, :q]
    print('shape of reduced data: ', np.shape(reduced_data))
    return reduced_data

def transform_pfa(data, fitted, features, preprocess_data=None, indices=None):
    """
    Return the PFA-selected features.

    Without `indices` this hands back the `features` array unchanged (the
    fit-time selection); `data` is then only passed through `preprocess_data`
    and otherwise not used. With `indices` the selected columns are taken from
    the (optionally preprocessed) `data`, which allows transforming data other
    than the one used for fitting.

    Args:
        data (array-like): The input data.
        fitted (bool): Indicates whether the PFA selection has been fitted.
        features (array-like): The selected feature columns produced at fit time.
        preprocess_data (function, optional): A function applied to `data` first.
        indices (sequence of int, optional): Column positions of the selected
            features. When given, the corresponding columns of the (preprocessed)
            data are returned instead of `features`. Defaults to None.

    Returns:
        array-like: `features` when `indices` is None, otherwise the selected
        columns of the (preprocessed) data as a numpy array.

    Raises:
        Exception: If the model has not been fitted to the data.
    """
    if preprocess_data is not None:
        scaled_data = preprocess_data(data)
    else:
        scaled_data = data

    if not fitted:
        raise Exception('The model has not been fitted to the data.')

    if indices is None:
        return features
    return np.asarray(scaled_data)[:, list(indices)]

In [72]:
def fit_transform(data, method):
    """
    Applies feature selection to the input data using the specified method.

    The data is first run through preprocess_data with its default arguments,
    then a PCA is fitted and the number of components q is chosen with find_q
    (default threshold). For 'pca' the data is projected onto the first q
    components; for 'pfa' q features are selected with fit_pfa.

    Args:
        data (pandas.DataFrame): The input data; it is handed to preprocess_data,
            which requires a dataframe.
        method (str): The feature selection method to be used. Must be either
            'pca' or 'pfa' (matched case-insensitively).

    Returns:
        numpy.ndarray: The transformed data after applying feature selection.

    Raises:
        ValueError: If the method is not 'pca' or 'pfa'.
    """
    # Normalise the case once so validation and dispatch agree with each other
    normalized = method.lower() if isinstance(method, str) else method
    if normalized not in ('pca', 'pfa'):
        raise ValueError("Method must be either 'pca' or 'pfa'")

    scaled_data = preprocess_data(data)

    # Both methods start from the same PCA fit and the same choice of q
    explained_variance, principal_components = fit_pca(scaled_data)
    q = find_q(explained_variance)

    if normalized == 'pca':
        return transform_pca(scaled_data, True, principal_components, q)

    # 'pfa': select exactly q features (no extra clusters)
    diff_n_features = 0
    indices, features = fit_pfa(scaled_data, principal_components, q, diff_n_features)
    return transform_pfa(scaled_data, True, features)

In [73]:
# Load the final dataset from CSV into a DataFrame (the path is relative to the notebook's working directory)
stationary_data = pd.read_csv('../../data/DATA_FINAL.csv')

In [74]:
# Raise the pandas display limit to 1000 rows so the per-column summary is not truncated
pd.set_option('display.max_rows', 1000)

# Compute the null mask once and reuse it for all summaries below
missing_mask = stationary_data.isnull()
missing_per_column = missing_mask.sum()

# print the number of missing values in each column
print('Missing values per column: \n', missing_per_column)
print('\n')
print('Total number of missing values: ', missing_per_column.sum())

# count the number of rows with at least one missing value
print('Number of rows with missing values: ', missing_mask.any(axis=1).sum())

Missing values per column: 
 permno                        0
CAPEI                     36141
bm                       111587
evm                       19125
pe_op_basic               58469
pe_op_dil                 59083
pe_exi                    58760
pe_inc                    58553
ps                         9480
pcf                        9290
npm                        9615
opmbd                      9921
opmad                     10290
gpm                        9787
ptpm                       9781
cfm                       25919
roa                       13615
roe                      118917
roce                      26542
aftret_eq                 12097
aftret_invcapx            50333
aftret_equity             12202
GProf                      9123
equity_invcap             11181
debt_invcap               21315
totdebt_invcap            22118
capital_ratio             19169
cash_lt                   10841
debt_at                   20444
debt_ebitda               31042
short_debt 

In [75]:
# Drop rows with missing values; the result is stored under a new name, so
# `stationary_data` keeps the raw (incomplete) rows for reference
stationary_data_full = stationary_data.dropna()

In [76]:
# Analysis window: starts on 2008-01-01 and spans 365 * 2 days
number_of_days = 365 * 2
start_date = pd.to_datetime('2008-01-01')
end_date = start_date + pd.DateOffset(days=number_of_days)

# Preprocess a copy of the complete-rows frame: subset to the window, drop
# non-numerical columns and scale
scaled_data = preprocess_data(
    stationary_data_full.copy(),
    interpolate=False,
    start_date=start_date,
    end_date=end_date,
    verbose=True,
)

Successfully subsetted data from 2008-01-01 00:00:00 to 2009-12-31 00:00:00.
Successfully removed columns with non-numerical values: ['date']
Successfully scaled data.


In [77]:
# Shape of the scaled array returned by preprocess_data (the printed label says "subset_numerical")
print('subset_numerical shape: ', scaled_data.shape)

subset_numerical shape:  (248498, 60)


In [82]:
# perform PFA: start from a PCA fitted on the scaled data
explained_variance, principal_components = fit_pca(scaled_data)

# q = smallest number of components whose cumulative explained-variance ratio reaches 0.8
q = find_q(explained_variance, required_explained_var=0.8)
print('q: ', q)
# request q + diff_n_features clusters (fit_pfa caps this at the number of columns),
# i.e. two more selected features than retained components
diff_n_features = 2
indices, features = fit_pfa(scaled_data, principal_components, q, diff_n_features)

# transform the data (transform_pfa hands back the `features` array computed by fit_pfa)
transformed_data = transform_pfa(scaled_data, True, features)

print('shape of transformed data: ', np.shape(transformed_data))

q:  25
shape of transformed data:  (248498, 27)


In [83]:
# print the selected features
# The indices from fit_pfa are column positions in the scaled array. Mapping them
# back to names this way assumes that 'date' and 'permno' are the only columns
# removed during preprocessing and that the column order is unchanged.
columns = [col for col in stationary_data.columns if col not in ['date', 'permno']]
print('Selected features:')
for i in indices:
    print(columns[i])
    
# save the transformed data (values only: no header is passed to np.savetxt,
# so the column names are not written to the file)
np.savetxt('../../data/pfa_transformed_data.csv', transformed_data, delimiter=',')


Selected features:
CAPEI
totdebt_invcap
debt_ebitda
pe_op_basic
pe_inc
ps
ptpm
roce
sale_equity
aftret_equity
aftret_invcapx
at_turn
cash_lt
debt_at
short_debt
fcf_ocf
rect_turn
pay_turn
adv_sale
naics_processed
prc
vol
retx
mktcap
prc_adj
ret_industry_tot
ret_industry_relative
