In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
class Feature_Selection:
    """Reduce a dated stock DataFrame with PCA or PFA.

    The input frame must carry a 'Date' column. ``fit`` keeps the rows whose
    date falls inside ``[start_date, end_date]`` (both bounds inclusive),
    drops every non-numerical column, standardises what is left and then runs
    the selected method.

    Attributes:
        method: Either 'pca' or 'pfa'.
        start_date: Lower bound of the date window, as passed by the caller.
        end_date: Upper bound of the date window, as passed by the caller.
        verbose: When True, progress messages are printed.
        pfa_features: Result of ``run_pfa`` (None until PFA is implemented).
        principal_components: Data projected on the principal components,
            set by ``run_pca``.
        explained_variance_ratio: Variance ratio per component, set by
            ``run_pca``.
    """

    _METHODS = ('pca', 'pfa')

    def __init__(self, start_date, end_date, method='pfa', verbose=False):
        """Store the configuration; no data is touched here.

        Raises:
            ValueError: If ``method`` is not 'pca' or 'pfa'.
        """
        # TODO check start date and end date are in date format
        if method not in self._METHODS:
            # The accepted values are lowercase, so the message spells them
            # exactly as they must be passed.
            raise ValueError("Method must be either 'pca' or 'pfa'")
        # TODO check there is data for each stock in the selected period

        self.method = method
        self.start_date = start_date
        self.end_date = end_date
        # Previously never stored, which made every `self.verbose` read fail.
        self.verbose = verbose

        # For pfa
        self.pfa_features = None

        # For pca
        self.principal_components = None
        self.explained_variance_ratio = None

    def fit(self, data):
        """Subset, clean and scale ``data``, then run the configured method.

        Args:
            data: pandas DataFrame with a 'Date' column. It is not modified.

        Raises:
            TypeError: If ``data`` is not a pandas DataFrame.
            ValueError: If 'Date' is missing, or if no rows / no numerical
                columns remain after filtering.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError('data must be a pandas dataframe')

        subset_data = self.subset_stock_data(data)
        subset_numerical_data = self.remove_non_numerical_columns(subset_data)

        # Fail early with a readable message instead of an opaque error from
        # inside the scaler.
        if subset_numerical_data.empty:
            raise ValueError(
                'No numerical data left in the selected timeframe.'
            )

        # The scaler has to be fitted on the data before it can transform it.
        # TODO check whether we prefer another transformation
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(subset_numerical_data)

        if self.verbose:
            print('Succesfully scaled data.')

        if self.method == 'pca':
            self.run_pca(scaled_data)
        elif self.method == 'pfa':
            self.pfa_features = self.run_pfa(scaled_data)

    def subset_stock_data(self, data):
        """Return the rows of ``data`` dated within the configured window.

        The returned frame is a new object whose 'Date' column is datetime
        typed; ``data`` itself is left untouched.

        Raises:
            ValueError: If ``data`` has no 'Date' column.
        """
        if 'Date' not in data.columns:
            raise ValueError("DataFrame does not contain a 'Date' column.")

        # Check the column dtype: a column is a Series, never a
        # DatetimeIndex, so an isinstance test against the index type would
        # always trigger a re-parse.
        dates = data['Date']
        if not pd.api.types.is_datetime64_any_dtype(dates):
            dates = pd.to_datetime(dates)

        lower = pd.to_datetime(self.start_date)
        upper = pd.to_datetime(self.end_date)
        in_window = (dates >= lower) & (dates <= upper)

        # `assign` builds a new frame, so the caller's data is not mutated.
        subset = data.loc[in_window].assign(Date=dates.loc[in_window])
        if self.verbose:
            print('Succesfully subsetted data for selected timeframe.')
        return subset

    def remove_non_numerical_columns(self, data):
        """Return a copy of ``data`` without its non-numerical columns.

        A column is kept when pandas reports a numeric dtype for it; datetime
        and string columns are therefore dropped. ``data`` is not modified.
        """
        # The dtype is a property of the whole column, so there is no need to
        # sample rows to classify it.
        non_numerical_columns = [
            column
            for column, dtype in data.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

        # Not in place: `data` may be a slice of the caller's frame.
        numerical_data = data.drop(columns=non_numerical_columns)

        if self.verbose:
            if non_numerical_columns:
                print("Columns with non-numerical values removed:", non_numerical_columns)
            else:
                print("No columns with non-numerical values found.")
        return numerical_data

    def run_pca(self, data):
        """Fit a full PCA on ``data`` and store the results on the instance.

        Sets ``principal_components`` (the transformed data) and
        ``explained_variance_ratio``.
        """
        pca = PCA()
        self.principal_components = pca.fit_transform(data)
        self.explained_variance_ratio = pca.explained_variance_ratio_

    def run_pfa(self, df):
        """Placeholder for Principal Feature Analysis; currently returns None."""
        # TODO implement PFA
        return None
