# In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

# In [2]:
class Feature_Selection:
    """Select features from dated tabular data with PCA or PFA.

    The pipeline run by :meth:`fit` is: restrict the rows to the
    ``[start_date, end_date]`` window, drop non-numerical columns,
    standardise the remaining columns, fit a PCA and, when
    ``method == 'pfa'``, cluster the PCA loadings with KMeans and keep the
    feature closest to each cluster centre.

    Attributes set by fitting:
        principal_components: ``components_`` of the fitted PCA.
        explained_variance: the threshold passed to the constructor until
            :meth:`fit_pca` runs, afterwards the PCA
            ``explained_variance_ratio_`` array.
        explained_var: the threshold passed to the constructor (never
            overwritten, so the instance can be refitted).
        indices: column positions (within the numerical columns) of the
            features selected by PFA.
        features: the scaled data restricted to ``indices``.
        feature_names: column labels matching ``indices`` (set by
            :meth:`fit` only).
    """

    def __init__(self, start_date, end_date, method='pfa', q = None, explained_variance = 0.95, diff_n_features = 2, verbose=False):
        """Store the configuration; no data is touched here.

        Args:
            start_date: inclusive lower bound for the ``'Date'`` column.
            end_date: inclusive upper bound for the ``'Date'`` column.
            method: ``'pca'`` or ``'pfa'``.
            q: number of principal components used by PFA. When ``None`` it
                is derived from ``explained_variance``.
            explained_variance: cumulative explained-variance ratio the
                retained components must reach when ``q`` is ``None``.
            diff_n_features: how many clusters (selected features) to use
                in addition to ``q``.
            verbose: print progress messages when true.

        Raises:
            ValueError: if ``method`` is not ``'pca'`` or ``'pfa'``.
        """
        #TODO check start date and end date are in date format
        if method not in ['pca', 'pfa']:
            raise ValueError('Method must be either PCA or PFA')
        #TODO check there is data for each stock in the selected period

        self.method = method
        self.start_date = start_date
        self.end_date = end_date

        self.diff_n_features = diff_n_features
        self.q = q
        self.verbose = verbose

        # The threshold is kept under its own name because fit_pca replaces
        # `explained_variance` with the per-component ratios.
        self.explained_var = explained_variance
        self.explained_variance = explained_variance

        #For pfa
        self.indices = None
        self.features = None
        self.feature_names = None

        #For pca
        self.principal_components = None

    def fit(self, data):
        """Run the full selection pipeline on ``data``.

        Args:
            data: DataFrame with a ``'Date'`` column and at least one
                numerical column. It is not modified.

        Raises:
            TypeError: if ``data`` is not a pandas DataFrame.
            ValueError: if ``'Date'`` is missing, the date window is empty
                or no numerical column remains.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError('data must be a pandas dataframe')

        subset_data = self.subset_stock_data(data)
        if subset_data.empty:
            raise ValueError('No rows found between start_date and end_date.')

        # The (datetime) 'Date' column is dropped here together with any
        # other non-numerical column.
        numerical_data = self.remove_non_numerical_columns(subset_data)
        if numerical_data.shape[1] == 0:
            raise ValueError('data does not contain any numerical column.')

        # The scaler has to be fitted before it can transform.
        scaled_data = StandardScaler().fit_transform(numerical_data) #TODO CHECK WHETHER WE PREFER OTHER TRANSFORMATION

        if self.verbose:
            print('Succesfully scaled data.')

        # PCA is always fitted: PFA is built on top of its loadings.
        self.fit_pca(scaled_data)

        if self.method == 'pfa':
            self.fit_pfa(scaled_data)
            self.feature_names = [numerical_data.columns[i] for i in self.indices]

    def subset_stock_data(self, data):
        """Return the rows of ``data`` whose ``'Date'`` lies in the window.

        The bounds are inclusive. The returned frame is a new object whose
        ``'Date'`` column has a datetime dtype; ``data`` is left untouched.

        Raises:
            ValueError: if ``data`` has no ``'Date'`` column.
        """
        # Check if 'Date' column exists in the dataframe
        if 'Date' not in data.columns:
            raise ValueError("DataFrame does not contain a 'Date' column.")

        # Convert the date column to datetime if it is not already
        dates = data['Date']
        if not pd.api.types.is_datetime64_any_dtype(dates):
            dates = pd.to_datetime(dates)

        lower = pd.to_datetime(self.start_date)
        upper = pd.to_datetime(self.end_date)

        # Subset the dataframe based on date range. A plain boolean array is
        # used so that duplicated index labels cannot break the selection.
        in_window = ((dates >= lower) & (dates <= upper)).to_numpy()
        subset = data.assign(Date=dates).loc[in_window]
        if self.verbose:
            print('Succesfully subsetted data for selected timeframe.')
        return subset

    def remove_non_numerical_columns(self, data):
        """Return a copy of ``data`` without its non-numerical columns.

        A column is kept when ``pd.api.types.is_numeric_dtype`` accepts its
        dtype. ``data`` itself is not modified.
        """
        non_numerical_columns = [
            column
            for column, dtype in data.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

        # Print message with deleted columns
        if self.verbose:
            if non_numerical_columns:
                print("Columns with non-numerical values removed:", non_numerical_columns)
            else:
                print("No columns with non-numerical values found.")

        # drop() without inplace returns a new frame, so neither the caller's
        # data nor a slice of it is mutated.
        return data.drop(columns=non_numerical_columns)

    def fit_pca(self, X):
        """Fit a full PCA on ``X`` and store loadings and variance ratios."""
        pca = PCA()
        pca.fit(X)
        self.principal_components = pca.components_
        self.explained_variance = pca.explained_variance_ratio_

    def _select_q(self, ratios):
        """Return how many leading components PFA should use.

        Uses ``self.q`` when it was given, otherwise the smallest number of
        components whose cumulative ratio reaches ``self.explained_var``.

        Raises:
            ValueError: if ``self.q`` is given but outside
                ``1..len(ratios)``.
        """
        n_components = len(ratios)
        if self.q is not None:
            q = int(self.q)
            if not 1 <= q <= n_components:
                raise ValueError(
                    'q must be between 1 and the number of components '
                    '(%d), got %r' % (n_components, self.q)
                )
            return q

        cumulative = np.cumsum(ratios)
        # searchsorted gives the first index whose cumulative ratio reaches
        # the threshold; the number of components is that index plus one.
        q = int(np.searchsorted(cumulative, self.explained_var, side='left')) + 1
        # Rounding can leave the total marginally below a threshold of 1.0.
        return min(q, n_components)

    def fit_pfa(self, X):
        """Select features by clustering the PCA loadings (PFA).

        Each feature is represented by its loadings on the first ``q``
        principal components; the loadings are clustered with KMeans and
        the feature closest to each cluster centre is kept. KMeans is called
        without a fixed ``random_state``, so the selection may differ
        between runs.

        Args:
            X: 2-D array of scaled data, the same one given to
                :meth:`fit_pca`.

        Raises:
            RuntimeError: if :meth:`fit_pca` has not been called yet.
            ValueError: if ``self.q`` is out of range.
        """
        if self.principal_components is None:
            raise RuntimeError('fit_pca must be called before fit_pfa.')

        X = np.asarray(X)
        q = self._select_q(self.explained_variance)

        # Rows are features, columns are their loadings on the first q PCs.
        A_q = self.principal_components.T[:, :q]

        # At least one cluster, and never more clusters than features.
        clusternumber = max(1, min(q + self.diff_n_features, X.shape[1]))

        kmeans = KMeans(n_clusters=clusternumber).fit(A_q)
        clusters = kmeans.predict(A_q)
        cluster_centers = kmeans.cluster_centers_

        # Distance of every feature to the centre of its own cluster.
        distances = np.linalg.norm(A_q - cluster_centers[clusters], axis=1)

        members = defaultdict(list)
        for feature_idx, cluster in enumerate(clusters):
            members[cluster].append(feature_idx)

        # One representative per cluster: the member nearest to the centre.
        self.indices = [
            min(group, key=distances.__getitem__) for group in members.values()
        ]
        self.features = X[:, self.indices]
