In [860]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

In [861]:
class Feature_Selection:
    """Reduce the columns of a dated DataFrame with PCA or PFA.

    The workflow is: subset the rows to ``[start_date, end_date]`` using the
    ``'Date'`` column, drop every non-numerical column, standardise the
    remaining columns, fit a PCA, and then either

    * ``method='pca'``: project the data onto the first ``q`` principal
      components, or
    * ``method='pfa'``: cluster the original columns by their loadings on
      the first ``q`` components and keep, per cluster, the column closest
      to the cluster centre.

    Parameters
    ----------
    start_date, end_date
        Inclusive bounds of the period to keep; anything accepted by
        ``pd.to_datetime``.
    method : {'pfa', 'pca'}
        Reduction technique.
    q : int or None
        Number of principal components to use. When ``None`` it is derived
        from ``explained_var`` during fitting.
    explained_var : float
        Cumulative explained-variance ratio that the first ``q`` components
        must reach when ``q`` is derived automatically.
    diff_n_features : int
        For PFA, the number of clusters is ``q + diff_n_features`` (capped
        at the number of columns).
    verbose : bool
        Print progress messages.
    random_state : int, RandomState instance or None
        Forwarded to ``KMeans`` so PFA results can be made reproducible.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'pca'`` nor ``'pfa'``.
    """

    def __init__(self, start_date, end_date,
                 method='pfa', q=None, explained_var=0.95,
                 diff_n_features=2, verbose=False, random_state=None):
        if method not in ('pca', 'pfa'):
            raise ValueError('Method must be either PCA or PFA')
        # TODO check there is data for each stock in the selected period

        self.method = method
        # pd.to_datetime leaves an existing Timestamp unchanged.
        self.start_date = pd.to_datetime(start_date)
        self.end_date = pd.to_datetime(end_date)
        self.verbose = verbose

        self.diff_n_features = diff_n_features
        self.q = q

        self.explained_var = explained_var
        self.random_state = random_state

        self.fitted = False

        # For pca
        self.pca = None
        self.explained_variance = None
        self.principal_components = None
        self.reduced_data = None

        # For pfa
        self.indices = None
        self.features = None

    def subset_stock_data(self, data):
        """Return the rows of ``data`` whose ``'Date'`` lies in the period.

        The returned frame is a new object whose ``'Date'`` column is of
        datetime dtype; the frame passed in is not modified.

        Raises
        ------
        ValueError
            If ``data`` has no ``'Date'`` column.
        """
        if 'Date' not in data.columns:
            raise ValueError("DataFrame does not contain a 'Date' column.")

        dates = data['Date']
        # Check the dtype, not the container type: a column is a Series and
        # is never a DatetimeIndex.
        if not pd.api.types.is_datetime64_any_dtype(dates):
            dates = pd.to_datetime(dates)

        in_period = (dates >= self.start_date) & (dates <= self.end_date)
        # .loc with a boolean mask and .assign both return new objects, so
        # the caller's DataFrame keeps its original 'Date' column.
        subset = data.loc[in_period].assign(Date=dates[in_period])

        if self.verbose:
            print('Succesfully subsetted data for selected timeframe.')
        return subset

    def remove_non_numerical_columns(self, data):
        """Return a copy of ``data`` without its non-numerical columns."""
        # The dtype is a property of the whole column, so no rows need to be
        # inspected.
        non_numerical_columns = [
            column for column, dtype in data.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

        numerical = data.drop(columns=non_numerical_columns)

        if self.verbose and non_numerical_columns:
            print("Succesfully removed columns with non-numerical values:", non_numerical_columns)

        return numerical

    def preprocess_data(self, data):
        """Subset, strip non-numerical columns and standardise ``data``.

        A new ``StandardScaler`` is fitted on every call, so the scaling
        always reflects the data passed to this call.

        Returns
        -------
        numpy.ndarray
            Standardised values, one column per remaining numerical column.

        Raises
        ------
        TypeError
            If ``data`` is not a pandas DataFrame.
        ValueError
            If no rows fall inside the period or no numerical column is left.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError('data must be a pandas dataframe')

        subset_data = self.subset_stock_data(data)
        if self.verbose:
            print(np.shape(subset_data))
        subset_numerical_data = self.remove_non_numerical_columns(subset_data)

        if subset_numerical_data.empty:
            raise ValueError(
                'No numerical data left after subsetting to the selected period.'
            )

        scaler = StandardScaler()
        # TODO check whether another transformation is preferable
        scaled_data = scaler.fit_transform(subset_numerical_data)

        if self.verbose:
            print('Succesfully scaled data.')

        return scaled_data

    def find_q(self):
        """Set ``self.q`` to the smallest number of components reaching the target.

        ``self.q`` becomes the smallest count of leading principal
        components whose cumulative explained-variance ratio is at least
        ``self.explained_var``. If the target is never reached (for example
        because of rounding), all components are used.
        """
        cumulative = np.cumsum(self.explained_variance)
        # searchsorted gives the 0-based position of the first cumulative
        # value >= target; the component *count* is that position plus one.
        n_components = int(np.searchsorted(cumulative, self.explained_var, side='left')) + 1
        self.q = min(n_components, len(cumulative))

    def fit_pca(self, X):
        """Fit a full PCA on ``X`` and store its components and variance ratios.

        ``self.q`` is only derived when it is ``None``; a value supplied by
        the caller or derived by an earlier fit is kept.
        """
        self.pca = PCA()
        self.pca.fit(X)

        self.principal_components = self.pca.components_
        self.explained_variance = self.pca.explained_variance_ratio_
        if self.q is None:
            self.find_q()

    def fit_pfa(self, X):
        """Select representative columns of ``X`` (Principal Feature Analysis).

        Requires ``fit_pca`` to have run. Stores the selected column
        positions in ``self.indices`` and the corresponding columns of ``X``
        in ``self.features``.
        """
        X = np.asarray(X)
        q = self.q
        # Each row of A_q describes one original column through its loadings
        # on the first q principal components.
        A_q = self.principal_components.T[:, :q]

        n_clusters = min(q + self.diff_n_features, X.shape[1])

        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(A_q)
        labels = kmeans.predict(A_q)
        centers = kmeans.cluster_centers_

        # Distance of every column to the centre of its own cluster.
        distances = np.linalg.norm(A_q - centers[labels], axis=1)

        # Keep, per cluster, the column nearest to the centre. Clusters are
        # recorded in order of first appearance and the strict '<' keeps the
        # lowest column position on ties.
        closest = {}
        for position, (label, distance) in enumerate(zip(labels, distances)):
            if label not in closest or distance < closest[label][1]:
                closest[label] = (position, distance)

        self.indices = [position for position, _ in closest.values()]
        self.features = X[:, self.indices]

    def fit(self, data, preprocess=True):
        """Fit the PCA (and, for ``method='pfa'``, the feature selection).

        Parameters
        ----------
        data
            A DataFrame when ``preprocess`` is true; otherwise an already
            scaled 2-D array-like.
        preprocess : bool
            Whether to run ``preprocess_data`` on ``data`` first.

        Returns
        -------
        Feature_Selection
            The fitted instance.
        """
        scaled_data = self.preprocess_data(data) if preprocess else data
        scaled_data = np.asarray(scaled_data)

        self.fit_pca(scaled_data)

        if self.verbose:
            print('Succesfully performed Principal Component Analysis')

        if self.method == 'pfa':
            self.fit_pfa(scaled_data)

            if self.verbose:
                print('Succesfully performed Principal Feature Analysis')

        self.fitted = True
        return self

    def transform(self, data, preprocess=True):
        """Apply the fitted reduction to ``data``.

        For ``method='pca'`` the (scaled) data is multiplied by the
        transposed principal components and the first ``q`` columns are
        returned and stored in ``self.reduced_data``; no re-centring is
        applied beyond the optional preprocessing. For ``method='pfa'`` the
        columns selected during fitting are taken from the (scaled) data.

        Raises
        ------
        RuntimeError
            If the instance has not been fitted.
        """
        # Fail before doing any preprocessing work on an unfitted model.
        if not self.fitted:
            raise RuntimeError('The model has not been fitted to the data.')

        scaled_data = self.preprocess_data(data) if preprocess else data
        scaled_data = np.asarray(scaled_data)

        if self.method == 'pca':
            components_t = np.transpose(self.principal_components)
            if self.verbose:
                print('shape of scaled data: ', np.shape(scaled_data))
                print('shape of transpose of principal components: ', np.shape(components_t))
            self.reduced_data = np.matmul(scaled_data, components_t)[:, :self.q]
            if self.verbose:
                print('shape of reduced data: ', np.shape(self.reduced_data))
            return self.reduced_data

        elif self.method == 'pfa':
            # Select from the data passed in, not from the columns cached at
            # fit time.
            return scaled_data[:, self.indices]

    def fit_transform(self, data):
        """Preprocess ``data`` once, fit on it and return its reduced form."""
        scaled_data = self.preprocess_data(data)

        # The data is already scaled, so fit/transform must not preprocess
        # it again.
        self.fit(scaled_data, preprocess=False)
        return self.transform(scaled_data, preprocess=False)