In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
def subset_stock_data(data, start_date, end_date, verbose=False):
    """Return the rows of ``data`` whose 'Date' falls inside a closed date range.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Date' column. The frame passed in is left unmodified.
    start_date, end_date : str, datetime-like or None
        Inclusive bounds, parsed with ``pd.to_datetime``. ``None`` leaves that
        side of the range unbounded.
    verbose : bool, default False
        If True, print a confirmation message.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows, with 'Date' as datetimes.

    Raises
    ------
    ValueError
        If ``data`` has no 'Date' column.
    """
    if 'Date' not in data.columns:
        raise ValueError("DataFrame does not contain a 'Date' column.")

    # Check the column dtype (a column is a Series, never a DatetimeIndex) and
    # convert into a local variable so the caller's frame is not mutated.
    dates = data['Date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates)

    # Start from "keep everything" and narrow only on the bounds that were given.
    mask = np.ones(len(data), dtype=bool)
    if start_date is not None:
        start_date = pd.to_datetime(start_date)
        mask &= (dates >= start_date).to_numpy()
    if end_date is not None:
        end_date = pd.to_datetime(end_date)
        mask &= (dates <= end_date).to_numpy()

    subset = data.loc[mask].copy()
    subset['Date'] = dates.loc[mask]

    if verbose:
        print(f'Succesfully subsetted data from {start_date} to {end_date}.')
    return subset

In [3]:
def remove_non_numerical_columns(data, verbose=False):
    """Return a copy of ``data`` without its non-numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    verbose : bool, default False
        If True and at least one column was removed, print the removed names.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns whose dtype is numeric according
        to ``pd.api.types.is_numeric_dtype``.
    """
    # Classify every column first. Dropping and returning must happen only
    # after the whole scan, otherwise later non-numeric columns survive and an
    # all-numeric frame yields None.
    non_numerical_columns = [
        column
        for column, dtype in data.dtypes.items()
        if not pd.api.types.is_numeric_dtype(dtype)
    ]

    # drop() without inplace returns a new frame, leaving the input untouched.
    numerical_data = data.drop(columns=non_numerical_columns)

    if verbose and non_numerical_columns:
        print("Succesfully removed columns with non-numerical values:", non_numerical_columns)

    return numerical_data

In [4]:
def preprocess_data(data, start_date=None, end_date=None, verbose = False):
    """Subset by date, keep numeric columns, and standard-scale the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame.
    start_date, end_date : optional
        Date bounds forwarded unchanged to ``subset_stock_data``.
    verbose : bool, default False
        Forwarded to the helper steps; also prints a message after scaling.

    Returns
    -------
    numpy.ndarray
        Output of ``StandardScaler().fit_transform`` on the numeric subset.

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.
    """
    # isinstance (rather than an exact type comparison) also accepts DataFrame
    # subclasses. TypeError is a subclass of Exception, so existing handlers
    # that catch Exception keep working.
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data must be a pandas dataframe')

    subset_data = subset_stock_data(data, start_date, end_date, verbose=verbose)
    subset_numerical_data = remove_non_numerical_columns(subset_data, verbose=verbose)

    # TODO: check whether a different transformation is preferable.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(subset_numerical_data)

    if verbose:
        print('Succesfully scaled data.')

    return scaled_data

In [5]:
def find_q(explained_variance, required_explained_var = 0.95):
    """Return how many leading components are needed to reach a variance target.

    Parameters
    ----------
    explained_variance : sequence of float
        Per-component explained-variance ratios, in component order.
    required_explained_var : float, default 0.95
        Cumulative ratio that must be reached.

    Returns
    -------
    int
        The smallest count ``q`` such that the first ``q`` entries sum to at
        least ``required_explained_var``. The result is a count, so it can be
        used directly as a slice end (``[:q]``). If the target is never reached
        (for example through floating-point shortfall), the total number of
        components is returned.
    """
    cumulative = 0.0
    # Count from 1: when the running sum first meets the target at position i
    # (0-based), i + 1 components are required.
    for n_components, ratio in enumerate(explained_variance, start=1):
        cumulative += ratio
        if cumulative >= required_explained_var:
            return n_components
    # Target not reached: keep every available component instead of failing on
    # an unbound local.
    return len(explained_variance)

In [6]:
def fit_pca(data):
    """Fit a default-configured PCA on ``data``.

    Returns a two-element list: the fitted model's
    ``explained_variance_ratio_`` followed by its ``components_``.
    """
    model = PCA().fit(data)
    variance_ratios = model.explained_variance_ratio_
    components = model.components_
    return [variance_ratios, components]

In [7]:
def fit_pfa(data, principal_components, q, diff_n_features, random_state=None):
    """Select representative feature columns by clustering component loadings.

    The transposed component matrix is truncated to its first ``q`` columns,
    its rows are clustered with KMeans, and for each cluster the row closest
    to the cluster centre is chosen. Those row positions are used as column
    indices into ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns are indexed by the selected positions.
    principal_components : numpy.ndarray
        Component matrix (presumably ``PCA.components_`` — confirm with caller).
    q : int
        Number of leading components to keep.
    diff_n_features : int
        Added to ``q`` to obtain the number of clusters, capped at
        ``data.shape[1]``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans``. Pass a fixed value for a reproducible
        selection; ``None`` keeps KMeans' default unseeded behaviour.

    Returns
    -------
    indices : list of int
        One selected row position per cluster, ordered by first appearance of
        each cluster label.
    features : numpy.ndarray
        ``data[:, indices]``.

    Raises
    ------
    ValueError
        If the resulting number of clusters is smaller than 1.
    """
    A_q = principal_components.T[:, :q]
    clusternumber = min(q + diff_n_features, data.shape[1])
    if clusternumber < 1:
        raise ValueError(
            f"Number of clusters must be at least 1, got {clusternumber} "
            f"(q={q}, diff_n_features={diff_n_features})."
        )

    kmeans = KMeans(n_clusters=clusternumber, random_state=random_state).fit(A_q)
    clusters = kmeans.predict(A_q)
    cluster_centers = kmeans.cluster_centers_

    # Track, per cluster, the row nearest to its centre. The strict '<' keeps
    # the earliest row on ties, and dict insertion order keeps clusters in
    # order of first appearance.
    closest = {}
    for i, c in enumerate(clusters):
        dist = euclidean_distances([A_q[i, :]], [cluster_centers[c, :]])[0][0]
        if c not in closest or dist < closest[c][1]:
            closest[c] = (i, dist)

    indices = [row for row, _ in closest.values()]
    features = data[:, indices]
    return indices, features

In [8]:
def transform_pca(data, fitted, principal_components,q, preprocess_data=None):
    """Project ``data`` onto the first ``q`` principal components.

    Parameters
    ----------
    data : array-like
        Input data, scaled by ``preprocess_data`` first if that is given.
    fitted : bool
        Must compare equal to True, otherwise an Exception is raised.
    principal_components : array-like
        Component matrix; its transpose is right-multiplied onto the data.
    q : int
        Number of leading components to keep.
    preprocess_data : callable, optional
        If given, called as ``preprocess_data(data)`` to obtain the array that
        is projected.

    Returns
    -------
    numpy.ndarray
        The projected data with ``q`` columns.

    Raises
    ------
    Exception
        If ``fitted`` does not compare equal to True.
    """
    # Fail fast: validate before running any preprocessing.
    if fitted != True:
        raise Exception('The model has not been fitted to the data.')

    if preprocess_data is not None:
        scaled_data = preprocess_data(data)
    else:
        scaled_data = data

    components_t = np.transpose(principal_components)

    print('shape of scaled data: ', np.shape(scaled_data))
    print('shape of transpose of principal components: ', np.shape(components_t))
    # Slice to the first q components before multiplying so the discarded
    # columns are never computed.
    reduced_data = np.matmul(np.array(scaled_data), components_t[:, :q])
    print('shape of reduced data: ', np.shape(reduced_data))
    return reduced_data

def transform_pfa(data, fitted, features, preprocess_data=None, indices=None):
    """Return the PFA-selected feature columns.

    Parameters
    ----------
    data : array-like
        Input data, scaled by ``preprocess_data`` first if that is given. Only
        used for the result when ``indices`` is supplied.
    fitted : bool
        Must compare equal to True, otherwise an Exception is raised.
    features : array-like
        Feature columns selected at fit time; returned unchanged when
        ``indices`` is None.
    preprocess_data : callable, optional
        If given, called as ``preprocess_data(data)``.
    indices : sequence of int, optional
        Column positions to select. When given, the columns are taken from the
        (optionally preprocessed) ``data`` instead of returning ``features``.

    Returns
    -------
    array-like
        ``features`` when ``indices`` is None, otherwise the selected columns
        of the (preprocessed) data as a numpy array.

    Raises
    ------
    Exception
        If ``fitted`` does not compare equal to True.
    """
    # Fail fast: validate before running any preprocessing.
    if fitted != True:
        raise Exception('The model has not been fitted to the data.')

    if preprocess_data is not None:
        scaled_data = preprocess_data(data)
    else:
        scaled_data = data

    if indices is None:
        return features
    return np.asarray(scaled_data)[:, indices]

In [9]:
def fit_transform(data, method, start_date=None, end_date=None, verbose=False):
    """Preprocess ``data``, fit PCA, and return the PCA- or PFA-reduced result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input, passed to ``preprocess_data``.
    method : str
        ``'pca'`` or ``'pfa'`` (case-insensitive).
    start_date, end_date : optional
        Date bounds forwarded to ``preprocess_data``.
    verbose : bool, default False
        Forwarded to ``preprocess_data``.

    Returns
    -------
    The output of ``transform_pca`` for ``'pca'``, or of ``transform_pfa`` for
    ``'pfa'``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'pca'`` nor ``'pfa'``.
    """
    # Normalise once so validation and dispatch agree on the spelling;
    # previously the check used lower case and the branches upper case, so no
    # branch could ever run for an accepted method.
    normalized = method.lower() if isinstance(method, str) else method
    if normalized not in ('pca', 'pfa'):
        raise ValueError("Method must be either 'pca' or 'pfa'")

    scaled_data = preprocess_data(data, start_date=start_date, end_date=end_date, verbose=verbose)

    # Both methods start from the same PCA fit and component count.
    explained_variance, principal_components = fit_pca(scaled_data)
    q = find_q(explained_variance)

    if normalized == 'pca':
        return transform_pca(scaled_data, True, principal_components, q)

    diff_n_features = 0
    indices, features = fit_pfa(scaled_data, principal_components, q, diff_n_features)
    return transform_pfa(scaled_data, True, features)