# Feature Analysis

To test feature relevance for the prediction model, we consider tests based on the two types of predictions we want to evaluate:

1. Regression

    1. Models: RandomForest, LSTM


2. Classification

    1. Models: RandomForest, SVM

## Methodology

### Hybrid Method

The methodology consists of two main steps:

1. Feature Filtering

    1. Simple correlation

    2. Causality

    3. Homoskedasticity and Endogeneity 

    4. mFCBF algorithm (correlation algorithm)

2. Model application (regression/classification)
    
    This step will serve as our evaluation metric.

    The method will GridSearch for the best parameters, and the full distribution of results is used as the evaluation criteria.


In [82]:
import warnings
from functions import *
from trading import Asset
from datetime import date, datetime
from trading.func_aux import get_assets, min_max
import json
import pandas as pd
from copy import deepcopy, copy

#### Aux cells

In [None]:
# Scratch instance for ad-hoc experiments: the AAPL asset of the "gbm" broker,
# quoted in "mx", for 2000-01-01 .. 2022-01-01, read from the database.
# NOTE(review): the other cells in this notebook end on 2022-02-01 — confirm
# the different end date here is intentional.
inst = Asset(
    symbol = "AAPL",
    broker = "gbm",
    fiat = "mx",
    start = date(2000,1,1),
    end = date(2022,1,1),
    frequency = "1m",
    from_ = "db"
)

## Configuration
### Update Google Trends

In [None]:
# Build the Google Trends keyword map: for every symbol listed under the "gbm"
# broker, register three search terms — the lower-cased symbol itself,
# "<symbol> price" and "<symbol> stock".
gt = {}
for i in get_assets()["gbm"]:
    aux = i.lower()
    gt[i] = {"google_trends": [ aux, "{} price".format(aux), "{} stock".format(aux) ]}

# Persist the map as JSON in the current working directory.
with open("google_trends_keywords.json", "w") as fp:
    json.dump( gt, fp )

## add_assets -broker gbm -json google_trends_keywords.json -augment

### Update DB with Google Trends

In [None]:
# For every "gbm" asset, build its Asset instance and request its Google Trends
# data with from_="api".  The returned frame is not used in this cell.
# NOTE(review): presumably the call also stores the downloaded data in the
# database (per the section title) — confirm in Asset.google_trends.
for i in get_assets()["gbm"]:
    print(i)
    inst = Asset( i , fiat = "mx", broker = "gbm", start = date(2000,1,1), end = date(2022,2,1), frequency="1m")
    df = inst.google_trends(from_="api")

## Auxiliary Functions

In [2]:
def features(df, periods = 3):
    """Append percentage-change columns for every column of ``df``.

    For each column present when the function is called and each lag ``i`` in
    ``1..periods``, a column named ``"<column>_<i>"`` holding
    ``df[column].pct_change(periods=i)`` is written into ``df`` (in place).

    Returns the same, mutated ``df`` and the list of added column names,
    ordered by source column first and lag second.
    """
    # Fix the (column, lag) work list up front so that columns created below
    # are never treated as inputs.
    pairs = [
        (column, lag)
        for column in list(df.columns)
        for lag in range(1, periods + 1)
    ]

    for column, lag in pairs:
        df[ "{}_{}".format(column, lag) ] = df[ column ].pct_change( periods = lag )

    return df, [ "{}_{}".format(column, lag) for column, lag in pairs ]

In [3]:
def target(df, target = "close", periods = 3):
    """Add forward-looking percentage-change columns to ``df``.

    For every horizon ``i`` in ``1..periods`` the column ``"target_<i>"`` is
    set to ``df[target].pct_change(periods=i).shift(-i)``, i.e. the change of
    the ``target`` column over the *next* ``i`` rows.  ``df`` is modified in
    place and also returned.
    """
    for horizon in range(1, periods + 1):
        label = "{}_{}".format("target", horizon)
        change = df[ target ].pct_change( periods = horizon )
        # Shift backwards so each row carries the change that follows it.
        df[ label ] = change.shift( -horizon )

    return df

In [4]:
def grid_search(df, params, **kwargs):
    """Run ``MyGridSearch`` for one model configuration and return its cache.

    Parameters
    ----------
    df
        Data handed unchanged to ``MyGridSearch``.
    params : dict
        Model configuration with the keys ``"regr"`` (estimator), ``"error"``
        (metric callable) and the hyper-parameter grid.  The grid is read from
        ``"parameters"`` when present, otherwise from the historical
        misspelling ``"paremeters"``, so configurations written with either
        key work.
    **kwargs
        Optional overrides: ``train_test_split`` (default ``0.8``),
        ``target`` (default ``"target"``) and ``error_ascending`` (default
        ``True``).  Other keys are ignored.

    Returns
    -------
    The ``cache`` attribute of the ``MyGridSearch`` instance after ``test()``
    has been called on it.

    Raises
    ------
    KeyError
        If ``params`` lacks ``"regr"``, ``"error"`` or both spellings of the
        grid key.
    """
    # Accept both spellings; fall through to the legacy key so a missing grid
    # still raises the same KeyError as before.
    if "parameters" in params:
        grid = params["parameters"]
    else:
        grid = params["paremeters"]

    m = MyGridSearch(
        df,
        regr = params["regr"],
        parameters = grid,
        train_test_split = kwargs.get("train_test_split", 0.8),
        target = kwargs.get("target", "target"),
        error = params["error"],
        error_ascending = kwargs.get("error_ascending", True)
    )

    m.test()

    return m.cache

In [5]:
# Model configurations consumed by grid_search().  Every entry provides:
#   "regr"       -> the estimator instance,
#   "paremeters" -> the hyper-parameter grid (the spelling is the key that
#                   grid_search() looks up, so it must be used consistently),
#   "error"      -> the metric callable.

# Regression models.
regr = {
    "rf":{
        "regr":RandomForestRegressor(),
        "paremeters":{
            "n_estimators":[10, 20, 50, 100, 200],
            "criterion":["squared_error", "absolute_error"]
        },
        "error":mean_squared_error
    },

    # TODO: add the LSTM configuration ("regr_lstm").
}

# Classification models.
clf = {
    "clf_rf":{
        "regr":RandomForestClassifier(),
        "paremeters":{
            "n_estimators":[10, 20, 50, 100, 200],
            "criterion":["gini", "entropy"]
        },
        "error":precision_score
    },

    "clf_svm":{
        "regr":SVC(),
        # Was "parameters": unlike every other entry, which made grid_search()
        # fail with a KeyError for this model.
        "paremeters":{
            "C":[0.01, 0.1, 1.0, 10],
            "kernel":["linear", "poly", "rbf", "sigmoid"],
        },
        "error":precision_score
    }
}


In [6]:
def model_regr(inst, target, **kwargs):
    """Grid-search every configured regression model on ``inst.df``.

    Each entry of the module-level ``regr`` mapping is passed to
    ``grid_search`` together with ``inst.df``, the ``target`` column name and
    any extra keyword arguments.

    Returns a dict with the same keys as ``regr`` whose values are the
    corresponding ``grid_search`` results.
    """
    return {
        name: grid_search( inst.df, config, target = target, **kwargs )
        for name, config in regr.items()
    }

In [83]:
def main(func, targets = 5 , type = "regr", feature_relevance = ("causality",), max_assets = 10, **kwargs):
    """Run the feature-relevance experiment over the "gbm" assets.

    For each asset the feature builder ``func`` is applied once; then, for
    every feature-relevance method and every target horizon, a deep copy of
    the prepared instance is filtered and (for ``type == "regr"``) passed to
    ``model_regr``.

    Parameters
    ----------
    func : callable
        Feature builder, called as ``func(inst, type=type)``; it must return
        the instance whose ``df`` holds the features and the ``target_<i>``
        columns.
    targets : int
        Number of horizons; the target columns are ``target_1..target_<n>``.
    type : str
        ``"regr"`` fits the regression models.  Any other value currently only
        performs the feature selection (no model is fitted).
    feature_relevance : iterable
        Names among ``"causality"``, ``"correlation"`` and ``"mFCBF"``, or
        ``None`` for "no filter".
    max_assets : int or None
        Only the first ``max_assets`` assets are processed (default 10, the
        previously hard-coded limit); ``None`` processes all of them.
    **kwargs
        Forwarded to ``model_regr``.

    Returns
    -------
    dict
        ``results[asset][method][target]`` with the key ``"cols"`` (set of
        selected feature columns, empty when the filter kept nothing) and,
        when a model was fitted, ``"regr"``.

    Raises
    ------
    KeyError
        If a name in ``feature_relevance`` is not a known method.
    """

    targets = [ "{}_{}".format( "target", i ) for i in range(1, targets+1) ]

    assets = get_assets()[ "gbm" ]
    results = {}

    for i in list(assets.keys())[ :max_assets ] :
        results[i] = {}

        inst = Asset(
            symbol=i,
            broker="gbm",
            fiat = "mx",
            start = date(2000,1,1),
            end = date(2022,2,1),
            frequency="1m",
            from_ = "db"
        )

        inst = func( inst, type = type )

        for fr in feature_relevance:

            results[i][fr] = {}

            # Resolve the filter once per method rather than once per target.
            if fr is None:
                filter_func = None
            else:
                filter_func = {
                    "causality":causality,
                    "correlation":correlation,
                    "mFCBF":mFCBF
                }[fr]

            for target in targets:

                entry = results[i][fr][target] = {}

                # The filters modify the instance, so each run gets its own copy.
                inst_aux = deepcopy(inst)

                if filter_func is not None:
                    # The filters receive their own copy of the target list.
                    inst_aux, good = filter_func(inst_aux, copy(targets), target)
                else:
                    # Without a filter the other horizons must still be
                    # removed: they are future returns and would otherwise be
                    # used as features for this target.
                    others = [ t for t in targets if t != target ]
                    inst_aux.df = inst_aux.df.drop( columns = others )
                    good = True

                if not good:
                    # Nothing was selected; do not report the unfiltered frame.
                    entry["cols"] = set()
                    warnings.warn("Target {} for {} with {} has no attributes".format( target, i, fr ))
                    continue

                entry["cols"] = set( inst_aux.df.columns ) - set([ target ])

                if type == "regr":
                    entry["regr"] = model_regr( inst_aux, target=target, **kwargs )
                else:
                    # TODO: classification models are not wired in yet.
                    pass

    return results

### Feature Relevance

In [75]:
def causality(inst, targets, target):
    """Keep only the columns of ``inst.df`` selected by ``inst.causality``.

    Parameters
    ----------
    inst
        Object exposing ``df`` and a ``causality`` method.  ``inst.df`` is
        modified: rows containing any NaN are dropped in place (this happens
        before the other target columns are excluded, so their NaNs count as
        well) and, on success, the frame is reduced to the selected columns
        plus ``target``.
    targets : list of str
        All target column names; every one except ``target`` is excluded from
        the test.  The list itself is not modified.
    target : str
        Target column under test.  Its numeric suffix (the text after the last
        ``"_"``) is used as the lag.

    Returns
    -------
    (inst, bool)
        ``False`` when no column was selected (``inst.df`` then keeps all its
        columns), ``True`` otherwise.

    Raises
    ------
    ValueError
        If ``target`` is not in ``targets``, if another target is not a
        column of ``inst.df``, or if the suffix of ``target`` is not an
        integer.
    """
    # Work on a copy so the caller's list is left untouched.
    others = list(targets)
    others.remove(target)

    cols = list(inst.df.columns)
    for other in others:
        cols.remove( other )

    lag = int( target.split("_")[-1] )

    inst.df.dropna( inplace = True )

    flags = inst.causality( df = inst.df[cols], targets = [target],  lag = lag, verbose = 0)

    # Rows of the result whose value in the `target` column equals 1 are the
    # selected feature names.
    selected = flags[ flags[target] == 1 ].index.tolist()

    if not selected:
        return inst, False

    inst.df = inst.df[ selected + [target] ]

    return inst, True


In [76]:
def correlation(inst, targets, target, threshold = 0.9):
    """Keep only the columns of ``inst.df`` selected through ``inst.corr``.

    Parameters
    ----------
    inst
        Object exposing ``df`` and a ``corr`` method.  On success ``inst.df``
        is replaced by the selected columns plus ``target``.
    targets : list of str
        All target column names; every one except ``target`` is excluded from
        the analysis.  The list itself is not modified.
    target : str
        Target column under test.
    threshold : float
        Rows of the ``inst.corr`` result whose value in the ``target`` column
        is strictly below this value are kept (default ``0.9``, the previously
        hard-coded cut).

    Returns
    -------
    (inst, bool)
        ``False`` when no column was selected (``inst.df`` is then left
        unchanged), ``True`` otherwise.

    Raises
    ------
    ValueError
        If ``target`` is not in ``targets`` or another target is not a column
        of ``inst.df``.
    """
    # NOTE(review): what `inst.corr` returns is not visible here — confirm that
    # "value < threshold" is the intended direction of the cut.

    # Work on a copy so the caller's list is left untouched.
    others = list(targets)
    others.remove(target)

    cols = list(inst.df.columns)
    for other in others:
        cols.remove( other )

    scores = inst.corr( df = inst.df[ cols ], targets = [target] )

    selected = scores[ scores[target] < threshold ].index.tolist()

    if not selected:
        return inst, False

    inst.df = inst.df[ selected + [target] ]

    return inst, True

In [77]:
def mFCBF(inst, targets, target, relevance_threshold = 0.2, redundancy_threshold = 0.8):
    """Two-stage correlation filter: relevance to ``target``, then redundancy.

    1. Relevance: columns whose ``DataFrame.corr`` value with ``target`` is
       strictly above ``relevance_threshold`` are kept.
    2. Redundancy: the correlation matrix of those columns is passed to
       ``inst.redundancy(..., above=False)`` with ``redundancy_threshold``;
       its return value is the final selection.

    Parameters
    ----------
    inst
        Object exposing ``df`` and a ``redundancy`` method.  On success
        ``inst.df`` is replaced by the selected columns plus ``target``.
    targets : list of str
        All target column names; every one except ``target`` is excluded.
        The list itself is not modified.
    target : str
        Target column under test.
    relevance_threshold, redundancy_threshold : float
        Cut-offs of the two stages (defaults ``0.2`` and ``0.8``, the
        previously hard-coded values).

    Returns
    -------
    (inst, bool)
        ``False`` when no column survives either stage, ``True`` otherwise.

    Raises
    ------
    ValueError
        If ``target`` is not in ``targets`` or another target is not a column
        of ``inst.df``.
    """
    # Work on a copy so the caller's list is left untouched.
    others = list(targets)
    others.remove(target)

    cols = list(inst.df.columns)
    for other in others:
        cols.remove( other )

    corr_matrix = inst.df[ cols ].corr()

    # NOTE(review): the cut uses the signed correlation, so strongly negative
    # columns are discarded — confirm that is intended.
    relevant = [
        c for c in corr_matrix[ corr_matrix[target] > relevance_threshold ].index.tolist()
        if c != target
    ]

    # Nothing relevant: skip the redundancy stage instead of running it on an
    # empty frame.
    if not relevant:
        return inst, False

    # `inst.df` is backed up and restored around the call, presumably because
    # `inst.redundancy` may overwrite it — confirm in the Asset implementation.
    df = copy(inst.df)

    selected = inst.redundancy( df = inst.df[ relevant ].corr(), threshold = redundancy_threshold, above = False )

    inst.df = df

    if len(selected) == 0:
        return inst, False

    inst.df = inst.df[ selected + [target] ]

    return inst, True

## Results

### Historic Data

In [78]:
def func_historic_data(inst, type = "regr", periods = 5):
    """Turn the historic columns of ``inst.df`` into model features and targets.

    Steps, all applied to ``inst.df``:

    1. ``features`` adds the 1..``periods`` percentage changes of every
       original column.
    2. For ``type == "clf"`` those new columns are binarised to ``1.0`` (value
       greater than zero) or ``0.0``; undefined changes stay NaN instead of
       being counted as ``0``.
    3. ``target`` adds the ``target_1..target_<periods>`` columns (computed
       from its default source column).
    4. The original columns are dropped, except ``"volume"``, which is kept.

    Parameters
    ----------
    inst
        Object exposing a ``df`` DataFrame; it is updated and returned.
    type : str
        ``"regr"`` (default) or ``"clf"``.
    periods : int
        Number of lags and horizons (default 5, the previously hard-coded
        value).

    Raises
    ------
    ValueError
        If ``inst.df`` has no ``"volume"`` column.
    """

    original_cols = list(inst.df.columns)

    df, new_cols = features( inst.df, periods = periods )

    if type == "clf":
        for c in new_cols:
            # Keep missing changes as NaN: mapping them to 0 would label the
            # first rows of every lag as "not positive".
            df[ c ] = ( df[ c ] > 0 ).astype( float ).where( df[ c ].notna() )

    df = target( df, periods = periods )

    original_cols.remove( "volume" )

    df.drop(columns = original_cols, inplace = True )

    inst.df = df

    return inst

In [79]:
# Run the historic-data experiment: five target horizons, each evaluated with
# the mFCBF, causality and correlation feature filters.
r = main( func = func_historic_data, targets=5, feature_relevance= [ "mFCBF", "causality", "correlation"] )

Target target_1 for AAL with mFCBF has no attributes
Target target_3 for AAL with mFCBF has no attributes
Target target_4 for AAL with mFCBF has no attributes
Target target_5 for AAL with mFCBF has no attributes
Target target_1 for AAPL with mFCBF has no attributes
Target target_2 for AAPL with mFCBF has no attributes
Target target_3 for AAPL with mFCBF has no attributes
Target target_4 for AAPL with mFCBF has no attributes
Target target_5 for AAPL with mFCBF has no attributes
Target target_2 for AAXJ with mFCBF has no attributes
Target target_1 for ABBV with mFCBF has no attributes
Target target_2 for ABBV with mFCBF has no attributes
Target target_3 for ABBV with mFCBF has no attributes
Target target_4 for ABBV with mFCBF has no attributes
Target target_5 for ABBV with mFCBF has no attributes
Target target_1 for ABT with mFCBF has no attributes
Target target_3 for ABT with mFCBF has no attributes
Target target_4 for ABT with mFCBF has no attributes
Target target_5 for ABT with mFCBF 

### Google Trends

In [None]:
def func_google_trends(inst, type = "regr"):
    """Feature builder based on Google Trends data (work in progress).

    Requests the stored trends through ``inst.google_trends(from_="db")`` and
    returns ``inst``.

    Parameters
    ----------
    inst
        Object exposing a ``google_trends`` method.
    type : str
        Accepted so the function can be called as ``func(inst, type=...)``;
        currently unused.

    Returns
    -------
    The ``inst`` that was passed in.
    """
    # TODO: the fetched trends are not yet turned into feature/target columns
    # of inst.df.
    inst.google_trends( from_ = "db" )

    return inst


### MacroEconomic Variables Analysis

In [None]:
def func_mev(inst):
    """Placeholder for the macroeconomic-variables feature builder.

    Not implemented yet: ``inst`` is ignored and ``None`` is returned.
    """
    return None

### Technical Analysis features

In [None]:
def func_ta(inst):
    """Placeholder for the technical-analysis feature builder.

    Not implemented yet: ``inst`` is ignored and ``None`` is returned.
    """
    return None