In [3]:
!pip install yfinance



In [4]:
# read files shared via google-drive-link
# https://stackoverflow.com/questions/62759748/downloading-data-from-a-shared-google-drive-link-in-google-colab

!pip uninstall gdown -y && pip install gdown
!gdown -V

Found existing installation: gdown 5.1.0
Uninstalling gdown-5.1.0:
  Successfully uninstalled gdown-5.1.0
Collecting gdown
  Using cached gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
Successfully installed gdown-5.1.0
gdown 4.7.3 at /usr/local/lib/python3.10/dist-packages


In [2]:
# IMPORTS
import numpy as np
import pandas as pd

#Fin Data Sources
import yfinance as yf
import pandas_datareader as pdr

#Data viz
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import time
from datetime import date

# for graphs
import matplotlib.pyplot as plt

# 0) Dataset for Modeling: Final Preparations

## 0.1) Importing data from Drive & defining variable sets
* automated version need to have a daily updated file/database entries

In [3]:
# truncated
# df = pd.read_parquet("/content/stocks_df_combined_trunc_2014_2023.parquet.brotli", )

# full dataset for 33 stocks
df_full = pd.read_parquet(r"C:\Users\msofy\Downloads\stock-markets-analytics-zoomcamp-2024\03-modeling\stocks_df_combined_2024_05_07.parquet.brotli", )


In [4]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 221142 entries, 0 to 5426
Columns: 202 entries, Open to growth_btc_usd_365d
dtypes: datetime64[ns](3), float64(128), int32(64), int64(5), object(2)
memory usage: 288.5+ MB


In [5]:
df_full.keys()

Index(['Open', 'High', 'Low', 'Close', 'Adj Close_x', 'Volume', 'Ticker',
       'Year', 'Month', 'Weekday',
       ...
       'growth_brent_oil_7d', 'growth_brent_oil_30d', 'growth_brent_oil_90d',
       'growth_brent_oil_365d', 'growth_btc_usd_1d', 'growth_btc_usd_3d',
       'growth_btc_usd_7d', 'growth_btc_usd_30d', 'growth_btc_usd_90d',
       'growth_btc_usd_365d'],
      dtype='object', length=202)

In [6]:
# growth indicators (but not future growth)
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]
GROWTH

['growth_1d',
 'growth_3d',
 'growth_7d',
 'growth_30d',
 'growth_90d',
 'growth_365d',
 'growth_dax_1d',
 'growth_dax_3d',
 'growth_dax_7d',
 'growth_dax_30d',
 'growth_dax_90d',
 'growth_dax_365d',
 'growth_snp500_1d',
 'growth_snp500_3d',
 'growth_snp500_7d',
 'growth_snp500_30d',
 'growth_snp500_90d',
 'growth_snp500_365d',
 'growth_dji_1d',
 'growth_dji_3d',
 'growth_dji_7d',
 'growth_dji_30d',
 'growth_dji_90d',
 'growth_dji_365d',
 'growth_epi_1d',
 'growth_epi_3d',
 'growth_epi_7d',
 'growth_epi_30d',
 'growth_epi_90d',
 'growth_epi_365d',
 'growth_gold_1d',
 'growth_gold_3d',
 'growth_gold_7d',
 'growth_gold_30d',
 'growth_gold_90d',
 'growth_gold_365d',
 'growth_wti_oil_1d',
 'growth_wti_oil_3d',
 'growth_wti_oil_7d',
 'growth_wti_oil_30d',
 'growth_wti_oil_90d',
 'growth_wti_oil_365d',
 'growth_brent_oil_1d',
 'growth_brent_oil_3d',
 'growth_brent_oil_7d',
 'growth_brent_oil_30d',
 'growth_brent_oil_90d',
 'growth_brent_oil_365d',
 'growth_btc_usd_1d',
 'growth_btc_usd_3d',


In [7]:
# leaving only Volume ==> generate ln(Volume)
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']

In [8]:
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']

In [9]:
TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]
TO_PREDICT

['growth_future_5d', 'is_positive_growth_5d_future']

In [10]:
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV
TO_DROP

['Year',
 'Date',
 'index_x',
 'index_y',
 'index',
 'Quarter',
 'Adj Close_y',
 'Month',
 'Weekday',
 'Ticker',
 'ticker_type',
 'Open',
 'High',
 'Low',
 'Close',
 'Adj Close_x',
 'Volume']

In [11]:
# let's define on more custom numerical features
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

In [12]:
# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

In [13]:
# All Supported Ta-lib indicators: https://github.com/TA-Lib/ta-lib-python/blob/master/docs/funcs.md

TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

In [14]:
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
print(f'Technical patterns count = {len(TECHNICAL_PATTERNS)}, examples = {TECHNICAL_PATTERNS[0:5]}')


Technical patterns count = 61, examples = ['cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside']


In [15]:
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

In [16]:
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

In [17]:
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]
OTHER

['growth_future_5d', 'is_positive_growth_5d_future']

In [18]:
df_full.Ticker.nunique()

33

In [19]:
# tickers, min-max date, count of daily observations
df_full.groupby(['Ticker'])['Date'].agg(['min','max','count'])

Unnamed: 0_level_0,min,max,count
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,1980-12-12,2024-05-07,10941
ACN,2001-07-19,2024-05-07,5736
AMZN,1997-05-15,2024-05-07,6789
ASML,1995-03-15,2024-05-07,7338
AVGO,2009-08-06,2024-05-07,3713
BHARTIARTL.NS,2002-07-01,2024-05-07,5424
BRK-B,1996-05-09,2024-05-07,7046
CDI.PA,1992-01-27,2024-05-07,8328
GOOG,2004-08-19,2024-05-07,4963
HDB,2001-07-20,2024-05-07,5735


In [20]:
# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date>='2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 203 entries, Open to ln_volume
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 239.7+ MB


In [21]:
# let look at the features count and size:
df[NUMERICAL].info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 184 entries, growth_1d to DGS10
dtypes: float64(121), int32(62), int64(1)
memory usage: 214.6 MB


## 0.2) [Code snippet 1] Generating dummies

In [22]:
# what are the categorical features?
CATEGORICAL

['Month', 'Weekday', 'Ticker', 'ticker_type']

In [23]:
# dummy variables are not generated from Date and numeric variables
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)

In [24]:
# Generate dummy variables (no need for bool, let's have int32 instead)
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')

Question 1 (1 point): Dummies on Month and Week-of-Month

Find the ABSOLUTE CORRELATION VALUE of the most correlated dummy <month-week_of_month> with the binary outcome variable is_positive_growth_5d_future?

You saw in the correlation analysis and modeling that September and October may be important seasonal months. In this task, we'll go futher and try to generate dummies for Month and Week-of-month (starting from 1). For example, the first week of October should be coded similar to this: 'October_w1'. Once you've generated the new set of variables, find the most correlated (in absolute value) one with is_positive_growth_5d_future and round it to 3 digits after the comma.

Suggested path to a solution:

    [Source] Use this formula to get the week of month for the datetime variable d: (d.day-1)//7+1
    Define a new string variable for all month-week_of_month combinations. Append it to the CATEGORICAL features set. You should have 5 variables treated as CATEGORICAL now: 'Month', 'Weekday', 'Ticker', 'ticker_type', 'month_wom'. In the end, you should get 115 dummy features, including 60 (=12*5) week_month_of_week dummies.
    Use pandas.get_dummies() to generate dummies.
    Use pandas.DataFrame.corr() function (also used in [Code Snippet 1]) to get correlations with is_positive_growth_5d_future, filter out only variables representing the new dummy set, and sort it by absolute values (you can define a new column "abs_corr" in the dataframe with correlations), and find the highest value (among the new dummies features set).

NOTE: new dummies will be used as features in the next tasks, please leave them in the dataset.

In [25]:
# TODO 1: define more categorical features, e.g. all combinations for <September+weekday>  (you'll see that September is actually an important dummy in one of the models)

df['month'] = df['Date'].dt.strftime('%B')
df['week_of_month'] = ((df['Date'].dt.day - 1) // 7 + 1).astype(str)
df['month_wom'] = df['month'] + '_w' + df['week_of_month']
df['month_wom'] 

# Append 'month_wom' to the categorical features set
CATEGORICAL.append('month_wom')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['Date'].dt.strftime('%B')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['week_of_month'] = ((df['Date'].dt.day - 1) // 7 + 1).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month_wom'] = df['month'] + '_w' + df['week_of_month']


In [26]:
month_wom = df['month_wom']
month_wom

3490    January_w1
3491    January_w1
3492    January_w1
3493    January_w1
3494    January_w1
           ...    
5422      April_w5
5423        May_w1
5424        May_w1
5425        May_w1
5426        May_w1
Name: month_wom, Length: 182675, dtype: object

In [27]:
month_wom_dummy = pd.get_dummies(month_wom)
correlations = month_wom_dummy.corrwith(df["is_positive_growth_5d_future"])
correlations

April_w1       -0.004023
April_w2       -0.000227
April_w3        0.009226
April_w4       -0.001560
April_w5       -0.008793
August_w1      -0.011796
August_w2      -0.010177
August_w3      -0.005565
August_w4       0.006878
August_w5       0.002611
December_w1    -0.002635
December_w2    -0.008006
December_w3     0.018295
December_w4     0.019270
December_w5    -0.004641
February_w1    -0.000059
February_w2     0.011074
February_w3    -0.024578
February_w4    -0.005811
February_w5    -0.001533
January_w1     -0.006406
January_w2     -0.012696
January_w3     -0.007901
January_w4      0.003551
January_w5     -0.001005
July_w1         0.003738
July_w2         0.012207
July_w3         0.001343
July_w4        -0.001563
July_w5        -0.004213
June_w1        -0.017471
June_w2        -0.005349
June_w3        -0.019754
June_w4         0.013450
June_w5         0.007163
March_w1       -0.009195
March_w2        0.000036
March_w3       -0.001615
March_w4        0.026058
March_w5        0.013572


In [28]:
correlations[[correlations.abs().idxmax()]]

September_w3   -0.034537
dtype: float64

In [29]:
dummy_variables = pd.concat([dummy_variables, month_wom_dummy.astype(int)], axis=1)
dummy_variables.info

<bound method DataFrame.info of       Month_April  Month_August  Month_December  Month_February  \
3490            0             0               0               0   
3491            0             0               0               0   
3492            0             0               0               0   
3493            0             0               0               0   
3494            0             0               0               0   
...           ...           ...             ...             ...   
5422            1             0               0               0   
5423            0             0               0               0   
5424            0             0               0               0   
5425            0             0               0               0   
5426            0             0               0               0   

      Month_January  Month_July  Month_June  Month_March  Month_May  \
3490              1           0           0            0          0   
3491              1  

In [30]:
print(f'We have {len(dummy_variables.columns)} dummy')

We have 115 dummy


In [31]:
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()

In [32]:
# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [33]:
df_with_dummies[NUMERICAL+DUMMIES].info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 299 entries, growth_1d to September_w5
dtypes: float64(121), int32(177), int64(1)
memory usage: 294.8 MB


## 0.4) [Code snippet 3] Temporal split of ~25 years of data (by date)

In [34]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.6).
        val_prop (float): Proportion of data for validation set (default: 0.2).
        test_prop (float): Proportion of data for test set (default: 0.2).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [35]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(df_with_dummies,
                                 min_date = min_date_df,
                                 max_date = max_date_df)

In [36]:
df_with_dummies['split'].value_counts()/len(df_with_dummies)

split
train         0.675834
test          0.163290
validation    0.160876
Name: count, dtype: float64

In [37]:
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

In [38]:
new_df.groupby(by='split')['growth_future_5d'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test,29664.0,1.005015,0.040835,0.690219,0.981994,1.004731,1.027028,1.393477
train,123458.0,1.003965,0.053826,0.412383,0.978474,1.003197,1.028354,3.018887
validation,29388.0,1.004417,0.040642,0.668581,0.985343,1.00512,1.023999,1.459217


# 1) Modeling: "rule of thumb" or hand-predictions

## 1.1) Review all the inputs again

In [39]:
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

In [40]:
# Full dataframe (transformed and truncated to 25 years)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 322 entries, Open to split
dtypes: datetime64[ns](2), float64(129), int32(178), int64(5), object(8)
memory usage: 326.1+ MB


In [41]:
# check one record: it has abs. values, text, and numbers
new_df.head(1)

Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,October_w2,October_w3,October_w4,October_w5,September_w1,September_w2,September_w3,September_w4,September_w5,split
3490,58.6875,59.3125,56.0,58.28125,36.065567,53228400.0,MSFT,2000,January,0,...,0,0,0,0,0,0,0,0,0,train


In [42]:
# time split on train/validation/test: FIXED dates of split, approx. 70%, 15%, 15% split
new_df.groupby(['split'])['Date'].agg({'min','max','count'})

Unnamed: 0_level_0,count,min,max
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,29829,2020-09-14,2024-05-07
train,123458,2000-01-03,2017-01-16
validation,29388,2017-01-17,2020-09-11


In [43]:
# what we try to predict
new_df[TO_PREDICT].head(1)

Unnamed: 0,growth_future_5d,is_positive_growth_5d_future
3490,0.963003,0


In [44]:
# to be used as features
new_df[NUMERICAL+DUMMIES].head(1)

Unnamed: 0,growth_1d,growth_3d,growth_7d,growth_30d,growth_90d,growth_365d,growth_dax_1d,growth_dax_3d,growth_dax_7d,growth_dax_30d,...,October_w1,October_w2,October_w3,October_w4,October_w5,September_w1,September_w2,September_w3,September_w4,September_w5
3490,0.998394,0.988341,0.991494,1.372333,1.222951,2.063053,0.970196,0.983855,1.051736,1.134572,...,0,0,0,0,0,0,0,0,0,0


## 1.2) [Code Snippet 3] Manual "hand rule" predictions
* CCI (binary, on technical indicator CCI)
* growth_1d>0
* growth_1d>0 & growth_snp500_1d>0

In [45]:
# why does it work?
# compare a vector (pandas.core.series.Series) with scalar 200 ==> element_wise comparison with the number
new_df.cci>200

3490    False
3491    False
3492    False
3493    False
3494    False
        ...  
5422    False
5423    False
5424    False
5425    False
5426    False
Name: cci, Length: 182675, dtype: bool

In [46]:
# generate manual predictions
# Let's label all prediction features with prefix "pred"
new_df['pred0_manual_cci'] = (new_df.cci>200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_1d>1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_1d'] > 1) & (new_df['growth_snp500_1d'] > 1)).astype(int)

Question 2 (2 points): Define new "hand" rules on macro and technical indicators variables

What is the precision score for the best of the NEW predictions (pred3 or pred4), rounded to 3 digits after the comma?

Let's utilize the knowledge from the visualised tree (clf10) (Code Snippet 5: 1.4.4 Visualisation):

    You're asked to define two new 'hand' rules (leading to 'positive' subtrees):
        pred3_manual_gdp_fastd: (gdppot_us_yoy <= 0.027) & (fastd >= 0.251)
        pred4_manual_gdp_wti_oil: (gdppot_us_yoy >= 0.027) & (growth_wti_oil_30d <= 1.005)

    Extend the Code Snippet 3 (Manual "hand rule" predictions): Calculate and add new rules (pred3 and pred4) to the dataframe.You should notice that one of the predictions doesn't have any positive predictions on TEST dataset (while it has many on TRAIN+VALIDATION).

    Debug: check in the new_df and the original dataset/data generation process that we didn't make any mistakes during the data transformation step.

    Explain why this can happen even if there are no errors in the data features.

    As a result, write down the precision score for the remaining predictor (round to three decimal points). E.g. if you have 0.57897, your answer should be 0.579.

In [48]:
# TODO 2: find more "hand rules" - can get it from decision trees important factors, or randomly build on other most popular macro/tech indicators/ manual_features

# Define new 'hand' rules based on the visualized tree (clf10)
new_df['pred3_manual_gdp_fastd'] = ((new_df['gdppot_us_yoy'] <= 0.027) & (new_df['fastd'] >= 0.251)).astype(int)
new_df['pred4_manual_gdp_wti_oil'] = ((new_df['gdppot_us_yoy'] >= 0.027) & (new_df['growth_wti_oil_30d'] <= 1.005)).astype(int)

# Check the new_df to ensure the columns are added correctly
new_df[['gdppot_us_yoy', 'fastd', 'growth_wti_oil_30d', 'pred3_manual_gdp_fastd', 'pred4_manual_gdp_wti_oil']].head()


Unnamed: 0,gdppot_us_yoy,fastd,growth_wti_oil_30d,pred3_manual_gdp_fastd,pred4_manual_gdp_wti_oil
3490,0.044886,37.73934,,0,0
3491,0.044886,27.249258,,0,0
3492,0.044886,35.788722,,0,0
3493,0.044886,21.91317,,0,0
3494,0.044886,31.443194,,0,0


In [53]:
# Define new rules
new_df['pred3_manual_gdp_fastd'] = ((new_df['gdppot_us_yoy'] <= 0.027) & (new_df['fastd'] >= 0.251)).astype(int)
new_df['pred4_manual_gdp_wti_oil'] = ((new_df['gdppot_us_yoy'] >= 0.027) & (new_df['growth_wti_oil_30d'] <= 1.005)).astype(int)

In [54]:
from sklearn.metrics import precision_score

# Filter the TEST dataset
test_filter = new_df['split'] == 'test'

# Calculate precision for pred3_manual_gdp_fastd
precision_pred3 = precision_score(new_df[test_filter]['is_positive_growth_5d_future'], new_df[test_filter]['pred3_manual_gdp_fastd'])

# Calculate precision for pred4_manual_gdp_wti_oil
precision_pred4 = precision_score(new_df[test_filter]['is_positive_growth_5d_future'], new_df[test_filter]['pred4_manual_gdp_wti_oil'])

print(f'Precision for pred3_manual_gdp_fastd: {precision_pred3:.3f}')
print(f'Precision for pred4_manual_gdp_wti_oil: {precision_pred4:.3f}')

Precision for pred3_manual_gdp_fastd: 0.555
Precision for pred4_manual_gdp_wti_oil: 0.000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
best_precision = max(precision_pred3, precision_pred4)
print(f'Best precision score: {best_precision:.3f}')

Best precision score: 0.555


In [56]:
new_df[['cci','growth_1d','growth_snp500_1d','pred0_manual_cci','pred1_manual_prev_g1','pred2_manual_prev_g1_and_snp','is_positive_growth_5d_future']]

Unnamed: 0,cci,growth_1d,growth_snp500_1d,pred0_manual_cci,pred1_manual_prev_g1,pred2_manual_prev_g1_and_snp,is_positive_growth_5d_future
3490,26.847237,0.998394,0.990451,0,0,0,0
3491,-34.319663,0.966220,0.961655,0,0,0,0
3492,-97.318008,1.010544,1.001922,0,1,1,0
3493,-169.947507,0.966502,1.000956,0,0,0,0
3494,-142.142685,1.013068,1.027090,0,1,1,1
...,...,...,...,...,...,...,...
5422,-29.424989,0.988994,0.984269,0,0,0,0
5423,-26.657181,1.001447,1.009128,0,1,1,0
5424,-123.785473,0.972302,1.012557,0,0,0,0
5425,-181.986224,0.989571,1.010326,0,0,0,0


In [57]:
PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
PREDICTIONS

['pred0_manual_cci',
 'pred1_manual_prev_g1',
 'pred2_manual_prev_g1_and_snp',
 'pred3_manual_gdp_fastd',
 'pred4_manual_gdp_wti_oil']

In [58]:
p = PREDICTIONS[0]
part1 = p.split('_')[0] # first prefix before '_'
print(f'Full column name: {p}, only first part: {part1}')

Full column name: pred0_manual_cci, only first part: pred0


In [59]:
# One prediction: do we predict correctly?
new_df['is_correct_prediction'] = (new_df.pred0_manual_cci == new_df.is_positive_growth_5d_future)

In [60]:
new_df[['cci','pred0_manual_cci','is_positive_growth_5d_future','is_correct_prediction']]

Unnamed: 0,cci,pred0_manual_cci,is_positive_growth_5d_future,is_correct_prediction
3490,26.847237,0,0,True
3491,-34.319663,0,0,True
3492,-97.318008,0,0,True
3493,-169.947507,0,0,True
3494,-142.142685,0,1,False
...,...,...,...,...
5422,-29.424989,0,0,True
5423,-26.657181,0,0,True
5424,-123.785473,0,0,True
5425,-181.986224,0,0,True


In [61]:
# check "Precision" : the percentage of "correct" predictions , WHEN we predict "1" (POSITIVE future growth)
filter = (new_df.split=='test') & (new_df.pred0_manual_cci==1)
new_df[filter].is_correct_prediction.value_counts()


is_correct_prediction
True     455
False    344
Name: count, dtype: int64

In [62]:
# %% of correct predictions : 54%
new_df[filter].is_correct_prediction.value_counts() / len(new_df[filter])

is_correct_prediction
True     0.569462
False    0.430538
Name: count, dtype: float64

In [63]:
# delete this column
del new_df["is_correct_prediction"]

In [64]:
# generate columns is_correct_
for pred in PREDICTIONS:
  part1 = pred.split('_')[0] # first prefix before '_'
  new_df[f'is_correct_{part1}'] =  (new_df[pred] == new_df.is_positive_growth_5d_future).astype(int)

In [65]:
# IS_CORRECT dataset
IS_CORRECT =  [k for k in new_df.keys() if k.startswith('is_correct_')]
IS_CORRECT

['is_correct_pred0',
 'is_correct_pred1',
 'is_correct_pred2',
 'is_correct_pred3',
 'is_correct_pred4']

In [66]:
new_df[PREDICTIONS+IS_CORRECT+['is_positive_growth_5d_future']]

Unnamed: 0,pred0_manual_cci,pred1_manual_prev_g1,pred2_manual_prev_g1_and_snp,pred3_manual_gdp_fastd,pred4_manual_gdp_wti_oil,is_correct_pred0,is_correct_pred1,is_correct_pred2,is_correct_pred3,is_correct_pred4,is_positive_growth_5d_future
3490,0,0,0,0,0,1,1,1,1,1,0
3491,0,0,0,0,0,1,1,1,1,1,0
3492,0,1,1,0,0,1,0,0,1,1,0
3493,0,0,0,0,0,1,1,1,1,1,0
3494,0,1,1,0,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
5422,0,0,0,1,0,1,1,1,0,1,0
5423,0,1,1,1,0,1,0,0,0,1,0
5424,0,0,0,1,0,1,1,1,0,1,0
5425,0,0,0,1,0,1,1,1,0,1,0


In [67]:
len(new_df[new_df.split=='test'])

29829

In [68]:
# define "Precision" for ALL predictions on a Test dataset (~4 last years of trading)
for i,column in enumerate(IS_CORRECT):
  prediction_column = PREDICTIONS[i]
  is_correct_column = column
  filter = (new_df.split=='test') & (new_df[prediction_column]==1)
  print(f'Prediction column:{prediction_column} , is_correct_column: {is_correct_column}')
  print(new_df[filter][is_correct_column].value_counts())
  print(new_df[filter][is_correct_column].value_counts()/len(new_df[filter]))

  print('---------')

Prediction column:pred0_manual_cci , is_correct_column: is_correct_pred0
is_correct_pred0
1    455
0    344
Name: count, dtype: int64
is_correct_pred0
1    0.569462
0    0.430538
Name: count, dtype: float64
---------
Prediction column:pred1_manual_prev_g1 , is_correct_column: is_correct_pred1
is_correct_pred1
1    8621
0    6980
Name: count, dtype: int64
is_correct_pred1
1    0.552593
0    0.447407
Name: count, dtype: float64
---------
Prediction column:pred2_manual_prev_g1_and_snp , is_correct_column: is_correct_pred2
is_correct_pred2
1    5726
0    4729
Name: count, dtype: int64
is_correct_pred2
1    0.547681
0    0.452319
Name: count, dtype: float64
---------
Prediction column:pred3_manual_gdp_fastd , is_correct_column: is_correct_pred3
is_correct_pred3
1    16560
0    13262
Name: count, dtype: int64
is_correct_pred3
1    0.555295
0    0.444705
Name: count, dtype: float64
---------
Prediction column:pred4_manual_gdp_wti_oil , is_correct_column: is_correct_pred4
Series([], Name: coun

## 1.4) [Code Snippet 5] Binary Decision Tree

In [69]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

### 1.4.1) Define dataframes AND perform data cleaning
* define X_train (dataframe), X_test (dataframe), y_train (series), y_test (series)
* replace +-inf. with 0
* fill NaNs with 0 (you can drop it too, but will loose a lot of data in our case
* remove 1-2% outliers (in each dimension, or only in variable to_predict :: we won't use it for a Decision Tree

In [70]:
# Decision Tree doesn't like too large and inf. values
import numpy as np

def remove_infinite_values(X):
    """
    Remove infinite values from the input array.

    Parameters:
    - X: Input array (NumPy array or array-like)

    Returns:
    - Array with infinite values removed
    """
    return X[np.isfinite(X).all(axis=1)]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_infinite_values(X)

In [71]:
# Split the data into training and testing sets based on the split date
features_list = NUMERICAL+DUMMIES
to_predict = 'is_positive_growth_5d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')


length: X_train (152846, 302),  X_test (29829, 302)


In [72]:
# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [73]:
# you may want to remove 1-2% outliers based on percentile ==> not used here in Decision Trees
def remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99):
    """
    Remove outliers from the input array based on percentiles.

    Parameters:
    - X: Input array (NumPy array or array-like)
    - lower_percentile: Lower percentile threshold (float, default=1)
    - upper_percentile: Upper percentile threshold (float, default=99)

    Returns:
    - Array with outliers removed
    """
    lower_bound = np.percentile(X, lower_percentile, axis=0)
    upper_bound = np.percentile(X, upper_percentile, axis=0)
    mask = np.logical_and(np.all(X >= lower_bound, axis=1), np.all(X <= upper_bound, axis=1))
    return X[mask]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99)

In [74]:
X_train_imputed = X_train # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)
X_test_imputed = X_test # we won't use outliers removal to save more data to test: remove_outliers_percentile(X_test)

In [75]:
# same shape
print(f'length: X_train_imputed {X_train_imputed.shape},  X_test_imputed {X_test_imputed.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [76]:
y_train = X_train_imputed[to_predict]
y_test = X_test_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_train_imputed[to_predict]
del X_test_imputed[to_predict]

### 1.4.2 Estimation of a Decision Tree model

In [78]:
# INPUTS:
# X_train_imputed : CLEAN dataFrame with only numerical features (train+validation periods)
# X_test_imputed : CLEAN dataFrame with only numerical features (test periods)

# y_train : true values for the train period
# y_test  : true values for the test period

In [119]:
# estimation/fit function (using dataframe of features X and what to predict y) --> optimising total accuracy
# max_depth is hyperParameter
def fit_decision_tree(X, y, max_depth=20):
# Initialize the Decision Tree Classifier
  clf = DecisionTreeClassifier(max_depth=max_depth)

  # Fit the classifier to the training data
  clf.fit(X, y)
  return clf, X.columns

In [120]:
%%time
# drop 2 columns before fitting the tree, but we need those columns later for joins
clf_20, train_columns = fit_decision_tree(X=X_train_imputed.drop(['Date','Ticker'],axis=1),
                           y=y_train,
                           max_depth=20)

CPU times: total: 21.5 s
Wall time: 38.7 s


In [121]:
%%time
clf_10, train_columns = fit_decision_tree(X=X_train_imputed.drop(['Date','Ticker'],axis=1),
                           y=y_train,
                           max_depth=10)

CPU times: total: 12 s
Wall time: 21.1 s


In [122]:
# TODO 3: TRAIN only on train dataset, experiment with trees with depth 1..20 --> find the best one on VALID dataset
#       for the "best" tree model: find precision on the TEST set

### 1.4.3 Inference for a Decision Tree

In [123]:
def predict_decision_tree(clf:DecisionTreeClassifier, df_X:pd.DataFrame, y_true: pd.Series):
  # Predict the target variable on the test data
  y_pred = clf.predict(df_X)

  max_depth = clf.tree_.max_depth
  # Print the maximum depth
  print("Maximum depth of the decision tree:", max_depth)

  # Calculate the accuracy/precision of the model
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  print(f'Accuracy ={accuracy}, precision = {precision}')

  # resulting df
  result_df = pd.concat([df_X, y_true, pd.Series(y_pred, index=df_X.index, name='pred_')], axis=1)

  return result_df

In [124]:
pred20 = predict_decision_tree(clf_20, X_test_imputed.drop(['Date','Ticker'],axis=1), y_test)

Maximum depth of the decision tree: 20
Accuracy =0.5217070635958295, precision = 0.5642649036306588


In [125]:
# Predictions of a decision tree of depth "20"
pred20.pred_.value_counts()

pred_
1    17848
0    11981
Name: count, dtype: int64

In [126]:
pred10 = predict_decision_tree(clf_10, X_test_imputed.drop(['Date','Ticker'],axis=1), y_test)

Maximum depth of the decision tree: 10
Accuracy =0.5567736095745751, precision = 0.5651936289818863


In [127]:
pred10.tail()

Unnamed: 0,growth_1d,growth_3d,growth_7d,growth_30d,growth_90d,growth_365d,growth_dax_1d,growth_dax_3d,growth_dax_7d,growth_dax_30d,...,October_w3,October_w4,October_w5,September_w1,September_w2,September_w3,September_w4,September_w5,is_positive_growth_5d_future,pred_
5422,0.988994,0.98412,1.021587,1.015755,1.046955,1.793402,0.989726,1.000831,1.010983,0.99975,...,0,0,0,0,0,0,0,0,0,1
5423,1.001447,0.998419,0.996346,0.997589,1.031967,1.790292,0.998011,0.985435,1.001999,0.997982,...,0,0,0,0,0,0,0,0,0,1
5424,0.972302,0.962991,0.969433,0.989441,1.002348,1.738504,1.005873,0.993558,0.992499,1.000784,...,0,0,0,0,0,0,0,0,0,1
5425,0.989571,0.963553,0.952804,0.972509,0.989811,1.732523,1.009644,1.013553,1.004783,1.008886,...,0,0,0,0,0,0,0,0,0,1
5426,0.989735,0.952285,0.938518,0.978882,1.002706,1.737153,1.014021,1.029813,1.028619,1.013796,...,0,0,0,0,0,0,0,0,0,1


In [128]:
X_test_imputed.join(pred10['pred_']).head()

Unnamed: 0,growth_1d,growth_3d,growth_7d,growth_30d,growth_90d,growth_365d,growth_dax_1d,growth_dax_3d,growth_dax_7d,growth_dax_30d,...,October_w4,October_w5,September_w1,September_w2,September_w3,September_w4,September_w5,Date,Ticker,pred_
0,0.0,0.0,0.0,0.0,0.0,0.0,1.015866,1.032483,1.037417,0.981965,...,0,0,0,0,0,0,0,2022-05-17,LICI.NS,1
1,1.001257,0.0,0.0,0.0,0.0,0.0,0.98744,0.998562,1.046865,0.964844,...,0,0,0,0,0,0,0,2022-05-18,LICI.NS,1
2,0.959491,0.0,0.0,0.0,0.0,0.0,0.991044,0.994122,1.025679,0.96242,...,0,0,0,0,0,0,0,2022-05-19,LICI.NS,0
3,0.982518,0.943902,0.0,0.0,0.0,0.0,1.007175,0.985617,1.011084,0.988003,...,0,0,0,0,0,0,0,2022-05-20,LICI.NS,1
4,0.988743,0.932105,0.0,0.0,0.0,0.0,1.013839,1.011968,1.031716,1.006908,...,0,0,0,0,0,0,0,2022-05-23,LICI.NS,1


In [129]:
# Predictions of a decision tree of depth "10" : many more "positive" predictions
pred10.pred_.value_counts()

pred_
1    25616
0     4213
Name: count, dtype: int64

In [130]:
# define a new DF with the SAME index (used for joins)
pred20_df = pred20[['pred_']].rename(columns={'pred_': 'pred_tree_clf20'})
pred20_df.head(1)

Unnamed: 0,pred_tree_clf20
8697,0


In [131]:
# define a new DF with the SAME index (used for joins)
pred10_df = pred10[['pred_']].rename(columns={'pred_': 'pred_tree_clf10'})
pred10_df.head(1)

Unnamed: 0,pred_tree_clf10
8697,1


### 1.4.4 Features Importance and Tree Visualisation of top levels (for clf10)

In [96]:
# Feautures importance function to predict future returns (based on the classifier)
# get feature importance from 'clf' (classifier) and 'train_columns' (column names)

def get_importances(clf, train_columns):
  # Assuming clf is your trained DecisionTreeClassifier
  feature_importance = clf.feature_importances_

  # Assuming X_train is your training features
  feature_names = train_columns

  # Create a DataFrame to store feature importance
  feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

  # Sort the DataFrame by importance in descending order
  feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

  # Print or display the feature importance DataFrame
  # print(feature_importance_df)
  return feature_importance_df

In [132]:
get_importances(clf_10, train_columns).head(10)

Unnamed: 0,Feature,Importance
41,growth_wti_oil_365d,0.059356
35,growth_gold_365d,0.04969
178,cpi_core_yoy,0.037471
183,DGS10,0.03606
22,growth_dji_90d,0.035342
17,growth_snp500_365d,0.034703
39,growth_wti_oil_30d,0.025679
177,gdppot_us_qoq,0.025175
33,growth_gold_30d,0.025114
292,October_w4,0.02388


In [133]:
get_importances(clf_20, train_columns).head(10)

Unnamed: 0,Feature,Importance
35,growth_gold_365d,0.021256
5,growth_365d,0.020509
97,natr,0.019369
41,growth_wti_oil_365d,0.018171
95,obv,0.016472
39,growth_wti_oil_30d,0.016265
98,ht_dcperiod,0.016137
55,adxr,0.01595
178,cpi_core_yoy,0.015799
90,trix,0.015667


### 1.4.5 Merge with the original df for predictions (only when predicted on test dataset)

In [134]:
# current predictions from MANUAL
PREDICTIONS

['pred0_manual_cci',
 'pred1_manual_prev_g1',
 'pred2_manual_prev_g1_and_snp',
 'pred3_manual_gdp_fastd',
 'pred4_manual_gdp_wti_oil']

In [135]:
new_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,pred0_manual_cci,pred1_manual_prev_g1,pred2_manual_prev_g1_and_snp,pred3_manual_gdp_fastd,pred4_manual_gdp_wti_oil,is_correct_pred0,is_correct_pred1,is_correct_pred2,is_correct_pred3,is_correct_pred4
3490,58.6875,59.3125,56.0,58.28125,36.065567,53228400.0,MSFT,2000,January,0,...,0,0,0,0,0,1,1,1,1,1
3491,56.78125,58.5625,56.125,56.3125,34.847271,54119000.0,MSFT,2000,January,1,...,0,0,0,0,0,1,1,1,1,1
3492,55.5625,58.1875,54.6875,56.90625,35.214706,64059600.0,MSFT,2000,January,2,...,0,1,1,0,0,1,0,0,1,1
3493,56.09375,56.9375,54.1875,55.0,34.035072,54976600.0,MSFT,2000,January,3,...,0,0,0,0,0,1,1,1,1,1
3494,54.3125,56.125,53.65625,55.71875,34.479843,62013600.0,MSFT,2000,January,4,...,0,1,1,0,0,0,1,1,0,0


In [136]:
# index in df is not unique
np.sort(new_df.groupby(new_df.index).split.count())

array([ 1,  1,  1, ..., 27, 27, 27], dtype=int64)

In [137]:
# it's hard to join with pred10_df - as index is totally different
pred10_df.head()

Unnamed: 0,pred_tree_clf10
8697,1
8698,1
8699,1
8700,1
8701,1


Question 3 (1 point): Unique correct predictions from a 10-levels deep Decision Tree Classifier (pred5_clf_10)

What is the total number of records in the TEST dataset when the new prediction pred5_clf_10 is better than all 'hand' rules (pred0..pred4)?

NOTE: please include random_state=42 to Decision Tree Classifier init function (line clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)) to ensure everyone gets the same results.

Suggested solution:

    Step1: Rewrite the '1.4.3 Inference for a decision tree' piece for the Decision Tree Classifier with max_depth=10 (clf_10), so that you fit the model on TRAIN+VALIDATION sets (unchanged from the lecture), but predict on the whole set X_all (to be able to define a new column 'pred5_clf_10' in the dataframe new_df). Here is the link with explanation. It will solve the problem in 1.4.5 when predictions were made only for Test dataset and couldn't be easily joined with the full dataset.

    Step2: Once you have it, define a new column 'only_pred5_is_correct' similar to 'hand' prediction rules with several conditions: is_positive_growth_5d_future AND is_correct_pred5 should be equal 1, while all other predictions is_correct_pred0..is_correct_pred4 should be equal to 0.

    Step3: Convert 'only_pred5_is_correct' column from bool to int, and find how many times it is equal to 1 in the TEST set. Write down this as an answer.

ADVANCED: define a function that can be applied to the whole row of predictions (a few examples of pandas-apply-row-functions) and can find whether some prediction 'predX' (where X is one of the predictions) is uniquely correct. It should work even if there are 100 predictions available, so that you don't define manually the condition for 'predX'.

In [139]:
# Make predictions on the TEST dataset using clf_10
test_predictions = clf_10.predict(X_test_imputed.drop(['Date', 'Ticker'], axis=1))
pred_series = pd.Series(test_predictions, dtype=int)

# Create a mask for the TEST dataset
test_mask = new_df["split"] == "test"

# Add the new predictions to new_df
new_df.loc[test_mask, "pred5_clf_10"] = pred_series

# Extract columns for pred0 to pred4 for comparison
pred0_to_4 = new_df.loc[test_mask, new_df.columns.str.contains(r"^pred[0-4]")]

# Identify where all 'hand' rules fail
manual_all_fail = (pred0_to_4.values != new_df.loc[test_mask, to_predict].values[:, np.newaxis]).all(axis=1)

# Identify where pred5_clf_10 is correct
pred5_correct = new_df.loc[test_mask, to_predict].values == new_df.loc[test_mask, "pred5_clf_10"].values

# Count the records where all 'hand' rules fail but pred5_clf_10 is correct
better_pred5_count = (manual_all_fail & pred5_correct).sum()

print(f'Total number of records where pred5_clf_10 is better than all "hand" rules: {better_pred5_count}')


Total number of records where pred5_clf_10 is better than all "hand" rules: 1


In [134]:
# TODO 4: JOIN predictions with the original dataframe (define a new column):
#  so, that there are columns pred_tree_clf10 AND pred_tree_clf20

Question 4: (2 points) Hyperparameter tuning for a Decision Tree

What is the optimal tree depth (from 1 to 20) for a DecisionTreeClassifier?

NOTE: please include random_state=42 to Decision Tree Classifier init function (line clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)) to ensure consistency in results.

Follow these steps to find the optimal max_depth:

    Iterate through max_depth values from 1 to 20.
    Train the Decision Tree Classifier with the current max_depth parameter.
    Optionally, visualize how the 'head' of each fitted tree changes with more advanced (=deep) trees. You can use the sklearn.tree.plot_tree() function, or the compact way with the export_text() functionality (Stack Overflow example):

    from sklearn.tree import export_text
    tree_rules = export_text(model, feature_names=list(X_train), max_depth=3)
    print(tree_rules)

    Calculate the precision score (you can use the function sklearn.metrics.precision_score()) on the TEST dataset for each of the fitted trees. You can also compare it with the precision score on a VALIDATION dataset, which is included to the training phase (to have more data to train on). You should see that the precision score on a VALIDATION set starts to grow with the complexity of a tree (overfit), which isn't seen on the precision score on TEST.
    Identify the optimal max_depth, where the precision score is the highest on the TEST dataset. Record this value as best_max_depth and submit as an answer.
    Make predictions on all records (TRAIN+VALIDATION+TEST) and add the new prediction pred6_clf_best to the dataframe new_df.

Additionally, compare the precision score of the tuned decision tree with previous predictions. You should observe an improvement (>0.58, or more than 58% precision), indicating that the tuned tree outperforms previous manual "hand" rules and Decision Tree predictions.

ADVANCED: Read more about different aspects of scikit-learn Decision Trees. Draw a line of precision/accuracy vs. max_depth and note whether there's a saturation point of precision/accuracy as max_depth increases. In theory, there should be a trade-off between better fitting (=more complex trees) and generalization.

In [147]:
# Extract only numerical features and the target variable for training and testing sets
X_train = new_df.loc[new_df["split"].isin(['train','validation']), features_list]
y_train = new_df.loc[new_df["split"].isin(['train','validation']), to_predict]
X_test = new_df.loc[new_df["split"] == "test", features_list]
y_test = new_df.loc[new_df["split"] == "test", to_predict]

# Handling infinite values and NaNs
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Initialize arrays to store accuracies and precisions for different max_depth values
accuracy = np.zeros((20, ))
precision = np.zeros((20, ))

# Iterate through max_depth values from 1 to 20
for i, max_depth in enumerate(range(1, 21)):
    print(f"Evaluating max_depth={max_depth}")
    
    # Train the Decision Tree Classifier with the current max_depth parameter
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)
    
    # Make predictions on the test dataset
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy and precision scores
    accuracy[i] = accuracy_score(y_test, y_pred)
    precision[i] = precision_score(y_test, y_pred)
    
    # Print accuracy and precision scores for analysis
    print(f"Accuracy: {accuracy[i]:.3f}\t Precision: {precision[i]:.3f}")

# Return accuracies, precisions, and the optimal max_depth
accuracy, precision, range(1, 21)[precision.argmax()]


Evaluating max_depth=1
Accuracy: 0.555	 Precision: 0.555
Evaluating max_depth=2
Accuracy: 0.555	 Precision: 0.555
Evaluating max_depth=3
Accuracy: 0.555	 Precision: 0.555
Evaluating max_depth=4
Accuracy: 0.555	 Precision: 0.555
Evaluating max_depth=5
Accuracy: 0.556	 Precision: 0.555
Evaluating max_depth=6
Accuracy: 0.568	 Precision: 0.571
Evaluating max_depth=7
Accuracy: 0.565	 Precision: 0.568
Evaluating max_depth=8
Accuracy: 0.565	 Precision: 0.568
Evaluating max_depth=9
Accuracy: 0.567	 Precision: 0.570
Evaluating max_depth=10
Accuracy: 0.557	 Precision: 0.565
Evaluating max_depth=11
Accuracy: 0.552	 Precision: 0.567
Evaluating max_depth=12
Accuracy: 0.547	 Precision: 0.572
Evaluating max_depth=13
Accuracy: 0.549	 Precision: 0.571
Evaluating max_depth=14
Accuracy: 0.547	 Precision: 0.584
Evaluating max_depth=15
Accuracy: 0.560	 Precision: 0.586
Evaluating max_depth=16
Accuracy: 0.540	 Precision: 0.572
Evaluating max_depth=17
Accuracy: 0.537	 Precision: 0.577
Evaluating max_depth=18

(array([0.55519796, 0.55536558, 0.55519796, 0.55526501, 0.55560025,
        0.56776962, 0.56502062, 0.56502062, 0.56699856, 0.5570418 ,
        0.5516779 , 0.54658219, 0.54859365, 0.54705153, 0.55972376,
        0.54024607, 0.5368601 , 0.53169734, 0.52650106, 0.52797613]),
 array([0.55519796, 0.55529475, 0.55519796, 0.55523519, 0.55546233,
        0.57117355, 0.56839335, 0.56839335, 0.56999117, 0.56527843,
        0.56703676, 0.57185459, 0.57073015, 0.5844969 , 0.58644341,
        0.5720942 , 0.57668677, 0.56821053, 0.56379247, 0.57304363]),
 15)

[EXPLORATORY] Question 5: What data is missing?

Now that you have some insights from the correlation analysis and the Decision Trees regarding the most influential variables, suggest new indicators you would like to include in the dataset and explain why.

You can also propose something entirely different based on your intuition, but it should be relevant to the shared dataset of the largest Indian, EU, and US stocks. If you choose this approach, please specify the data source as well.