# **Models for forecasting tax avoidance rates**

* **Decision Tree (DT)**
* **Random Forest (RF)**
* **Elastic Net (EN)**

Student: Thuan Tran

# **1. Dependencies loading**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import joblib
import os

from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer, r2_score


# Raise pandas' display limits so wide frames (up to 500 columns) and long
# frames (up to 150 rows) are rendered without truncation in cell output.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 150

# **2. Data loading**

In [None]:
# Remote inputs: the training data (CSV) and the feature-ranking workbook (Excel).
train_url = "https://raw.githubusercontent.com/michaelwozniak/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/main/data/output/train_fe.csv"
features_url = "https://raw.githubusercontent.com/michaelwozniak/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/main/data/output/feature_ranking.xlsx"

# In both files the first column is used as the index.
fr = pd.read_excel(features_url, index_col=0)
df = pd.read_csv(train_url, index_col=0)

print(f"Data shape: {df.shape}")
# Last expression of the cell: shows the first rows as the cell output.
df.head()


Data shape: (3993, 115)


Unnamed: 0,Ticker,Nazwa2,rok,ta,txt,pi,str,xrd,ni,ppent,intant,dlc,dltt,capex,revenue,cce,adv,etr,diff,roa,lev,intan,rd,ppe,sale,cash_holdings,adv_expenditure,capex2,cfc,dta,capex2_scaled,y_v2x_polyarchy,y_e_p_polity,y_BR_Democracy,WB_GDPgrowth,WB_GDPpc,WB_Inflation,rr_per_country,rr_per_sector,sektor_consumer discretionary,sektor_consumer staples,sektor_energy,sektor_health care,sektor_industrials,sektor_materials,sektor_real estate,sektor_technology,sektor_utilities,gielda_2,gielda_3,gielda_4,gielda_5,ta_log,"txt_cat_(-63.011, -34.811]","txt_cat_(-34.811, 0.488]","txt_cat_(0.488, 24.415]","txt_cat_(24.415, 25.05]","txt_cat_(25.05, 308.55]","txt_cat_(308.55, 327.531]","txt_cat_(327.531, inf]","pi_cat_(-8975.0, -1.523]","pi_cat_(-1.523, 157.119]","pi_cat_(157.119, 465.9]","pi_cat_(465.9, 7875.5]","pi_cat_(7875.5, 8108.5]","pi_cat_(8108.5, inf]","str_cat_(0.0875, 0.192]","str_cat_(0.192, 0.28]","str_cat_(0.28, inf]",xrd_exists,ni_profit,ni_profit_20000,ppent_sqrt,intant_sqrt,"dlc_cat_(42.262, 176.129]","dlc_cat_(176.129, 200.9]","dlc_cat_(200.9, inf]","dltt_cat_(39.38, 327.85]","dltt_cat_(327.85, 876.617]","dltt_cat_(876.617, inf]","capex_cat_(7.447, 79.55]","capex_cat_(79.55, 5451.0]","capex_cat_(5451.0, inf]","revenue_cat_(0.174, 1248.817]","revenue_cat_(1248.817, 4233.587]","revenue_cat_(4233.587, inf]","cce_cat_(5.619, 63.321]","cce_cat_(63.321, inf]","adv_cat_(0.3, 874.5]","adv_cat_(874.5, inf]",diff_positive,roa_clip,lev_sqrt,intan_pow2,rd_sqrt,ppe_clip,cash_holdings_sqrt,adv_expenditure_positive,diff_dta,cfc_dta,etr_y_past,etr_y_ma,diff_ma,roa_ma,lev_ma,intan_ma,ppe_ma,sale_ma,cash_holdings_ma,roa_past,lev_past,intan_past,ppe_past,sale_past,cash_holdings_past
0,11B PW Equity,11 bit studios SA,2005,21.127613,1.24185,6.329725,0.19,0.0,5.0879,0.276275,4.1959,0.0,0.0,2.223413,11.873301,12.142975,0.0,0.196193,-0.006193,0.240818,0.0,0.198598,0.0,0.013076,0.445954,0.574744,0.0,8.047824,0,0,0.007469,0.877,10,1,3.493668,8021.003655,2.183799,0.334868,0.03001,0,0,0,0,0,0,0,0,0,0,0,0,0,3.050581,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0.525619,2.04839,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0.240818,0.0,0.039441,0.0,0.013076,0.758119,0,-0.0,0,0.196193,0.196193,-0.006193,0.240818,0.0,0.198598,0.013076,0.445954,0.574744,0.240818,0.0,0.198598,0.013076,0.445954,0.574744
1,11B PW Equity,11 bit studios SA,2006,21.127613,1.24185,6.329725,0.19,0.0,5.0879,0.276275,4.1959,0.0,0.0,2.223413,11.873301,12.142975,0.0,0.196193,-0.006193,0.240818,0.0,0.198598,0.0,0.013076,0.445954,0.574744,0.0,8.047824,0,0,0.007469,0.884,10,1,6.179641,9038.730847,1.284694,0.421396,0.136194,0,0,0,0,0,0,0,0,0,0,0,0,0,3.050581,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0.525619,2.04839,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0.240818,0.0,0.039441,0.0,0.013076,0.758119,0,-0.0,0,0.196193,0.196193,-0.006193,0.240818,0.0,0.198598,0.013076,0.445954,0.574744,0.240818,0.0,0.198598,0.013076,0.445954,0.574744
2,11B PW Equity,11 bit studios SA,2007,21.127613,1.24185,6.329725,0.19,0.0,5.0879,0.276275,4.1959,0.0,0.0,2.223413,11.873301,12.142975,0.0,0.196193,-0.006193,0.240818,0.0,0.198598,0.0,0.013076,0.445954,0.574744,0.0,8.047824,0,0,0.007469,0.887,10,1,7.034828,11255.4367,2.458743,0.10422,0.035493,0,0,0,0,0,0,0,0,0,0,0,0,0,3.050581,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0.525619,2.04839,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0.240818,0.0,0.039441,0.0,0.013076,0.758119,0,-0.0,0,0.196193,0.196193,-0.006193,0.240818,0.0,0.198598,0.013076,0.445954,0.574744,0.240818,0.0,0.198598,0.013076,0.445954,0.574744
3,11B PW Equity,11 bit studios SA,2008,21.127613,1.24185,6.329725,0.19,0.0,5.0879,0.276275,4.1959,0.0,0.0,2.223413,11.873301,12.142975,0.0,0.196193,-0.006193,0.240818,0.0,0.198598,0.0,0.013076,0.445954,0.574744,0.0,8.047824,0,0,0.007469,0.869,10,1,4.249609,14001.44688,4.164972,-0.511178,-0.384695,0,0,0,0,0,0,0,0,0,0,0,0,0,3.050581,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0.525619,2.04839,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0.240818,0.0,0.039441,0.0,0.013076,0.758119,0,-0.0,0,0.196193,0.196193,-0.006193,0.240818,0.0,0.198598,0.013076,0.445954,0.574744,0.240818,0.0,0.198598,0.013076,0.445954,0.574744
4,11B PW Equity,11 bit studios SA,2009,21.127613,1.24185,6.329725,0.19,0.0,5.0879,0.276275,4.1959,0.0,0.0,2.223413,11.873301,12.142975,0.0,0.188487,-0.006193,0.240818,0.0,0.198598,0.0,0.013076,0.445954,0.574744,0.0,8.047824,0,0,0.007469,0.878,10,1,2.82026,11527.59323,3.795392,0.429716,0.234897,0,0,0,0,0,0,0,0,0,0,0,0,0,3.050581,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0.525619,2.04839,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0.240818,0.0,0.039441,0.0,0.013076,0.758119,0,-0.0,0,0.196193,0.196193,-0.006193,0.240818,0.0,0.198598,0.013076,0.445954,0.574744,0.240818,0.0,0.198598,0.013076,0.445954,0.574744


# **3. Feature engineering**

In [None]:
# Remove the identifier columns; errors="ignore" makes the drop a no-op for
# any of them that are already absent (e.g. when the cell is run again).
df = df.drop(["Ticker", "Nazwa2", "rok"], axis=1, errors="ignore")

# Separate the predictors from the target column "etr".
X = df.drop(columns=["etr"])
y = df["etr"]

print(f"Target variable: {y.name}")
print(f"Features shape: {X.shape}")


Target variable: etr
Features shape: (3993, 111)


We use the top 20 variables from the mutual-information ranking.

In [None]:
# Order the ranking table by mutual-information score, best first.
fr = fr.sort_values(by="mi_score", ascending=False)

# The ranking's index holds the feature names: keep the first 20 and restrict
# the design matrix to exactly those columns (in ranking order).
# NOTE(review): confirm that every selected column would be known at prediction
# time; same-period quantities related to the target would leak information.
top_features = fr.index[:20].tolist()
X = X.loc[:, top_features]

print("Selected top 20 features:")
print(top_features)

Selected top 20 features:
['etr_y_past', 'etr_y_ma', 'txt', 'diff', 'ni', 'pi', 'intant', 'intant_sqrt', 'ta', 'revenue', 'roa', 'roa_clip', 'diff_ma', 'capex', 'dlc', 'ta_log', 'cce', 'intan_past', 'dltt', 'sale']


# **4. Cross-validation setup**

In [None]:
# Shuffled, seeded 5-fold splitter: any search that receives this object is
# evaluated on the same reproducible folds.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
# R² wrapped as a scorer object (equivalent to passing scoring="r2").
scorer = make_scorer(r2_score)

# **5. Decision Tree**

In [None]:
# Tune a single regression tree with an exhaustive grid search.
dt = DecisionTreeRegressor(random_state=42)

# 5 x 3 x 3 = 45 candidate settings for tree depth and node-size constraints.
param_dt = dict(
    max_depth=[3, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_dt,
    scoring="r2",
    cv=kf,                    # shared 5-fold splitter
    n_jobs=-1,                # use all available cores
    return_train_score=True,  # keep train scores in cv_results_ for diagnostics
)
grid_dt.fit(X, y)

print(f"Best Decision Tree params: {grid_dt.best_params_}")
print(f"Best CV R2: {grid_dt.best_score_}")

Best Decision Tree params: {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best CV R2: 0.13716386710701264


The grid search completed without errors. The best tree is shallow (max_depth = 3) and reaches a cross-validated R² of about 0.14.

In [None]:
# Persist the best Decision Tree found by the grid search.
os.makedirs("../models", exist_ok=True)
best_dt = grid_dt.best_estimator_
# A context manager guarantees the file is flushed and closed even if pickling
# fails; passing a bare open(...) to pickle.dump leaves the handle open.
with open("../models/decision_tree_best.pkl", "wb") as model_file:
    pickle.dump(best_dt, model_file)
print("Decision Tree model saved successfully!")

Decision Tree model saved successfully!


# **6. Random Forest**


In [None]:
# Tune a Random Forest regressor with an exhaustive grid search.
rf = RandomForestRegressor(random_state=42)

# 2 x 3 x 2 x 2 = 24 candidate settings.
param_rf = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Evaluate on the shared `kf` splitter (shuffled, seeded 5-fold) instead of an
# unshuffled integer `cv`, so that this search is scored on the same folds as
# the other models and the CV R2 values are directly comparable.
# NOTE: scores printed by earlier runs that used a different cv will differ.
grid_rf = GridSearchCV(
    rf, param_rf, cv=kf, scoring="r2", n_jobs=-1, return_train_score=True
)
grid_rf.fit(X, y)

print("Best Random Forest params:", grid_rf.best_params_)
print("Best CV R2:", grid_rf.best_score_)


Best Random Forest params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best CV R2: 0.15517360937526217


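The Random Forest is trained and tuned. In the run above its cross-validated R² (about 0.16) is higher than the Decision Tree's (about 0.14), although it is still modest in absolute terms.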

In [None]:
# Persist the best Random Forest found by the grid search.
best_rf = grid_rf.best_estimator_
os.makedirs("../models", exist_ok=True)
# A context manager guarantees the file is flushed and closed even if pickling
# fails; passing a bare open(...) to pickle.dump leaves the handle open.
with open("../models/random_forest_best.pkl", "wb") as model_file:
    pickle.dump(best_rf, model_file)
print("Random Forest model saved successfully!")


Random Forest model saved successfully!


# **7. Elastic Net**

In [None]:
# Standardisation is a pipeline step, so during the search the scaler is fitted
# together with the model on each training split rather than on all of X.
pipe_en = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("en", ElasticNet(max_iter=10000, random_state=42)),
    ]
)

# 4 x 3 = 12 candidates; the "en__" prefix routes each parameter to the
# pipeline step named "en".
param_en = dict(
    en__alpha=[0.01, 0.1, 1, 10],
    en__l1_ratio=[0.1, 0.5, 0.9],
)

grid_en = GridSearchCV(
    estimator=pipe_en,
    param_grid=param_en,
    scoring="r2",
    cv=kf,                    # shared 5-fold splitter
    n_jobs=-1,                # use all available cores
    return_train_score=True,  # keep train scores in cv_results_ for diagnostics
)
grid_en.fit(X, y)

print(f"Best Elastic Net params: {grid_en.best_params_}")
print(f"Best CV R2: {grid_en.best_score_}")

Best Elastic Net params: {'en__alpha': 0.1, 'en__l1_ratio': 0.1}
Best CV R2: 0.1253211997032629


The Elastic Net grid search also completed without errors; its best cross-validated R² is about 0.13.

In [None]:
# Persist the best Elastic Net pipeline (scaler + model) found by the grid search.
best_en = grid_en.best_estimator_
os.makedirs("../models", exist_ok=True)
# A context manager guarantees the file is flushed and closed even if pickling
# fails; passing a bare open(...) to pickle.dump leaves the handle open.
with open("../models/elasticnet_best.pkl", "wb") as model_file:
    pickle.dump(best_en, model_file)

print("Elastic Net model saved successfully!")


Elastic Net model saved successfully!


All models are now trained and tuned, so we can compare their performance.

# **8. Models Comparison**


In [None]:
# Tuned estimators to compare, keyed by display name.
models = {
    "Decision Tree": best_dt,
    "Random Forest": best_rf,
    "Elastic Net": best_en
}

# Mean held-out R2 of each winning configuration, as recorded by its grid
# search. Unlike the in-sample metrics below, these are not inflated by
# scoring a model on the rows it was fitted on.
cv_r2 = {
    "Decision Tree": grid_dt.best_score_,
    "Random Forest": grid_rf.best_score_,
    "Elastic Net": grid_en.best_score_,
}

results = []
for name, model in models.items():
    # "R2" and "RMSE" are IN-SAMPLE: predictions are made on the same X the
    # estimators were trained on, so they are optimistic (most of all for
    # flexible models such as the Random Forest).
    preds = model.predict(X)
    r2 = r2_score(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    results.append({"Model": name, "R2": r2, "RMSE": rmse, "CV R2": cv_r2[name]})

# Last expression of the cell: renders the comparison table.
pd.DataFrame(results)


Unnamed: 0,Model,R2,RMSE
0,Decision Tree,0.185573,0.13893
1,Random Forest,0.317681,0.127164
2,Elastic Net,0.132821,0.143359


## **Conclusion**

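Random Forest performs best: on the training data it explains about 31.8% of the variance (R²) and has the lowest prediction error (RMSE = 0.127). These figures are in-sample, because the models are evaluated on the same data they were fitted on; the cross-validated R² of the Random Forest reported above is lower, at about 0.16.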

Decision Tree works reasonably but is clearly weaker than Random Forest.

Elastic Net gives the lowest R² and the highest RMSE, so it performs worst on this dataset.