In [9]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score


In [27]:
# Load the Boston housing data and build the modelling matrices.
# Missing values are replaced with the sentinel -1, the 'Unnamed: 0' column
# (presumably a leftover CSV index -- confirm) is discarded, and `rad` is
# expanded into indicator columns.
boston = (
    pd.read_csv('Data/Boston.csv')
    .fillna(-1)
    .drop(columns=['Unnamed: 0'])
)
boston = pd.get_dummies(boston, columns=['rad'])

target = 'medv'
y = boston[target]
X = boston.drop(columns=[target])

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [28]:
# Random forest wrapped in a single-step pipeline so the grid-search
# parameters can be addressed through the `rf__` prefix.
# random_state is fixed so the forest -- and therefore the selected
# hyper-parameters, scores and feature importances -- is reproducible when the
# notebook is re-run from a fresh kernel.
pipeline = Pipeline([
    ('rf', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42))
])

# Search space: maximum tree depth and number of features tried at each split.
params = {
    'rf__max_depth': [8, 10, 15, 20, 30, 40],
    'rf__max_features': ['log2', 'sqrt', 10, 12]
}

# Exhaustive search over `params`, 5-fold cross-validation, scored by R^2.
rf_grid = GridSearchCV(estimator=pipeline, n_jobs=-1, cv=5, scoring='r2', param_grid=params)
rf_grid.fit(X_train, y_train)

In [29]:
# Show the hyper-parameter combination that achieved the best
# cross-validated score during the grid search.
rf_grid.best_params_

{'rf__max_depth': 30, 'rf__max_features': 10}

In [30]:
# Pull the fitted forest out of the best pipeline found by the grid search;
# 'rf' is the step name the pipeline was built with.
best_model = rf_grid.best_estimator_.named_steps['rf']

In [31]:
# Predict the held-out test rows with the selected forest.
predicciones_rf = best_model.predict(X_test)

# R^2 of the predictions against the true test targets (displayed as the
# cell's output).
r2_score(y_test, predicciones_rf)

0.881985351175103

In [32]:
# Pair every training column with the importance the fitted forest assigned
# to it, then display the resulting table.
model_interpretability = pd.DataFrame(
    {
        'variables': X_train.columns,
        'importancia': best_model.feature_importances_,
    }
)
model_interpretability

Unnamed: 0,variables,importancia
0,crim,0.055681
1,zn,0.002773
2,indus,0.037423
3,chas,0.003379
4,nox,0.046034
5,rm,0.400705
6,age,0.025067
7,dis,0.049984
8,tax,0.02226
9,ptratio,0.0419
