In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline

In [4]:
df = pd.read_csv("../../data/precio/dfnorm.csv", index_col = 0)
df.head()

Unnamed: 0,edad,Valoración actual,Progresión,precio,Ataque,Regate,Aceleración,Potencia tiro,Agresividad,Defensa
0,1.8267,1.323359,-0.889897,3300000,0.002999,-2.067653,-1.428951,-2.402604,0.839337,1.32188
1,0.369777,1.323359,1.005344,3800000,0.457513,1.649663,1.313775,0.52964,-1.62619,-1.094921
2,-1.087145,1.323359,0.373597,3600000,0.457513,0.748495,0.690428,1.018347,0.441671,-0.231778
3,1.098239,1.323359,-0.25815,3500000,-0.724223,0.185265,-0.306927,1.507054,0.759804,-0.807207
4,1.8267,1.323359,-0.25815,3600000,0.366611,0.635849,-0.057588,1.311571,1.555135,-1.440179


In [5]:
df = df.select_dtypes(include=np.number)
df.head()

Unnamed: 0,edad,Valoración actual,Progresión,precio,Ataque,Regate,Aceleración,Potencia tiro,Agresividad,Defensa
0,1.8267,1.323359,-0.889897,3300000,0.002999,-2.067653,-1.428951,-2.402604,0.839337,1.32188
1,0.369777,1.323359,1.005344,3800000,0.457513,1.649663,1.313775,0.52964,-1.62619,-1.094921
2,-1.087145,1.323359,0.373597,3600000,0.457513,0.748495,0.690428,1.018347,0.441671,-0.231778
3,1.098239,1.323359,-0.25815,3500000,-0.724223,0.185265,-0.306927,1.507054,0.759804,-0.807207
4,1.8267,1.323359,-0.25815,3600000,0.366611,0.635849,-0.057588,1.311571,1.555135,-1.440179


In [6]:
df.describe().T.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
edad,394.0,-1.154181e-15,1.001271,-1.815607,-1.087145,-0.3586841,0.3697774,1.8267
Valoración actual,394.0,5.770905e-16,1.001271,-2.84274,-0.5994563,0.04148205,1.00289,1.323359
Progresión,394.0,-3.040997e-15,1.001271,-0.889897,-0.8898974,-0.2581504,0.3735966,3.532332
precio,394.0,2177030.0,926051.938525,525000.0,1500000.0,2050000.0,3100000.0,4099999.0
Ataque,394.0,2.524771e-16,1.001271,-2.724085,-0.6333205,0.184805,0.7302219,2.366473


In [7]:
y = df["precio"]
X = df.drop(columns=["precio"])

In [8]:
pipe = Pipeline(steps=[('estimator', LinearRegression())])

In [9]:
params_grid = [{
                'estimator':[LinearRegression()],
                },
                {
                'estimator': [DecisionTreeRegressor()],
                'estimator__max_depth': [4, 5, 6, 7],
                'estimator__max_features': ["sqrt", "log2"],
                'estimator__min_samples_split': [10, 50, 100], 
                'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[RandomForestRegressor()],
                 'estimator__n_estimators':[50,100,200,300],
                 'estimator__max_depth':[4, 5, 6, 7, 9], 
                 'estimator__min_samples_split':[2,3,4], 
                 'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[KNeighborsRegressor()],
                 'estimator__n_neighbors':[3,5,6,7, 8, 9, 10],
                }

              ]

In [10]:
grid = GridSearchCV(pipe, params_grid, n_jobs= -1, verbose = 3)

In [11]:
%%time
res = grid.fit(X, y)

Fitting 5 folds for each of 260 candidates, totalling 1300 fits
Wall time: 2min 11s


In [12]:
best_model = grid.best_params_
best_model

{'estimator': RandomForestRegressor(max_depth=9, min_samples_split=3),
 'estimator__max_depth': 9,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 3,
 'estimator__n_estimators': 100}

In [13]:
results = pd.DataFrame(grid.cv_results_).head()
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,param_estimator__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006198,0.000747,0.003597,0.000489,LinearRegression(),,,,,,,{'estimator': LinearRegression()},-0.836455,0.801489,0.564084,-0.156332,-5.606011,-1.046645,2.350945,181
1,0.079183,0.03522,0.012489,0.006245,DecisionTreeRegressor(),4.0,sqrt,1.0,10.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",-8.401827,-3.223535,-3.13565,-3.957871,-7.007534,-5.145283,2.15475,210
2,0.003125,0.00625,0.0,0.0,DecisionTreeRegressor(),4.0,sqrt,1.0,50.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",-9.792303,-2.397281,-2.394925,-5.080203,-21.9965,-8.332243,7.34704,242
3,0.009375,0.007655,0.00625,0.007654,DecisionTreeRegressor(),4.0,sqrt,1.0,100.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",-22.866329,-2.09454,-3.722396,-3.939307,-10.778298,-8.680174,7.695129,247
4,0.003125,0.00625,0.00625,0.007654,DecisionTreeRegressor(),4.0,sqrt,2.0,10.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",-24.910127,0.228137,-4.266424,-3.989265,-10.508782,-8.689292,8.804528,248


In [15]:
results = cross_validate(RandomForestRegressor(max_depth=9, min_samples_split=3), 
               X=X, y=y, scoring='neg_root_mean_squared_error', cv=10, verbose = 3,  n_jobs= -1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.0s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


In [16]:
rmse_ = abs(results["test_score"].mean())
rmse_

105138.69406916939