# Random Forest Algorithm

In [1]:
import pandas as pd
import numpy as np
import requests
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,explained_variance_score

Importamos el csv de train que hemos limpiado 

In [2]:
diamonds_train_dummies = pd.read_csv("output/diamonds_train_dummies.csv")
diamonds_train_dummies.head()

Unnamed: 0,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.12,61.6,59.0,6.67,6.63,4.1,5363,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1.14,60.0,54.0,6.74,6.97,4.11,5593,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.9,60.3,63.0,6.12,6.22,3.72,3534,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.71,61.9,54.0,5.74,5.76,3.56,3212,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0.34,60.0,62.0,4.51,4.55,2.72,447,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Establezco el Ground Truth separando las columnas y el precio

In [3]:
columnas = [a for a in diamonds_train_dummies.columns if a not in ["price"]]
X = diamonds_train_dummies[columnas]
y = diamonds_train_dummies["price"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 26) (8091, 26) (32364,) (8091,)


## Random Forest Regressor

In [5]:
model = RandomForestRegressor(n_estimators=500,max_features="auto",max_depth=35,n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Métricas

In [6]:
print("R2_score",r2_score(y_test, y_pred))
print("Mean squared error",mean_squared_error(y_test, y_pred)**.5)
print("Mean absolute error",mean_absolute_error(y_test, y_pred))
print("Explained variance score",explained_variance_score(y_test, y_pred))

R2_score 0.979016956657013
Mean squared error 577.0783088314564
Mean absolute error 284.6491883473492
Explained variance score 0.979016995441068


In [7]:
scores = cross_val_score(model,X,y, cv=10)
print(np.mean(scores))

0.9805057229123673


### Entrenamos todo el modelo al 100%

In [8]:
model = RandomForestRegressor(n_estimators=400,max_features=7,max_depth=35)
model.fit(X, y)

RandomForestRegressor(max_depth=35, max_features=7, n_estimators=400)

### Predecir con el test

In [9]:
diamonds_test_dummies = pd.read_csv("output/diamonds_test_dummies.csv")
diamonds_train_dummies.head()

Unnamed: 0,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.12,61.6,59.0,6.67,6.63,4.1,5363,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1.14,60.0,54.0,6.74,6.97,4.11,5593,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.9,60.3,63.0,6.12,6.22,3.72,3534,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.71,61.9,54.0,5.74,5.76,3.56,3212,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0.34,60.0,62.0,4.51,4.55,2.72,447,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
model_test = model.predict(diamonds_test_dummies)

### Exporto csv para submit a Kaggle

In [11]:
RandomForest_test=pd.DataFrame(model_test).reset_index().rename(columns={"index":"id",0:"price"})

In [12]:
RandomForest_test.to_csv("output/RandomForest_test.csv",index=False)