In [1]:
import warnings
warnings.filterwarnings('ignore')

## Separación en train y test (train/test split)

In [2]:
# Splits a dataset into a training set and a test set
from sklearn.model_selection import train_test_split

In [5]:
import numpy as np

# Seed NumPy's global RNG so the synthetic data below is identical on every
# Restart & Run All (per scikit-learn's documentation, splitters/estimators
# left with random_state=None also draw from this global generator).
SEED = 42
np.random.seed(SEED)

# Synthetic linear data: y = 4 + 3x + Gaussian noise, with x uniform in [0, 2).
# Both X and y are column vectors of shape (100, 1).
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

In [6]:
# Split the data into a training set and a test set.
# test_size=0.25 is scikit-learn's default (75% train / 25% test); it is
# spelled out here for clarity. random_state pins the shuffle so re-running
# this cell always yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [8]:
# Size of the training feature matrix, as (n_samples, n_features)
X_train.shape

(75, 1)

## Eligiendo un modelo

In [9]:
# From scikit-learn
# Linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# Fit on the training split only; the test split is kept for evaluation.
lin_reg.fit(X_train, y_train)


LinearRegression()

In [10]:
# Linear model trained with stochastic gradient descent
from sklearn.linear_model import SGDRegressor
# random_state pins the estimator's internal randomness so re-running this
# cell reproduces the same fit.
sgd_reg = SGDRegressor(random_state=42)
# SGDRegressor expects a 1-D target of shape (n_samples,). y_train is an
# (n_samples, 1) column vector, so flatten it explicitly instead of relying on
# the implicit conversion that emits a DataConversionWarning.
sgd_reg.fit(X_train, y_train.ravel())

SGDRegressor()

In [11]:
# Decision tree regressor with default hyperparameters
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
# Fit on the training split only
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [12]:
# Predict on the training set with each fitted model; these predictions feed
# the training RMSE computed afterwards.
lin_reg_predict, sgd_reg_predict, tree_reg_predict = (
    model.predict(X_train) for model in (lin_reg, sgd_reg, tree_reg)
)

In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error of each model's predictions against the training targets
lin_mse, sgd_mse, tree_mse = (
    mean_squared_error(y_train, predicted)
    for predicted in (lin_reg_predict, sgd_reg_predict, tree_reg_predict)
)

In [15]:
# RMSE = sqrt(MSE); printed in the order linear, SGD, decision tree
print("RMSE Entrenamiento: ", *(np.sqrt(mse) for mse in (lin_mse, sgd_mse, tree_mse)))

RMSE Entrenamiento:  1.034159499065231 1.0604620782048804 0.0


In [16]:
# Predict on the held-out test set with each fitted model; these predictions
# feed the test RMSE computed afterwards.
lin_reg_predict, sgd_reg_predict, tree_reg_predict = (
    model.predict(X_test) for model in (lin_reg, sgd_reg, tree_reg)
)

In [17]:
# Mean squared error of each model's predictions against the test targets
lin_mse, sgd_mse, tree_mse = (
    mean_squared_error(y_test, predicted)
    for predicted in (lin_reg_predict, sgd_reg_predict, tree_reg_predict)
)

In [18]:
# RMSE = sqrt(MSE); printed in the order linear, SGD, decision tree
print("RMSE Test: ", *(np.sqrt(mse) for mse in (lin_mse, sgd_mse, tree_mse)))

RMSE Test:  1.0071702221435936 1.0209165879177269 0.9842279276354351


## Cross Validation

In [19]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the training data: one
# score is produced per fold. The scorer reports the negated MSE.
scores = cross_val_score(
    estimator=tree_reg,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)

In [20]:
# Flip the sign back to a positive MSE, then take the root for per-fold RMSE
rmse_scores = np.sqrt(np.negative(scores))

In [21]:
# Report the per-fold RMSE values together with their mean and standard deviation
for label, value in (
    ("Scores: ", rmse_scores),
    ("Promedio: ", rmse_scores.mean()),
    ("Desvío estandar: ", rmse_scores.std()),
):
    print(label, value)

Scores:  [1.47750085 1.15524289 1.54587769 1.02659546 1.38524952 1.23765475
 1.1084716  1.29621445 1.25056447 1.57944124]
Promedio:  1.3062812900214367
Desvío estandar:  0.17766700800590873


In [22]:
# Same 10-fold cross-validation, this time for the linear regression model
scores = cross_val_score(
    estimator=lin_reg,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Flip the sign back to a positive MSE, then take the root for per-fold RMSE
rmse_scores = np.sqrt(np.negative(scores))


In [23]:
# Report the per-fold RMSE values together with their mean and standard deviation
for label, value in (
    ("Scores: ", rmse_scores),
    ("Promedio: ", rmse_scores.mean()),
    ("Desvío estandar: ", rmse_scores.std()),
):
    print(label, value)

Scores:  [0.90626351 0.94773761 1.02080725 1.10501338 0.67361137 1.30726316
 0.90960744 1.26872849 0.79393373 1.50613855]
Promedio:  1.0439104488011495
Desvío estandar:  0.2415911630923036
