# Exploración de Modelos

Importamos lo necesario

In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR as SupportVectorRegressor
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings('ignore')

In [2]:
# Switch IPython's exception reporting to the Minimal mode (shorter tracebacks).
%xmode minimal

Exception reporting mode: Minimal


Abrimos el archivo generado en la etapa EDA

In [None]:
# Load the modelling dataset written by the EDA stage (path is relative to the working directory).
df = pd.read_parquet('steam_data_model.parquet')

Definimos X e y

In [None]:
# Target: the price column.
y = df['price']
# Features: everything except the raw release date, the target itself and the developer name.
X = df.drop(columns=['release_date', 'price', 'developer'])

Dividimos en test y train

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Escalamos los datos (había probado sin escalar y la mejora fue marginal, pero mejora al fin)

In [None]:
# Learn the scaling statistics from the training split only, then apply
# the very same transformation to both splits.
sc_X = StandardScaler().fit(X_train_unscaled)
X_train = sc_X.transform(X_train_unscaled)
X_test = sc_X.transform(X_test_unscaled)

### Modelo Lineal

Instanciamos el modelo y ajustamos el set de entrenamiento

In [None]:
# Ordinary least-squares baseline, fitted on the scaled training features.
lineal = LinearRegression()
lineal.fit(X_train, y_train)

Observamos MSE, RMSE y R2

In [None]:
# Predict on both splits; the plotting cells below reuse these two arrays.
y_train_pred = lineal.predict(X_train)
y_test_pred = lineal.predict(X_test)

# mean_squared_error returns the plain MSE by default, so the `squared=True`
# keyword is omitted: it is deprecated since scikit-learn 1.4 and passing it
# fails on releases where the parameter has been removed.
mse_train_lineal = mean_squared_error(y_train, y_train_pred)
mse_test_lineal = mean_squared_error(y_test, y_test_pred)
print(f'El error cuadrático medio en Train: {mse_train_lineal}')
print(f'El error cuadrático medio en Test: {mse_test_lineal}')
print()
# RMSE is derived from the MSE values computed above.
rmse_train_lineal = np.sqrt(mse_train_lineal)
rmse_test_lineal = np.sqrt(mse_test_lineal)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_lineal}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_lineal}')
print()
# R2 is reported on the held-out test split only.
r2_lineal = r2_score(y_test, y_test_pred)
print('El coeficiente de determinación del regresor es:', r2_lineal)

Observo la validación cruzada

In [None]:
# 10-fold cross-validation of the linear model on the training split.
scores = cross_val_score(lineal, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

In [None]:
# 10-fold cross-validation run on the test split.
# NOTE(review): cross_val_score re-fits the estimator on folds of the data it
# is given, so this does not score the model trained above — confirm intended.
scores = cross_val_score(lineal, X_test, y_test, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

In [None]:
# Predicted vs. actual prices on the training split, plus a fitted trend line.
fig, ax = plt.subplots(figsize=(17, 12))

sns.scatterplot(x=y_train, y=y_train_pred, label='Predicciones Train', color='magenta', alpha=0.2, ax=ax)
sns.regplot(
    x=y_train,
    y=y_train_pred,
    label='Regresión Lineal Train',
    scatter=False,
    color='cyan',
    line_kws={'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)

ax.set_xlabel('Y Real')
ax.set_ylabel('Y Predicha')
ax.set_title('Valores predichos vs. reales y línea de regresión en Train')
ax.legend()

plt.show()

In [None]:
# Predicted vs. actual prices on the test split, plus a fitted trend line.
fig, ax = plt.subplots(figsize=(17, 12))

sns.scatterplot(x=y_test, y=y_test_pred, label='Predicciones Test', color='magenta', alpha=0.2, ax=ax)
sns.regplot(
    x=y_test,
    y=y_test_pred,
    label='Regresión Lineal Test',
    scatter=False,
    color='cyan',
    line_kws={'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)

ax.set_xlabel('Y Real')
ax.set_ylabel('Y Predicha')
ax.set_title('Valores predichos vs. reales y línea de regresión en Test')
ax.legend()

plt.show()

In [None]:
# Prediction error (predicted minus actual) against the predicted value.
# NOTE(review): sns.residplot regresses y on x and draws the residuals of that
# fit; y here is already an error term — confirm this is the intended view.
fig, ax = plt.subplots(figsize=(17, 12))
sns.residplot(
    x=y_test_pred,
    y=y_test_pred - y_test,
    lowess=True,
    color='magenta',
    scatter_kws={'alpha': 0.2},
    line_kws={'color': 'cyan', 'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

In [None]:
# Distribution of the test prediction error (predicted minus actual).
fig, ax = plt.subplots(figsize=(17, 12))
sns.histplot(y_test_pred - y_test, bins=20, kde=True, color='purple', ax=ax)
ax.set_title('Regresión Lineal: Distribución de Error')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
plt.show()

En todos los modelos subsiguientes repetimos los pasos de instanciamiento, ajuste, y observación de métricas para evaluar el rendimiento de los mismos; así como gráficos mostrando el rendimiento y otras características del modelo.

### Modelo Lineal con características polinomiales

In [None]:
# This model needs two preprocessing steps (degree-2 polynomial expansion,
# then scaling), so a second StandardScaler is fitted on the expanded features.
poly = PolynomialFeatures(degree=2).fit(X_train_unscaled)
X_train_poly = poly.transform(X_train_unscaled)
X_test_poly = poly.transform(X_test_unscaled)

# Scaling statistics come from the expanded training features only.
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

lineal_poly = LinearRegression()
lineal_poly.fit(X_train_scaled, y_train)

Probé con 2, 3, 4. El elegido fue 2, ya en 3 los valores no tenían lógica (y 4 nunca terminó de ejecutar)

In [None]:
# Predictions of the polynomial model on its own (expanded + scaled) features.
y_poly_train_pred = lineal_poly.predict(X_train_scaled)
y_poly_test_pred = lineal_poly.predict(X_test_scaled)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_lineal_poly = mean_squared_error(y_train, y_poly_train_pred)
mse_test_lineal_poly = mean_squared_error(y_test, y_poly_test_pred)
print(f'El error cuadrático medio en Train: {mse_train_lineal_poly}')
print(f'El error cuadrático medio en Test: {mse_test_lineal_poly}')
print()
# RMSE is derived from the MSE values computed above.
rmse_train_lineal_poly = np.sqrt(mse_train_lineal_poly)
rmse_test_lineal_poly = np.sqrt(mse_test_lineal_poly)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_lineal_poly}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_lineal_poly}')
print()
# R2 is reported on the held-out test split only.
r2_lineal_poly = r2_score(y_test, y_poly_test_pred)
print('El coeficiente de determinación del regresor es:', r2_lineal_poly)

Observo la cross validation

In [None]:
# lineal_poly was trained on the scaled polynomial features, so it has to be
# cross-validated on that same matrix (X_train_scaled). X_train only holds the
# scaled original features and would evaluate a plain linear model instead.
scores = cross_val_score(lineal_poly, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

In [None]:
# Use the polynomial test matrix (X_test_scaled): X_test lacks the polynomial
# terms, so the previous call evaluated a plain linear model.
# NOTE(review): cross_val_score re-fits on folds of the test split, so this
# does not score the model trained above — confirm intended.
scores = cross_val_score(lineal_poly, X_test_scaled, y_test, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

In [None]:
# Polynomial model: predicted vs. actual on the training split, with a trend line.
fig, ax = plt.subplots(figsize=(17, 12))

sns.scatterplot(x=y_train, y=y_poly_train_pred, label='Predicciones Train', color='magenta', alpha=0.2, ax=ax)
sns.regplot(
    x=y_train,
    y=y_poly_train_pred,
    label='Regresión Lineal Train',
    scatter=False,
    color='cyan',
    line_kws={'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)

ax.set_xlabel('Y Real')
ax.set_ylabel('Y Predicha')
ax.set_title('Valores predichos vs. reales y línea de regresión en train')
ax.legend()

plt.show()

In [None]:
# Polynomial model: predicted vs. actual on the test split, with a trend line.
fig, ax = plt.subplots(figsize=(17, 12))

sns.scatterplot(x=y_test, y=y_poly_test_pred, label='Predicciones Test', color='magenta', alpha=0.2, ax=ax)
sns.regplot(
    x=y_test,
    y=y_poly_test_pred,
    label='Regresión Lineal Test',
    scatter=False,
    color='cyan',
    line_kws={'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)

ax.set_xlabel('Y Real')
ax.set_ylabel('Y Predicha')
ax.set_title('Valores predichos vs. reales y línea de regresión en test')
ax.legend()

plt.show()

In [None]:
# Polynomial model: prediction error (predicted minus actual) vs. predicted value.
# NOTE(review): sns.residplot regresses y on x and draws the residuals of that
# fit; y here is already an error term — confirm this is the intended view.
fig, ax = plt.subplots(figsize=(17, 12))
sns.residplot(
    x=y_poly_test_pred,
    y=y_poly_test_pred - y_test,
    lowess=True,
    color='magenta',
    scatter_kws={'alpha': 0.2},
    line_kws={'color': 'cyan', 'linestyle': '--', 'alpha': 0.7},
    ax=ax,
)
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

In [None]:
# Polynomial model: distribution of the test prediction error.
fig, ax = plt.subplots(figsize=(17, 12))
sns.histplot(y_poly_test_pred - y_test, bins=20, kde=True, color='purple', ax=ax)
ax.set_title('Regresión Polinómica: Distribución de Error')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
plt.show()

### Modelo de Árbol (Tree)

Exploramos profundidades

In [None]:
# Candidate tree depths: 1 through 9.
profundidades = np.arange(1, 10, 1)

# 5-fold cross-validation per depth, run on the TRAINING split only: searching
# over the full X, y would let the held-out test rows influence the choice of
# depth. A fixed random_state makes the tree construction reproducible.
tree_cv_results = [
    cross_validate(
        DecisionTreeRegressor(max_depth=profundidad, random_state=42),
        X_train, y_train, cv=5, return_train_score=True, n_jobs=-1,
    )
    for profundidad in profundidades
]

# Mean and spread of the default estimator score across folds, per depth,
# for the in-fold (train) and out-of-fold (test) portions.
tree_train_scores_mean = np.array([res['train_score'].mean() for res in tree_cv_results])
tree_train_scores_std = np.array([res['train_score'].std() for res in tree_cv_results])
tree_test_scores_mean = np.array([res['test_score'].mean() for res in tree_cv_results])
tree_test_scores_std = np.array([res['test_score'].std() for res in tree_cv_results])

Graficamos

In [None]:
# Mean cross-validation score per depth with a +/- one standard deviation band.
fig, ax = plt.subplots(figsize=(17, 12))

sns.lineplot(x=profundidades, y=tree_train_scores_mean, marker='o', color='magenta', label='Training score', ax=ax)
ax.fill_between(
    profundidades,
    tree_train_scores_mean - tree_train_scores_std,
    tree_train_scores_mean + tree_train_scores_std,
    alpha=0.1,
    color="magenta",
)

sns.lineplot(x=profundidades, y=tree_test_scores_mean, marker='o', color='cyan', label='Test score', ax=ax)
ax.fill_between(
    profundidades,
    tree_test_scores_mean - tree_test_scores_std,
    tree_test_scores_mean + tree_test_scores_std,
    alpha=0.1,
    color="cyan",
)

ax.legend()
ax.set_ylabel('Score')
ax.set_xlabel('Profundidad Árbol Decisión')
ax.set_title('Scores en Test y Train')
ax.grid(True)
plt.show()

Nos quedamos con 4, que es donde tenemos mejor rendimiento en test antes de que test y train comiencen a alejarse

In [None]:
# Depth-4 tree; random_state is fixed so that ties between equally good
# splits are broken the same way on every run (reproducible model).
tree = DecisionTreeRegressor(max_depth=4, random_state=42)

tree.fit(X_train, y_train)

In [None]:
# Decision-tree predictions on both splits.
y_train_pred_tree = tree.predict(X_train)
y_test_pred_tree = tree.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_tree = mean_squared_error(y_train, y_train_pred_tree)
mse_test_tree = mean_squared_error(y_test, y_test_pred_tree)
print(f'El error cuadrático medio en Train: {mse_train_tree}')
print(f'El error cuadrático medio en Test: {mse_test_tree}')
print()
# RMSE is derived from the MSE values computed above.
rmse_train_tree = np.sqrt(mse_train_tree)
rmse_test_tree = np.sqrt(mse_test_tree)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_tree}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_tree}')
print()
# R2 is reported on the held-out test split only.
r2_tree = r2_score(y_test, y_test_pred_tree)
print('El coeficiente de determinación del regresor es:', r2_tree)

In [None]:
# 10-fold cross-validation of the tree on the training split.
scores = cross_val_score(tree, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
# These values are MSE (no square root is taken), so label them as such.
print("Average MSE:", positive_scores.mean())

In [None]:
# 5-fold cross-validation of the tree on the training split.
# NOTE(review): the sibling sections run their second CV on the test split;
# this one repeats the training split with fewer folds — confirm intended.
scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
# These values are MSE (no square root is taken), so label them as such.
print("Average MSE:", positive_scores.mean())

In [None]:
# Gather actual and predicted test prices in one frame for plotting.
pred_vs_actual_tree = pd.DataFrame({'Real': y_test, 'Predicho': y_test_pred_tree})

fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(data=pred_vs_actual_tree, x='Real', y='Predicho', color='magenta', alpha=0.2, ax=ax)
# Dashed diagonal from the smallest to the largest actual price (predicted == actual).
ax.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='cyan', linestyle='--')
ax.set_title('Predicho vs. Real (Decision Tree Regressor)')
ax.set_xlabel('Real')
ax.set_ylabel('Predicho')
plt.show()

Observamos los valores con un residual plot

In [None]:
# Residuals on the test split (actual minus predicted).
residuals_tree = y_test - y_test_pred_tree

# Single-column frame holding those residuals.
residuals_df_tree = pd.DataFrame({'Residuals': residuals_tree})

# Residuals against predicted values, with a zero reference line.
fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(x=y_test_pred_tree, y=residuals_tree, color='magenta', alpha=0.2, ax=ax)
ax.axhline(y=0, color='cyan', linestyle='--')
ax.set_title('Residual Plot (Decision Tree Regressor)')
ax.set_xlabel('Predicho')
ax.set_ylabel('Residuales')
plt.show()

### Modelo K-Vecinos

Observamos la curva de aprendizaje

In [None]:
# Candidate neighbourhood sizes: 1, then 5 to 100 in steps of 5.
k_values = [1] + list(range(5, 101, 5))
train_errors = []
test_errors = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Score the predictions directly instead of storing them in y_train_pred /
    # y_test_pred: those names hold the linear model's predictions, and
    # overwriting them would corrupt its plots if those cells are re-run.
    train_errors.append(mean_squared_error(y_train, knn.predict(X_train)))
    test_errors.append(mean_squared_error(y_test, knn.predict(X_test)))

In [None]:
# Train and test MSE as a function of the number of neighbours.
fig, ax = plt.subplots(figsize=(17, 12))
sns.lineplot(x=k_values, y=train_errors, marker='o', color='magenta', label='Train', ax=ax)
sns.lineplot(x=k_values, y=test_errors, marker='o', color='cyan', label='Test', ax=ax)
ax.set_title('Learning Curve - KNeighborsRegressor')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Mean Squared Error')
ax.legend()
plt.show()

Elegimos 50, ya que en ese punto la distancia entre test y train es la menor, antes de que comience a subir el MSE.

In [None]:
# Final K-neighbours model with k=50, fitted on the scaled training features.
neighbors =  KNeighborsRegressor(n_neighbors=50)

neighbors.fit(X_train, y_train)

In [None]:
# K-neighbours predictions on both splits.
y_train_pred_nei = neighbors.predict(X_train)
y_test_pred_nei = neighbors.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_nei = mean_squared_error(y_train, y_train_pred_nei)
mse_test_nei = mean_squared_error(y_test, y_test_pred_nei)
print(f'El error cuadrático medio en Train: {mse_train_nei}')
print(f'El error cuadrático medio en Test: {mse_test_nei}')
print()
# RMSE is derived from the MSE values; the prints are labelled like the
# other model sections so the output is self-explanatory.
rmse_train_nei = np.sqrt(mse_train_nei)
rmse_test_nei = np.sqrt(mse_test_nei)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_nei}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_nei}')
print()
# R2 is reported on the held-out test split only.
r2_nei = r2_score(y_test, y_test_pred_nei)
print('El coeficiente de determinación del regresor es:', r2_nei)

In [None]:
# Predicted (x) vs. actual (y) values on the training split.
plt.figure(figsize=(17, 12))
sns.scatterplot(x=y_train_pred_nei, y=y_train, color='magenta', alpha=0.2)
plt.title('Predicho vs. Real K-Vecinos en Train')
# The x axis carries the predictions and the y axis the actual prices, so the
# labels follow that order (they were swapped before).
plt.xlabel('Predicho')
plt.ylabel('Real')
plt.show()

# Predicted (x) vs. actual (y) values on the test split.
plt.figure(figsize=(17, 12))
sns.scatterplot(x=y_test_pred_nei, y=y_test, color='purple', alpha=0.2)
plt.title('Predicho vs. Real K-Vecinos en Test')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()


# Residuals (actual minus predicted) against predictions, with a zero reference line.
residuals = y_test - y_test_pred_nei
plt.figure(figsize=(17, 12))
sns.scatterplot(x=y_test_pred_nei, y=residuals, color='orange', alpha=0.2)
plt.axhline(y=0, color='cyan', linestyle='--')
plt.title('Residual Plot K-Vecinos')
plt.xlabel('Predicho')
plt.ylabel('Residuales')
plt.show()

### Modelo Random Forest

Exploramos diferentes números de estimadores

In [None]:
# Candidate forest sizes: 1, then 5 to 195 in steps of 5.
n_estimators = [1] + list(range(5, 200, 5))
train_errors_forest = []
test_errors_forest = []

# Fit one forest per size and record its MSE on both splits.
for n in n_estimators:
    forest = RandomForestRegressor(n_estimators=n, random_state=42).fit(X_train, y_train)
    y_train_pred_forest = forest.predict(X_train)
    y_test_pred_forest = forest.predict(X_test)
    train_errors_forest.append(mean_squared_error(y_train, y_train_pred_forest))
    test_errors_forest.append(mean_squared_error(y_test, y_test_pred_forest))

# Train and test MSE as a function of the number of trees.
fig, ax = plt.subplots(figsize=(17, 12))
sns.lineplot(x=n_estimators, y=train_errors_forest, marker='o', color='magenta', label='Train', ax=ax)
sns.lineplot(x=n_estimators, y=test_errors_forest, marker='o', color='cyan', label='Test', ax=ax)
ax.set_title('Random Forest Regressor Learning Curve')
ax.set_xlabel('Number of Estimators')
ax.set_ylabel('Mean Squared Error')
ax.legend()
plt.show()

El MSE se estabiliza alrededor de 50

In [None]:
# Final random forest with 50 trees and a fixed seed, fitted on the training split.
forest = RandomForestRegressor(n_estimators=50, random_state=42)

forest.fit(X_train, y_train)

In [None]:
# Random-forest predictions on both splits.
y_train_pred_forest = forest.predict(X_train)
y_test_pred_forest = forest.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_forest = mean_squared_error(y_train, y_train_pred_forest)
mse_test_forest = mean_squared_error(y_test, y_test_pred_forest)
print(f'El error cuadrático medio en Train: {mse_train_forest}')
print(f'El error cuadrático medio en Test: {mse_test_forest}')
print()
# RMSE is derived from the MSE values; the prints are labelled like the
# other model sections so the output is self-explanatory.
rmse_train_forest = np.sqrt(mse_train_forest)
rmse_test_forest = np.sqrt(mse_test_forest)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_forest}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_forest}')
print()
# R2 is reported on the held-out test split only.
r2_forest = r2_score(y_test, y_test_pred_forest)
print('El coeficiente de determinación del regresor es:', r2_forest)

In [None]:
# Random forest: actual (x) vs. predicted (y) on the training split.
fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(x=y_train, y=y_train_pred_forest, color='magenta', alpha=0.2, ax=ax)
ax.set_title('Random Forest Regressor: Predicted vs. Actual Train')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# Random forest: actual (x) vs. predicted (y) on the test split.
fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(x=y_test, y=y_test_pred_forest, color='cyan', alpha=0.2, ax=ax)
ax.set_title('Random Forest Regressor: Predicted vs. Actual Test')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# Random forest: prediction error (predicted minus actual) vs. predicted value.
# NOTE(review): sns.residplot regresses y on x and draws the residuals of that
# fit; y here is already an error term — confirm this is the intended view.
fig, ax = plt.subplots(figsize=(17, 12))
sns.residplot(
    x=y_test_pred_forest,
    y=y_test_pred_forest - y_test,
    lowess=True,
    color='magenta',
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'cyan'},
    ax=ax,
)
ax.set_title('Random Forest Regressor: Residual Plot')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
plt.show()

### Modelo Support Vector

Elegimos el valor de C para la variante lineal

In [None]:
# Linear-kernel SVR whose regularisation strength C is tuned below.
vector_test = SupportVectorRegressor(kernel='linear')

# Candidate values for C.
search_space = {
    'C': [0.01, 0.1, 1],
}

# 3-fold grid search. The scorer must be a regression metric: 'accuracy' is a
# classification score and cannot be computed on continuous targets, which
# leaves the search without usable scores to rank the candidates.
rsearch = GridSearchCV(
    vector_test, search_space,
    n_jobs=-1, scoring='neg_mean_squared_error', cv=3, verbose=0)


rsearch.fit(X_train,y_train)

In [None]:
# Show the parameter combination selected by the grid search.
rsearch.best_params_

In [None]:
# Linear-kernel SVR with C=0.01, fitted on the scaled training features.
# NOTE(review): C is hard-coded rather than read from rsearch.best_params_ —
# confirm it still matches the grid-search result.
vector_linear = SupportVectorRegressor(kernel='linear', C=0.01)

vector_linear.fit(X_train, y_train)

In [None]:
# Linear-kernel SVR predictions on both splits.
y_train_pred_vector_linear = vector_linear.predict(X_train)
y_test_pred_vector_linear = vector_linear.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_vector_linear = mean_squared_error(y_train, y_train_pred_vector_linear)
mse_test_vector_linear = mean_squared_error(y_test, y_test_pred_vector_linear)
print(f'El error cuadrático medio en Train: {mse_train_vector_linear}')
print(f'El error cuadrático medio en Test: {mse_test_vector_linear}')
print()
# RMSE is derived from the MSE values; the prints are labelled like the
# other model sections so the output is self-explanatory.
rmse_train_vector_linear = np.sqrt(mse_train_vector_linear)
rmse_test_vector_linear = np.sqrt(mse_test_vector_linear)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_vector_linear}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_vector_linear}')
print()
# R2 is reported on the held-out test split only.
r2_vector_linear = r2_score(y_test, y_test_pred_vector_linear)
print('El coeficiente de determinación del regresor es:', r2_vector_linear)

In [None]:
# 10-fold cross-validation of the linear-kernel SVR on the training split.
scores = cross_val_score(vector_linear, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
# These values are MSE (no square root is taken), so label them as such.
print("Average MSE:", positive_scores.mean())

In [None]:
# 10-fold cross-validation of the linear-kernel SVR on the test split.
# NOTE(review): cross_val_score re-fits on folds of the test split, so this
# does not score the model trained above — confirm intended.
scores = cross_val_score(vector_linear, X_test, y_test, cv=10, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
# These values are MSE (no square root is taken), so label them as such.
print("Average MSE:", positive_scores.mean())

In [None]:
# Linear-kernel SVR: predicted (x) vs. actual (y) on the training split.
fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(x=y_train_pred_vector_linear, y=y_train, color='magenta', alpha=0.2, ax=ax)
ax.set_title('Support Vector Regression (Linear Kernel): Predicted vs. Actual Train')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
plt.show()

In [None]:
# Linear-kernel SVR: predicted (x) vs. actual (y) on the test split.
fig, ax = plt.subplots(figsize=(17, 12))
sns.scatterplot(x=y_test_pred_vector_linear, y=y_test, color='cyan', alpha=0.2, ax=ax)
ax.set_title('Support Vector Regression (Linear Kernel): Predicted vs. Actual Test')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
plt.show()

In [None]:
# Linear-kernel SVR: prediction error (predicted minus actual) vs. predicted value.
# NOTE(review): sns.residplot regresses y on x and draws the residuals of that
# fit; y here is already an error term — confirm this is the intended view.
fig, ax = plt.subplots(figsize=(17, 12))
sns.residplot(
    x=y_test_pred_vector_linear,
    y=y_test_pred_vector_linear - y_test,
    lowess=True,
    color='magenta',
    scatter_kws={'alpha': 0.2},
    line_kws={'color': 'cyan'},
    ax=ax,
)
ax.set_title('Support Vector Regression (Linear Kernel): Residual Plot')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
plt.show()

Probamos la variante polinomial

In [None]:
# Polynomial-kernel SVR (C=0.1, epsilon=0.1), fitted on the scaled training features.
vector_poly = SupportVectorRegressor(kernel='poly', C=0.1, epsilon=0.1)

vector_poly.fit(X_train, y_train)

In [None]:
# Polynomial-kernel SVR predictions on both splits.
y_train_pred_vector_poly = vector_poly.predict(X_train)
y_test_pred_vector_poly = vector_poly.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_vector_poly = mean_squared_error(y_train, y_train_pred_vector_poly)
mse_test_vector_poly = mean_squared_error(y_test, y_test_pred_vector_poly)
print(f'El error cuadrático medio en Train: {mse_train_vector_poly}')
print(f'El error cuadrático medio en Test: {mse_test_vector_poly}')
print()
# RMSE is derived from the MSE values computed above.
rmse_train_vector_poly = np.sqrt(mse_train_vector_poly)
rmse_test_vector_poly = np.sqrt(mse_test_vector_poly)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_vector_poly}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_vector_poly}')
print()
# R2 is reported on the held-out test split only.
r2_vector_poly = r2_score(y_test, y_test_pred_vector_poly)
print('El coeficiente de determinación del regresor es:', r2_vector_poly)

Probamos con radial basis function

In [None]:
# RBF-kernel SVR (C=0.1, epsilon=0.1), fitted on the scaled training features.
vector_rbf = SupportVectorRegressor(kernel='rbf', C=0.1, epsilon=0.1)

vector_rbf.fit(X_train, y_train)

In [None]:
# RBF-kernel SVR predictions on both splits.
y_train_pred_vector_rbf = vector_rbf.predict(X_train)
y_test_pred_vector_rbf = vector_rbf.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_vector_rbf = mean_squared_error(y_train, y_train_pred_vector_rbf)
mse_test_vector_rbf = mean_squared_error(y_test, y_test_pred_vector_rbf)
print(f'El error cuadrático medio en Train: {mse_train_vector_rbf}')
print(f'El error cuadrático medio en Test: {mse_test_vector_rbf}')
print()
# RMSE is derived from the MSE values computed above.
rmse_train_vector_rbf = np.sqrt(mse_train_vector_rbf)
rmse_test_vector_rbf = np.sqrt(mse_test_vector_rbf)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_vector_rbf}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_vector_rbf}')
print()
# R2 is reported on the held-out test split only.
r2_vector_rbf = r2_score(y_test, y_test_pred_vector_rbf)
print('El coeficiente de determinación del regresor es:', r2_vector_rbf)

### Ensamble: Stacking con 3 modelos de mejor performance

In [None]:
# Base learners: the three models with the lowest error.
# StackingRegressor clones every base estimator and re-fits it on the X given
# to fit(). A bare LinearRegression under 'lineal_poly' would therefore be an
# exact duplicate of 'lineal', so the polynomial learner carries its own
# degree-2 expansion and scaling inside a pipeline.
estimator_list = [
    ('lineal', lineal),
    ('lineal_poly', make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())),
    ('vector_lineal', vector_linear)]

# Stack the base learners under a linear meta-model.
stack_model = StackingRegressor(
    estimators=estimator_list, final_estimator=LinearRegression())

stack_model.fit(X_train, y_train)

y_train_pred_stack = stack_model.predict(X_train)
y_test_pred_stack = stack_model.predict(X_test)

# MSE is the default output of mean_squared_error; the deprecated
# `squared=True` keyword (scikit-learn 1.4) is omitted so the cell keeps
# working on releases where the parameter has been removed.
mse_train_stack = mean_squared_error(y_train, y_train_pred_stack)
mse_test_stack = mean_squared_error(y_test, y_test_pred_stack)
print(f'El error cuadrático medio en Train: {mse_train_stack}')
print(f'El error cuadrático medio en Test: {mse_test_stack}')

print()
# RMSE is derived from the MSE values computed above.
rmse_train_stack = np.sqrt(mse_train_stack)
rmse_test_stack = np.sqrt(mse_test_stack)
print(f'Raíz del error cuadrático medio en Train: {rmse_train_stack}')
print(f'Raíz del error cuadrático medio en Test: {rmse_test_stack}')
print()
# R2 is reported on the held-out test split only.
r2_stack = r2_score(y_test, y_test_pred_stack)
print('El coeficiente de determinación del regresor es:', r2_stack)

In [None]:
# 5-fold cross-validation of the stacked model on the training split.
scores = cross_val_score(stack_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

In [None]:
# 5-fold cross-validation of the stacked model on the test split.
# NOTE(review): cross_val_score re-fits on folds of the test split, so this
# does not score the model trained above — confirm intended.
scores = cross_val_score(stack_model, X_test, y_test, cv=5, scoring='neg_mean_squared_error')

# The scorer returns the negated MSE, so flip the sign to read it as an error.
positive_scores = -scores

print("Cross-Validation Scores:", positive_scores)
print("Average MSE:", positive_scores.mean())

### Pruebas de predicciones y procesamiento de input

In [None]:
# Print the frame summary (column names, dtypes, non-null counts).
df.info()

Recopilamos los nombres de los géneros, y armamos un diccionario, iniciando los valores en 0.

In [None]:
# Take the columns at positions 6..25 as the genre columns.
# NOTE(review): positional slice — verify against df.info() if the schema changes.
df_columns = df.columns[6:26]
# One entry per selected column, all initialised to 0.
column_dict = {col: 0 for col in df_columns}

Simulamos un input de un usuario de nuestra API de predicción

In [None]:
# Sample user input with deliberately mixed casing.
genres = ['AdVenTure', 'AcTiON']

Asignamos 1 como valor a los géneros presentes en la lista, case insensitively (sin importar qué combinación de mayúsculas y minúsculas se haya usado en el input)

In [None]:
# Index the known genre columns by their lower-cased name so matching is truly
# case-insensitive. Normalising the input with str.capitalize() only matches
# names shaped like "Action"; it misses columns such as "RPG" or multi-word
# names like "Free to Play".
genre_lookup = {col.lower(): col for col in column_dict}

for genre in genres:
    matched_column = genre_lookup.get(genre.lower())
    # Inputs that match no known column are ignored.
    if matched_column is not None:
        column_dict[matched_column] = 1

Observamos el resultado

In [None]:
# Display the genre flags after processing the input.
column_dict

Simulamos un input de fecha, en formato YYYY-MM-DD string

In [None]:
# Sample release date given as a YYYY-MM-DD string; show its Python type.
release_date = "2012-09-07"
type(release_date)

Usamos pandas para que coincida el tipo con el que está en el dataframe.

In [None]:
# Parse the string with pandas and show the resulting type.
release_date = pd.to_datetime(release_date)
type(release_date)

Calculamos la fecha máxima del dataframe

In [None]:
# Latest release date present in the dataset; print it and show its type.
max_date = df['release_date'].max()
print(max_date)
type(max_date)

Calculamos el delta entre la fecha máxima y la ingresada y redondeamos a entero, obteniendo la edad del release en meses

In [None]:
# Floor-divide the elapsed time by a 30-day period: an approximate age in whole months.
release_age_months = (max_date - release_date) // pd.Timedelta(days=30)

In [None]:
# Display the computed age in months.
release_age_months

Simulamos un input de developer

In [None]:
# Sample developer input; it is compared verbatim against df['developer'] below.
developer = 'Rocksteady Studios,Feral Interactive (Mac)'

In [None]:
# Print the feature-frame summary (column names, dtypes, non-null counts).
X.info()

Obtenemos, según el input de developer, los valores developer_total y avg_dev

In [None]:
# Rows whose developer string equals the input exactly.
specific_developer_rows = df[df['developer'] == developer]

# Fail with a clear message for unknown developers instead of letting
# .iloc[0] raise an opaque out-of-bounds IndexError.
if specific_developer_rows.empty:
    raise ValueError(f'Developer not found in the dataset: {developer!r}')

# Read both developer-level features from the first matching row.
developer_total = specific_developer_rows['developer_total'].iloc[0]
avg_dev = specific_developer_rows['avg_dev'].iloc[0]

print(developer_total, avg_dev)

Armamos una primera lista

In [None]:
# Non-genre feature values.
# NOTE(review): presumably in the same order as the leading columns of X — verify with X.info().
lista_1 = [release_age_months, developer_total, avg_dev]

Segunda lista, con los valores del diccionario de géneros convertidos a lista (respetando el orden original del dataframe y modelo).

In [None]:
# Genre flags in dict insertion order, i.e. the order of the columns used to build column_dict.
lista_2 = list(column_dict.values())

Convertimos las listas en numpy arrays/vectores

In [None]:
# Convert both lists to NumPy arrays so they can be concatenated.
array1 = np.array(lista_1)
array2 = np.array(lista_2)

Concatenamos los arrays para obtener la X para las predicciones

In [None]:
# Single 1-D feature vector: non-genre values first, then the genre flags.
X_new = np.concatenate((array1, array2))

Aplicamos el mismo escalado que tiene el modelo

In [None]:
# Wrap the vector in a list so the scaler receives a one-row 2-D input.
X_new_scaled = sc_X.transform([X_new])

Aplicamos las características polinómicas para el modelo que las utiliza

In [None]:
# Same two steps used for the polynomial model: expand the raw vector, then scale it.
X_new_poly = poly.transform([X_new])
X_new_poly_scaled = scaler.transform(X_new_poly)

Usamos el modelo de Árbol

In [None]:
# Point chosen_model at the decision tree.
chosen_model = tree

Predecimos

In [None]:
# Predict for the single scaled input row and display the scalar result.
y_pred = chosen_model.predict(X_new_scaled)
y_pred[0]

Modelo de Ensamble Stacked

In [None]:
# Point chosen_model at the stacked ensemble.
chosen_model = stack_model

In [None]:
# Predict for the single scaled input row and display the scalar result.
y_pred = chosen_model.predict(X_new_scaled)
y_pred[0]

Usamos el modelo con características polinomiales (y usamos la X acondicionada para este)

In [None]:
# Point chosen_model at the polynomial linear model.
chosen_model = lineal_poly

Predecimos

In [None]:
# This model takes the polynomial-expanded, scaled input rather than X_new_scaled.
y_pred = chosen_model.predict(X_new_poly_scaled)
y_pred[0]

Tomamos el modelo SVR con kernel lineal

In [None]:
# Point chosen_model at the linear-kernel SVR.
chosen_model = vector_linear

Predecimos

In [None]:
# Predict for the single scaled input row and display the scalar result.
y_pred = chosen_model.predict(X_new_scaled)
y_pred[0]

Usamos el modelo clásico lineal

In [None]:
# Point chosen_model at the classic linear regression.
chosen_model = lineal

Predecimos

In [None]:
# Predict for the single scaled input row and display the scalar result.
y_pred = chosen_model.predict(X_new_scaled)
y_pred[0]

Guardamos los pre procesamientos en un diccionario

In [None]:
# Bundle the fitted preprocessing objects; only the feature scaler sc_X is stored.
preprocessing_steps = {
    'scaler': sc_X
}

Los almacenamos en un archivo de pickle para posterior uso

In [None]:
# Serialize the preprocessing dictionary to a pickle file in the working directory.
with open('preprocessing_steps.pkl', 'wb') as file:
    pickle.dump(preprocessing_steps, file)

Guardamos el RMSE en testeo del modelo lineal clásico (el modelo elegido) en un archivo

In [None]:
# Write the classic linear model's test RMSE (rmse_test_lineal) as plain text.
with open('rmse_model.txt', 'w') as file:
    file.write(str(rmse_test_lineal))

Guardamos el modelo elegido en un archivo de pickle.

In [None]:
# Serialize whichever estimator chosen_model currently references.
# NOTE(review): in a top-to-bottom run that should be the classic linear model — confirm before shipping the file.
with open('trained_model.pkl', 'wb') as file:
    pickle.dump(chosen_model, file)

Todos los modelos con base lineal tuvieron performances similares, por encima de los modelos no basados en el lineal. Por simplicidad del modelo, y un menor peso de archivo, elegimos el modelo clásico lineal.

Probamos los archivos

In [None]:
# Read the pickled model back; only unpickle files from a trusted source.
with open('trained_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
# Predict with the reloaded model on the same scaled input row.
y_pred_loaded = loaded_model.predict(X_new_scaled)

In [None]:
# Display the scalar prediction from the reloaded model.
y_pred_loaded[0]

In [None]:
# Read the stored RMSE back as text.
with open('rmse_model.txt', 'r') as file:
    rmse_retrieved = file.read()

In [None]:
# Convert the stored text back to a float and display it.
float(rmse_retrieved)