Evaluando data sin feature engienering

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/semiprocessed_data.csv')

In [3]:
df.head()

Unnamed: 0,Price,Year,Mileage,Make,Model
0,8995,2014,0.012506,1,1194
1,10888,2013,0.006863,1,1193
2,8995,2013,0.017102,1,1194
3,10999,2014,0.013976,1,1193
4,14799,2016,0.007751,1,1196


Hacemos un modelo de regresion Lineal

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

X = df.drop('Price', axis=1)
y = df['Price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)


y_pred_lr = lr_model.predict(X_test)


mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("MSE:", mse_lr)
print("MAE:", mae_lr)
print("R2:",r2_lr)

MSE: 145983497.76789597
MAE: 7661.274870299753
R2: 0.20599193912569602


Exportamos el Modelo

In [7]:
import joblib


joblib.dump(lr_model, 'linear_regresion_model.pkl')

['linear_regresion_model.pkl']

Hacemos ahora un modelo de Decision Tree Regressor

In [8]:
tm_model = DecisionTreeRegressor(random_state=42)
tm_model.fit(X_train, y_train)

y_pred_tm = tm_model.predict(X_test)

mse_tm = mean_squared_error(y_test, y_pred_tm)
mae_tm = mean_absolute_error(y_test, y_pred_tm)
r2_tm = r2_score(y_test, y_pred_tm)

print("MSE:", mse_tm)
print("MAE:", mae_tm)
print("R2 Score:", r2_tm)

MSE: 34658418.9380901
MAE: 3043.860891107971
R2 Score: 0.8114919533045053


Exportamos el Modelo

In [6]:
import joblib


joblib.dump(tm_model, 'decision_tree_model.pkl')


['decision_tree_model.pkl']

Agregamos hiperarámetros con Grid Search al modelo de Decision Tree para mejorar su rendimiento

In [9]:
from sklearn.model_selection import GridSearchCV

# Definir los hiperparámetros a probar
param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}

# Crear el modelo base
base_model = DecisionTreeRegressor(random_state=42)

# Configurar el GridSearch
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,  # Validación cruzada con 3 divisiones
    verbose=2,
    n_jobs=-1
)

# Ajustar el modelo con los datos
grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

# Usar el mejor modelo
best_model = grid_search.best_estimator_

# Evaluar el modelo optimizado
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE del modelo optimizado: {mse}")
print(f"MAE del modelo optimizado: {mae}")
print(f"R2 del modelo optimizado: {r2}")


Fitting 3 folds for each of 135 candidates, totalling 405 fits
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.6s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=20; total time=   0.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.6s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=20; total time=   0.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.6s
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=10; total time=   0.4s
[CV] END max_depth=5, 

Exportamos el Modelo

In [10]:
import joblib

# Guarda el modelo optimizado
joblib.dump(best_model, 'decision_tree_optimized.pkl')


['decision_tree_optimized.pkl']

Hacemos un Modelo Random Forest

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Dividir los datos en conjuntos de entrenamiento y prueba
X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar el modelo Random Forest
rf_model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=20)
rf_model.fit(X_train, y_train)

# Realizar predicciones
y_pred_rf = rf_model.predict(X_test)

# Evaluar el modelo
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("MSE del modelo Random Forest:", mse_rf)
print("MAE del modelo Random Forest:", mae_rf)
print("R2 del modelo Random Forest:", r2_rf)


MSE del modelo Random Forest: 19994500.73442563
MAE del modelo Random Forest: 2354.223406438945
R2 del modelo Random Forest: 0.8912493877798942


Exportamos el modelo Random Forest

In [8]:
import joblib

# Guarda el modelo optimizado
joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

Vamos a hacer un modelo SVR con un sample del DF ya que es muy pesado y pudiera tomar mucho tiempo en dar resultados

In [8]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [6]:
df_sample = df.sample(frac=0.1, random_state=42)  # Usa solo el 10% del dataset
X_sample = df_sample.drop('Price', axis=1)
y_sample = df_sample['Price']

X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)


In [10]:
# Usar una muestra del dataset
df_sample = df.sample(frac=0.1, random_state=42)
X_sample = df_sample.drop('Price', axis=1)
y_sample = df_sample['Price']

# Dividir datos
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

# Crear y entrenar el modelo
svr_model = SVR(kernel='linear', C=1.0, epsilon=0.2)  # Ajusta los parámetros
svr_model.fit(X_train_sample, y_train_sample)

# Predicciones y métricas
y_pred_svr = svr_model.predict(X_test_sample)
mse_svr = mean_squared_error(y_test_sample, y_pred_svr)
mae_svr = mean_absolute_error(y_test_sample, y_pred_svr)
r2_svr = r2_score(y_test_sample, y_pred_svr)

print("MSE del modelo SVR:", mse_svr)
print("MAE del modelo SVR:", mae_svr)
print("R2 del modelo SVR:", r2_svr)


MSE del modelo SVR: 162822816.2075455
MAE del modelo SVR: 7542.502757403379
R2 del modelo SVR: 0.12330095652824247
