### Análisis Exploratorio 


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm


In [None]:
# Read the training data from train.csv and select seaborn's "whitegrid"
# style for the plots drawn afterwards.
train_df = pd.read_csv("train.csv")
sns.set_style("whitegrid")

In [None]:
# Column dtypes and non-null counts; DataFrame.info() writes to stdout itself.
print("Información general del dataset:")
train_df.info()

# Summary statistics as produced by DataFrame.describe().
print("\nResumen estadístico:", train_df.describe(), sep="\n")

In [None]:
# Number of missing entries per column, restricted to columns that have at
# least one, ordered from most to least missing.
missing_values = (
    train_df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print("\nValores nulos por columna:", missing_values, sep="\n")

In [None]:
# Categorical columns whose missing entries are replaced by the string "None".
categorical_features = [
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
]

# Numeric columns whose missing entries are replaced by the column median.
numerical_features = ["LotFrontage", "GarageYrBlt", "MasVnrArea"]

train_df[categorical_features] = train_df[categorical_features].fillna("None")

# DataFrame.median() returns one value per column; fillna matches each value
# to its own column by label.
train_df[numerical_features] = train_df[numerical_features].fillna(
    train_df[numerical_features].median()
)
    


In [None]:
# Replace the sale price by log(1 + price).
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

# Integer-encode each listed categorical column with its own LabelEncoder and
# keep the fitted encoders, keyed by column name.
label_encoders = {}
for column_name in categorical_features:
    encoder = LabelEncoder()
    train_df[column_name] = encoder.fit_transform(train_df[column_name])
    label_encoders[column_name] = encoder


In [None]:
# 4. CORRELATION ANALYSIS
# Pairwise correlations between the numeric columns; only the SalePrice column
# is drawn, ordered from highest to lowest correlation.
corr_matrix = train_df.corr(numeric_only=True)
saleprice_corr = corr_matrix[['SalePrice']].sort_values(by='SalePrice', ascending=False)

plt.figure(figsize=(10, 8))
sns.heatmap(saleprice_corr, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlación de Variables con SalePrice")
plt.show()

In [None]:
# Keep the variables whose absolute correlation with SalePrice exceeds the
# threshold, strongest first (SalePrice itself is part of the listing).
correlation_threshold = 0.5
strong_corr_vars = (
    corr_matrix["SalePrice"]
    .abs()
    .sort_values(ascending=False)
    .loc[lambda strength: strength > correlation_threshold]
)
print("Variables con mayor correlación con SalePrice:", strong_corr_vars, sep="\n")

In [None]:
# 5. DATASET SPLIT
# Target is SalePrice; predictors are every other column except Id.
y = train_df["SalePrice"]
X = train_df.drop(["SalePrice", "Id"], axis=1)

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print("Tamaño de los conjuntos de datos:")
print(f"Entrenamiento: {X_train.shape}, Prueba: {X_test.shape}")



In [None]:
# Columns still stored as strings after the label-encoding step; these are
# one-hot encoded, every other column is passed through unchanged.
categorical_features = X_train.select_dtypes(include=['object']).columns

column_transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)],
    remainder='passthrough',
    # Keep plain feature names ("GrLivArea", "MSZoning_RL") instead of the
    # "remainder__" / "cat__" prefixed ones, so later lookups by the original
    # column name (e.g. X_train_transformed["GrLivArea"]) resolve.
    verbose_feature_names_out=False,
)

# The encoder is fitted on the training rows only and reused for the test rows.
train_array = column_transformer.fit_transform(X_train)
test_array = column_transformer.transform(X_test)
transformed_columns = column_transformer.get_feature_names_out()

# Reuse the original row index: a default 0..n-1 index would no longer match
# y_train / y_test, which breaks index-aware consumers such as statsmodels.
X_train_transformed = pd.DataFrame(train_array, columns=transformed_columns, index=X_train.index)
X_test_transformed = pd.DataFrame(test_array, columns=transformed_columns, index=X_test.index)

print("Transformación de datos completada correctamente")

print("Preprocesamiento completado")

# Análisis de relaciones con la variable respuesta

### Análisis de Relación con Gráficos de Dispersión

In [None]:
# One scatter plot of SalePrice against each selected predictor, laid out on
# a 2x3 grid.
top_vars = ["GrLivArea", "GarageArea", "TotalBsmtSF", "1stFlrSF", "OverallQual", "YearBuilt"]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, var in zip(axes, top_vars):
    sns.scatterplot(data=train_df, x=var, y="SalePrice", ax=ax)
    ax.set_title(f"{var} vs SalePrice")

fig.tight_layout()
plt.show()


### Análisis de Relación con Boxplots

In [None]:
# Distribution of SalePrice within each OverallQual level.
plt.figure(figsize=(12, 6))
sns.boxplot(x=train_df["OverallQual"], y=train_df["SalePrice"])
plt.xlabel("Calidad de Construcción (OverallQual)")
# SalePrice was overwritten with log1p(SalePrice) earlier in the notebook, so
# the axis shows log-scale values rather than raw USD amounts.
plt.ylabel("Precio de Venta (log1p de USD)")
plt.title("Relación entre Calidad de Construcción y Precio de Venta")
plt.show()

### Regresión Lineal para evaluar impacto de variables

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Standardise the features, then fit an ordinary least-squares regression.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("regression", LinearRegression())
])

pipeline.fit(X_train_transformed, y_train)

# coef_ holds one entry per column of the matrix the model was fitted on, so
# the names must come from X_train_transformed. The raw X has fewer columns,
# in a different order, and zip() would silently mislabel and truncate.
# The coefficients refer to the standardised features.
coef_dict = dict(zip(X_train_transformed.columns, pipeline.named_steps["regression"].coef_))

print("\nCoeficientes de la regresión lineal:")
for var, coef in coef_dict.items():
    print(f"{var}: {coef:.2f}")


### Árbol de Decisión para relaciones no lineales

In [None]:
# Shallow regression tree used to rank features. The seed is fixed, as for the
# train/test split and the random forest, so that the importances and the
# reported metrics are reproducible from run to run.
tree = DecisionTreeRegressor(max_depth=4, random_state=42)
tree.fit(X_train_transformed, y_train)

importances = tree.feature_importances_
feature_names = X_train_transformed.columns

# Ten features with the highest importance, most important first.
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=importance_df["Feature"], y=importance_df["Importance"], palette="Blues_r")

# Rotate the tick labels so that long feature names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.title("Top 10 Variables más Importantes según Árbol de Decisión")
plt.xlabel("Variable")
plt.ylabel("Importancia")
plt.show()


In [None]:
# Histogram with a KDE overlay for each selected predictor, on a 3x2 grid.
top_vars = ["GrLivArea", "GarageArea", "TotalBsmtSF", "1stFlrSF", "OverallQual", "YearBuilt"]

fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.ravel()

for ax, var in zip(axes, top_vars):
    sns.histplot(data=train_df, x=var, bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribución de {var}')

fig.tight_layout()
plt.show()


In [None]:
def evaluar_modelo(modelo, X_test_transformed, y_test, nombre_modelo):
    """Print R2, RMSE and MAE of a fitted model on the given evaluation data.

    Args:
        modelo: Fitted estimator exposing ``predict``.
        X_test_transformed: Feature matrix passed to ``modelo.predict``.
        y_test: True target values compared against the predictions.
        nombre_modelo: Label printed in front of the metrics.

    Returns:
        None. A single summary line is written to stdout.
    """
    predicciones = modelo.predict(X_test_transformed)

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_test, predicciones)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predicciones)
    r2 = r2_score(y_test, predicciones)

    print(f"{nombre_modelo} - R2: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")
# Report test-set metrics for the linear pipeline and the decision tree.
for fitted_model, model_label in ((pipeline, "Regresión Lineal"), (tree, "Árbol de Decisión")):
    evaluar_modelo(fitted_model, X_test_transformed, y_test, model_label)

In [None]:
# Random forest with 100 trees of depth at most 10 and a fixed seed;
# fit() returns the estimator itself, so it can be bound in one statement.
rf = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100).fit(
    X_train_transformed, y_train
)

evaluar_modelo(rf, X_test_transformed, y_test, "Random Forest")

In [None]:
# Support vector machines are not scale-invariant: the RBF kernel works on
# distances between rows, so unscaled columns with large ranges dominate it.
# Standardising inside a pipeline learns the scaling from the training rows
# only and reapplies it automatically at predict time.
# gamma="scale" derives the kernel width from the number and variance of the
# standardised features instead of using a fixed 0.1 across the whole one-hot
# matrix.
# NOTE(review): C and gamma were not re-tuned here — confirm with a search.
svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100, gamma="scale"))
svr.fit(X_train_transformed, y_train)

evaluar_modelo(svr, X_test_transformed, y_test, "SVR")

In [None]:
# Degree-2 polynomial expansion of all transformed features followed by a
# linear regression; Pipeline.fit() returns the fitted pipeline.
# NOTE(review): a degree-2 expansion grows quadratically with the number of
# input columns — confirm that expanding the full one-hot matrix is intended.
poly_expansion = PolynomialFeatures(degree=2)
poly_model = make_pipeline(poly_expansion, LinearRegression()).fit(
    X_train_transformed, y_train
)

evaluar_modelo(poly_model, X_test_transformed, y_test, "Regresión Polinomial")

## Modelo Univariado de Regresión Lineal

In [None]:
# Take the predictor from X_train rather than X_train_transformed: the
# transformed frame is built with column names from get_feature_names_out()
# and a fresh 0..n-1 index, so "GrLivArea" may not exist there and its rows
# would not line up with y_train. X_train shares y_train's index.
gr_liv_area = X_train["GrLivArea"]
y_uni = y_train

# OLS has no implicit intercept; add_constant prepends a column of ones.
X_uni = sm.add_constant(gr_liv_area)
model_uni = sm.OLS(y_uni, X_uni).fit()
y_pred_uni = model_uni.predict(X_uni)

# Draw the fitted line through x-sorted points so it is a single clean segment
# instead of a path that jumps back and forth between unsorted rows.
line_order = np.argsort(gr_liv_area.to_numpy())

plt.figure(figsize=(8, 6))
sns.scatterplot(x=gr_liv_area, y=y_train, alpha=0.5)
plt.plot(gr_liv_area.iloc[line_order], y_pred_uni.iloc[line_order], color='red')
plt.xlabel("GrLivArea")
plt.ylabel("SalePrice")
plt.title("Regresión Lineal Univariada: GrLivArea vs SalePrice")
plt.show()
print(model_uni.summary())
