In [1]:
# Everything this cell needs: data loading, the estimator, and the selector
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Read the Ames dataset, keep only the int64/float64 columns, then discard
# every column that still contains a missing value
Ames = (
    pd.read_csv("Ames.csv")
    .select_dtypes(include=["int64", "float64"])
    .dropna(axis=1)
)

# Target is the sale price; every remaining column is a candidate predictor
y = Ames["SalePrice"]
X = Ames.drop(columns="SalePrice")

# Wrap a plain linear regression in a sequential selector that keeps one feature
model = LinearRegression()
sfs = SequentialFeatureSelector(model, n_features_to_select=1)  # uses cv=5 by default
sfs.fit(X, y)

# Translate the selector's support mask back into a column name and report it
selected_feature = X.columns[sfs.get_support()]
print("Feature selected for highest predictability:", selected_feature[0])

Feature selected for highest predictability: OverallQual


In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Load the Ames dataset restricted to int64/float64 columns without missing values
Ames = pd.read_csv("Ames.csv").select_dtypes(include=["int64", "float64"]).dropna(axis=1)
y = Ames["SalePrice"]               # Target variable
X = Ames.drop(columns="SalePrice")  # Candidate features
model = LinearRegression()

# Mean 5-fold CV R^2 of a one-feature linear regression, keyed by feature name
feature_scores = {
    name: cross_val_score(model, X[[name]], y, cv=5).mean()
    for name in X.columns
}

# Rank the (feature, score) pairs from the highest mean score to the lowest
sorted_features = sorted(feature_scores.items(), key=lambda pair: pair[1], reverse=True)

# Report the three best-scoring single features
top_3 = sorted_features[:3]
for feature, score in top_3:
    print(f"Feature: {feature}, Mean CV R^2: {score:.4f}")

Feature: OverallQual, Mean CV R^2: 0.6183
Feature: GrLivArea, Mean CV R^2: 0.5127
Feature: 1stFlrSF, Mean CV R^2: 0.3957


In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Load the Ames dataset restricted to int64/float64 columns without missing values
Ames = pd.read_csv("Ames.csv").select_dtypes(include=["int64", "float64"]).dropna(axis=1)

# Engineer an interaction feature: the product of OverallQual and GrLivArea
Ames['QualityArea'] = Ames['OverallQual'] * Ames['GrLivArea']

# The engineered column is the only predictor evaluated in this cell;
# SalePrice remains the target
X = Ames[['QualityArea']]
y = Ames['SalePrice']

# 5-fold cross-validation of a linear regression on the single engineered feature
model = LinearRegression()
cv_scores = cross_val_score(model, X, y, cv=5)

# Average the per-fold scores into one summary number and report it
mean_cv_score = cv_scores.mean()
print(f"Mean CV R^2 score using 'Quality Weighted Area': {mean_cv_score:.4f}")

Mean CV R^2 score using 'Quality Weighted Area': 0.7484
