In [1]:
# Set up to obtain CV model performance and coefficient using k-fold
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Load the Ames housing data; a single predictor (GrLivArea) is regressed
# against SalePrice.
Ames = pd.read_csv("Ames.csv")
X = Ames[["GrLivArea"]].values  # double brackets keep X as a 2D matrix
y = Ames["SalePrice"].values    # 1D target vector

model = LinearRegression()
kf = KFold(n_splits=5)  # no shuffle or seed is requested, so the split is deterministic
coefs = []   # fitted coefficient array from each fold
scores = []  # held-out R^2 from each fold

# Manually perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # fit() re-estimates the model on every call, so one instance can be reused
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
    coefs.append(model.coef_)

mean_score = np.mean(scores)
print(f"Mean CV R^2 = {mean_score:.4f}")
# Only one feature is used, so each coef_ holds a single slope and a flat mean
# over the collected arrays is the average slope across folds.
mean_coefs = np.mean(coefs)
print(f"Mean Coefficient = {mean_coefs:.4f}")

Mean CV R^2 = 0.5127
Mean Coefficient = 110.5214


In [4]:
import pandas as pd
 
# Load the Ames housing data
Ames = pd.read_csv("Ames.csv")

# Number of sales and mean sale price per neighborhood, lowest mean first
neighbor_stats = (
    Ames.groupby("Neighborhood")["SalePrice"]
    .agg(["count", "mean"])
    .sort_values(by="mean")
)

# Round to whole numbers and cast to int so the table prints without decimals
rounded_stats = neighbor_stats.round(0).astype(int)
print(rounded_stats)

              count    mean
Neighborhood               
MeadowV          34   96836
BrDale           29  106095
IDOTRR           76  108103
BrkSide         103  126030
OldTown         213  126939
Edwards         165  133152
SWISU            42  133576
Landmrk           1  137000
Sawyer          139  137493
NPkVill          22  140743
Blueste          10  143590
NAmes           410  145087
Mitchel         104  162655
SawyerW         113  188102
Gilbert         143  189440
NWAmes          123  190372
Greens            8  193531
Blmngtn          23  196237
CollgCr         236  198133
Crawfor          92  202076
ClearCr          40  213981
Somerst         143  228762
Timber           54  242910
Veenker          23  251263
GrnHill           2  280000
StoneBr          43  305308
NridgHt         121  313662
NoRidge          67  326114


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Load the data
Ames = pd.read_csv("Ames.csv")

# One-hot encode "Neighborhood", dropping "MeadowV" so that it acts as the
# baseline category represented by the intercept. The encoder is fitted on the
# whole column before splitting, so every fold sees the same dummy columns.
encoder = OneHotEncoder(sparse_output=False, drop=["MeadowV"])
X = encoder.fit_transform(Ames[["Neighborhood"]])
y = Ames["SalePrice"].values

# KFold setup and per-fold result storage
kf = KFold(n_splits=5)
scores = []
coefficients = []
intercept = []

# KFold cross-validation
# NOTE(review): with unshuffled folds, a neighborhood with very few sales may be
# missing from a fold's training rows; that fold then likely contributes a
# coefficient of 0 for it, shrinking its average. Verify before interpreting
# the coefficients of small categories.
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Store each fold's results
    scores.append(model.score(X_test, y_test))
    coefficients.append(model.coef_)
    intercept.append(model.intercept_)

mean_score = np.mean(scores)
print(f"Mean CV R^2 = {mean_score:.4f}")

# axis=0 averages across folds, leaving one value per encoded column
mean_coefficients = np.mean(coefficients, axis=0)
mean_intercept = np.mean(intercept)
print(f"Mean y-intercept = {mean_intercept:.0f}")

# Column labels for X: the fitted categories minus the one the encoder dropped.
# The dropped position is read from the encoder itself (drop_idx_) so the
# labels stay aligned with the columns without repeating the category name.
neighborhoods = np.delete(encoder.categories_[0], int(encoder.drop_idx_[0])).tolist()

# DataFrame showing the average coefficient for each neighborhood
coefficients_df = pd.DataFrame({
    "Neighborhood": neighborhoods,
    "Average Coefficient": mean_coefficients.round(0).astype(int)
})

# Show the results sorted by coefficient
print(coefficients_df.sort_values(by="Average Coefficient").reset_index(drop=True))

Mean CV R^2 = 0.5408
Mean y-intercept = 96827
   Neighborhood  Average Coefficient
0        BrDale                 9221
1        IDOTRR                11335
2       BrkSide                29235
3       OldTown                30092
4       Landmrk                31729
5       Edwards                36305
6         SWISU                36848
7        Sawyer                40645
8       NPkVill                43988
9       Blueste                46388
10        NAmes                48274
11      Mitchel                65851
12      SawyerW                91252
13      Gilbert                92627
14       NWAmes                93521
15       Greens                96641
16      Blmngtn                99318
17      CollgCr               101342
18      Crawfor               105258
19      ClearCr               116993
20      Somerst               131844
21       Timber               146216
22      Veenker               155042
23      GrnHill               183173
24      StoneBr              

In [38]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load data
Ames = pd.read_csv("Ames.csv")

# Select features and target
features = Ames[["GrLivArea", "Neighborhood"]]
target = Ames["SalePrice"]

# Preprocess features using ColumnTransformer: GrLivArea passes through
# unchanged and Neighborhood is one-hot encoded with "MeadowV" dropped as the
# baseline. verbose_feature_names_out=False keeps the reported feature names
# free of the "num__"/"cat__" transformer prefixes.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", ["GrLivArea"]),
        # handle_unknown has no effect in this cell: the encoder is fitted on
        # the same rows it transforms, so no unseen category can occur.
        ("cat", OneHotEncoder(sparse_output=False, drop=["MeadowV"], handle_unknown="ignore"), ["Neighborhood"])
    ],
    verbose_feature_names_out=False,
)

# Fit and transform the features once, before splitting, so that every fold
# shares the same columns in the same order
X_transformed = preprocessor.fit_transform(features)

# Get feature names for the final DataFrame directly from the fitted
# preprocessor, so they follow the actual output column order
feature_names = list(preprocessor.get_feature_names_out())

# Initialize KFold
kf = KFold(n_splits=5)

# Initialize variables to store results
coefficients_list = []
intercepts_list = []
scores = []

# Perform the KFold cross-validation
for train_index, test_index in kf.split(X_transformed):
    X_train, X_test = X_transformed[train_index], X_transformed[test_index]
    # target is a pandas Series, so positional indexing needs .iloc
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    # Initialize the linear regression model
    model = LinearRegression()

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Store coefficients and intercepts
    coefficients_list.append(model.coef_)
    intercepts_list.append(model.intercept_)

    # Evaluate the model on the held-out fold
    scores.append(model.score(X_test, y_test))

# Calculate the mean of scores, coefficients, and intercepts
# (axis=0 averages across folds, leaving one coefficient per feature)
average_score = np.mean(scores)
average_coefficients = np.mean(coefficients_list, axis=0)
average_intercept = np.mean(intercepts_list)

# Display the average R^2 score and y-intercept across all folds
# The y-intercept is the model's prediction for the dropped baseline
# neighborhood ("MeadowV") at GrLivArea = 0
print(f"Mean CV R^2 Score of Combined Model: {average_score:.4f}")
print(f"Mean y-intercept = {average_intercept:.0f}")

# Create a DataFrame for the coefficients.
# GrLivArea is a per-unit slope while the Neighborhood_* rows are offsets
# relative to the baseline, so the sort interleaves two different kinds of value.
df_coefficients = pd.DataFrame({
    "Feature": feature_names,
    "Average Coefficient": average_coefficients
}).sort_values(by="Average Coefficient").reset_index(drop=True)

# Display the DataFrame
print("Coefficients for Combined Model:")
print(df_coefficients)

Mean CV R^2 Score of Combined Model: 0.7375
Mean y-intercept = 11786
Coefficients for Combined Model:
                 Feature  Average Coefficient
0     Neighborhood_SWISU         -3728.929853
1    Neighborhood_IDOTRR         -1498.971239
2              GrLivArea            78.938757
3   Neighborhood_OldTown          2363.805796
4    Neighborhood_BrDale          6551.114637
5   Neighborhood_BrkSide         16521.117849
6   Neighborhood_Landmrk         16921.529665
7   Neighborhood_Edwards         17520.110407
8   Neighborhood_NPkVill         30034.541748
9     Neighborhood_NAmes         31717.960146
10   Neighborhood_Sawyer         32009.140024
11  Neighborhood_Blueste         39908.310031
12   Neighborhood_NWAmes         44409.237736
13  Neighborhood_Mitchel         48013.229999
14  Neighborhood_SawyerW         48204.606372
15  Neighborhood_Gilbert         49255.248193
16  Neighborhood_Crawfor         55701.500795
17  Neighborhood_ClearCr         61737.497483
18  Neighborhood_CollgCr