In [3]:
import pandas as pd

df = pd.read_csv("ames_housing_no_missing.csv")
print(df.head())


   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0          60       RL         65.0     8450   Pave  Grvl      Reg   
1          20       RL         80.0     9600   Pave  Grvl      Reg   
2          60       RL         68.0    11250   Pave  Grvl      IR1   
3          70       RL         60.0     9550   Pave  Grvl      IR1   
4          60       RL         84.0    14260   Pave  Grvl      IR1   

  LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
1         Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   
2         Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
3         Lvl    AllPub    Corner  ...        0     Gd  MnPrv        Shed   
4         Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
0       0      2    2008        WD         Normal    

In [4]:
print(df.info())
print(df.describe())
print(df.columns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Identify categorical and numeric columns
categorical = X.select_dtypes(include="object").columns.tolist()
numerical = X.select_dtypes(exclude="object").columns.tolist()

# Create a column transformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", StandardScaler(), numerical)
])


In [7]:
# Create a pipeline with preprocessing + model
model = make_pipeline(preprocessor, LinearRegression())

# Fit the model
model.fit(X_train, y_train)


In [9]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Option 1: Just print MSE
print("MSE:", mean_squared_error(y_test, y_pred))

# Option 2: Calculate RMSE manually
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)


MSE: 788848047.3568481
RMSE: 28086.438851460825


In [10]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)


R² Score: 0.8873928271453743


In [20]:
# Ensure custom_data matches the model's expected feature names
custom_data = {
    'MSSubClass': [60],  # Example numerical value
    'MSZoning': ['RL'],  # Example categorical value
    'LotFrontage': [80.0],  # Example numerical value
    'LotArea': [9600],  # Example numerical value
    'Street': ['Pave'],  # Example categorical value
    'Alley': ['Grvl'],  # Example categorical value
    'LotShape': ['Reg'],  # Example categorical value
    'LandContour': ['Lvl'],  # Example categorical value
    'Utilities': ['AllPub'],  # Example categorical value
    'LotConfig': ['Inside'],  # Example categorical value
    'LandSlope': ['Gtl'],  # Example categorical value
    'Neighborhood': ['CollgCr'],  # Example categorical value
    'Condition1': ['Norm'],  # Example categorical value
    'Condition2': ['Norm'],  # Example categorical value
    'BldgType': ['1Fam'],  # Example categorical value
    'HouseStyle': ['2Story'],  # Example categorical value
    'OverallQual': [7],  # Example numerical value
    'OverallCond': [5],  # Example numerical value
    'YearBuilt': [2000],  # Example numerical value
    'YearRemodAdd': [2005],  # Example numerical value
    'TotRmsAbvGrd': [8],  # Example numerical value
    'Fireplaces': [1],  # Example numerical value
    'GarageCars': [2],  # Example numerical value
    'GarageArea': [500],  # Example numerical value
    'BsmtQual': ['TA'],  # Example categorical value
    'BsmtCond': ['TA'],  # Example categorical value
    'BsmtExposure': ['No'],  # Example categorical value
    'BsmtFinType1': ['GLQ'],  # Example categorical value
    'BsmtFinSF1': [500],  # Example numerical value
    'BsmtFinType2': ['ALQ'],  # Example categorical value
    'BsmtFinSF2': [0],  # Example numerical value
    'BsmtUnfSF': [400],  # Example numerical value
    'TotalBsmtSF': [900],  # Example numerical value
    '1stFlrSF': [1200],  # Example numerical value
    '2ndFlrSF': [800],  # Example numerical value
    'LowQualFinSF': [0],  # Example numerical value
    'GrLivArea': [2000],  # Example numerical value
    'BsmtHalfBath': [0],  # Example numerical value
    'HalfBath': [1],  # Example numerical value
    'FullBath': [2],  # Example numerical value
    'BedroomAbvGr': [3],  # Example numerical value
    'KitchenAbvGr': [1],  # Example numerical value
    'TotRmsAbvGrd': [8],  # Example numerical value
    'Functional': ['Typ'],  # Example categorical value
    'GarageFinish': ['RFn'],  # Example categorical value
    'GarageQual': ['TA'],  # Example categorical value
    'GarageCond': ['TA'],  # Example categorical value
    'PoolQC': ['Ex'],  # Example categorical value
    'Fence': ['MnPrv'],  # Example categorical value
    'MiscFeature': ['Gar2'],  # Example categorical value
    'MiscVal': [500],  # Example numerical value
    'MoSold': [6],  # Example numerical value
    'YrSold': [2010],  # Example numerical value
    'SaleType': ['WD'],  # Example categorical value
    'SaleCondition': ['Normal'],  # Example categorical value
}

# Create the DataFrame for custom data
custom_df = pd.DataFrame(custom_data)

# Ensure the custom data frame has the same columns in the same order as the model expects
missing_columns = [col for col in model.feature_names_in_ if col not in custom_df.columns]

# Identify which columns are categorical and numerical
categorical_columns = custom_df.select_dtypes(include=['object']).columns
numerical_columns = custom_df.select_dtypes(include=['int64', 'float64']).columns

# Add missing columns with default values
for col in missing_columns:
    if col in categorical_columns:  # Categorical columns should have a placeholder like 'Unknown'
        custom_df[col] = 'Unknown'
    else:  # Numeric columns should have a value like 0
        custom_df[col] = 0

# Re-order columns to match the model's expected order
custom_df = custom_df[model.feature_names_in_]

# Ensure all columns are correctly typed (convert to numeric where necessary)
for col in numerical_columns:
    custom_df[col] = pd.to_numeric(custom_df[col], errors='coerce')

# Check if there are any NaN values after conversion
custom_df = custom_df.fillna(0)

# Use the model to make predictions
predicted_price = model.predict(custom_df)

# Output the predicted house price
print(f"Predicted House Price: ${predicted_price[0]:,.2f}")


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Assuming the model is already trained

# Define the preprocessor to handle categorical features
categorical_columns = custom_df.select_dtypes(include=['object']).columns

# We are creating a ColumnTransformer to handle categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'  # Leave numerical columns unchanged
)

# Fit the preprocessor on the custom data (this is needed before transformation)
preprocessor.fit(custom_df)

# Transform the custom data
custom_data_transformed = preprocessor.transform(custom_df)

# Get the feature names after one-hot encoding
encoded_column_names = preprocessor.transformers_[0][1].get_feature_names_out(categorical_columns)

# Combine the encoded columns with the non-encoded numerical columns
numerical_columns = custom_df.select_dtypes(include=['number']).columns
all_column_names = list(encoded_column_names) + list(numerical_columns)

# Convert the transformed data to a DataFrame with the new column names
custom_data_df = pd.DataFrame(custom_data_transformed, columns=all_column_names)

# Ensure that the transformed DataFrame matches the model's expected features
# If any columns are missing, we can fill them with defaults (e.g., zeros for numerical columns and 'Unknown' for categorical ones)
missing_columns = set(model.feature_names_in_) - set(custom_data_df.columns)

for col in missing_columns:
    if col in numerical_columns:
        custom_data_df[col] = 0  # Default value for numerical columns
    else:
        custom_data_df[col] = 'Unknown'  # Default value for categorical columns

# Reorder columns to match the model's expected order
custom_data_df = custom_data_df[model.feature_names_in_]

# Use your trained model to make predictions on the transformed DataFrame
predicted_price = model.predict(custom_data_df)

# Output the predicted house price
print(f"Predicted House Price: ${predicted_price[0]:,.2f}")


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''