In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

class HousingModel:
    """Linear-regression baseline for a tabular housing dataset.

    Wraps three steps: loading a CSV, building a scikit-learn preprocessing +
    regression pipeline, and training/evaluating it on a hold-out split.

    Parameters
    ----------
    numeric_features : sequence of str, optional
        Columns that are median-imputed and standardized. Defaults to
        ``DEFAULT_NUMERIC_FEATURES``.
    categorical_features : sequence of str, optional
        Columns that are one-hot encoded. Defaults to
        ``DEFAULT_CATEGORICAL_FEATURES``.
    target : str, optional
        Name of the column to predict.
    test_size : float, optional
        Fraction of rows held out for evaluation.
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible.

    Attributes
    ----------
    pipeline : sklearn.pipeline.Pipeline or None
        ``None`` until ``build_pipeline`` runs (either explicitly or lazily
        from ``train_evaluate``).
    """

    # Defaults mirror the column names this class originally hard-coded.
    DEFAULT_NUMERIC_FEATURES = (
        'longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
    )
    DEFAULT_CATEGORICAL_FEATURES = ('ocean_proximity',)

    def __init__(self, numeric_features=None, categorical_features=None,
                 target='median_house_value', test_size=0.2, random_state=42):
        self.pipeline = None

        # Copy into fresh lists so instances never share (or mutate) the
        # class-level defaults or a caller-owned sequence.
        if numeric_features is None:
            numeric_features = self.DEFAULT_NUMERIC_FEATURES
        if categorical_features is None:
            categorical_features = self.DEFAULT_CATEGORICAL_FEATURES
        self.numeric_features = list(numeric_features)
        self.categorical_features = list(categorical_features)

        self.target = target
        self.test_size = test_size
        self.random_state = random_state

    def load_data(self, filepath):
        """Read a CSV into a DataFrame and report columns with missing values.

        Parameters
        ----------
        filepath : str, path-like or file-like
            Anything accepted by ``pandas.read_csv``.

        Returns
        -------
        pandas.DataFrame
            The loaded data, unmodified (missing values are only reported
            here, not filled).
        """
        df = pd.read_csv(filepath)
        print(f"Data Loaded. Shape: {df.shape}")

        # Per-column count of missing cells; only print columns that have any.
        missing = df.isnull().sum()
        if missing.any():
            print("\nMissing Values Detected (Handled in Pipeline):")
            print(missing[missing > 0])

        return df

    def build_pipeline(self):
        """Create the preprocessing + regression pipeline.

        Steps:
        1. numerical columns   -> median imputation -> standard scaling
        2. categorical columns -> one-hot encoding (categories unseen during
           fit are ignored at predict time)
        3. model               -> ordinary least-squares linear regression

        Columns not listed in ``numeric_features`` or ``categorical_features``
        are not passed to the model (``ColumnTransformer`` is used with its
        default ``remainder``).

        Returns
        -------
        sklearn.pipeline.Pipeline
            The unfitted pipeline, also stored on ``self.pipeline``.
        """
        # Preprocessing for numerical data.
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])

        # Preprocessing for categorical data.
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])

        # Route each column group to its transformer.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features),
            ])

        # Full pipeline: preprocessing followed by the model.
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', LinearRegression()),
        ])

        return self.pipeline

    def train_evaluate(self, df):
        """Fit the pipeline on a train split and report hold-out metrics.

        If ``build_pipeline`` has not been called yet, the pipeline is built
        here so the method can be used on a fresh instance.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the target column plus the configured feature columns.

        Returns
        -------
        tuple
            ``(X_test, y_test, y_pred)``: the hold-out features, the true
            targets, and the pipeline's predictions for them.

        Raises
        ------
        KeyError
            If the target column is not present in ``df``.
        """
        # Fail early with a readable message (same exception type that
        # DataFrame.drop would raise for a missing label).
        if self.target not in df.columns:
            raise KeyError(
                f"Target column '{self.target}' not found in DataFrame"
            )

        # Split data into features / target, then train / test.
        X = df.drop(columns=[self.target])
        y = df[self.target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        # Build lazily instead of failing on `None.fit(...)`.
        if self.pipeline is None:
            self.build_pipeline()

        # Train
        print("\nTraining model...")
        self.pipeline.fit(X_train, y_train)

        # Predict
        y_pred = self.pipeline.predict(X_test)

        # Evaluate: RMSE is in the target's units; R2 is unitless.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print("\n--- Evaluation Results ---")
        print(f"RMSE: ${rmse:,.2f}")
        print(f"R2 Score: {r2:.4f}")

        return X_test, y_test, y_pred


In [None]:

# Run the end-to-end workflow when this cell/script is executed directly.
if __name__ == "__main__":
    # Model wrapper; its pipeline is assembled explicitly below.
    modeler = HousingModel()

    # Read the dataset (load_data also reports any missing values).
    df = modeler.load_data('house.csv')

    # Assemble the preprocessing + regression steps.
    modeler.build_pipeline()

    # Fit on the training split and score on the hold-out split.
    X_test, y_test, y_pred = modeler.train_evaluate(df)

    # Put actual and predicted values side by side for a quick sanity check.
    comparison = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
    print("\nSample Predictions:")
    print(comparison.head())

Data Loaded. Shape: (20640, 10)

Missing Values Detected (Handled in Pipeline):
total_bedrooms    207
dtype: int64

Training model...

--- Evaluation Results ---
RMSE: $70,059.19
R2 Score: 0.6254

Sample Predictions:
         Actual      Predicted
20046   47700.0   54261.027690
3024    45800.0  124430.917728
15663  500001.0  255694.958282
20484  218600.0  268208.010360
9814   278000.0  262975.013606
