In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

class MultiLinearRegression:
    """Ordinary least-squares multiple linear regression on a CSV dataset.

    Typical use is ``load_data()`` -> ``train_model()`` -> ``evaluate_model()``
    / ``predict()``. The model is fitted in closed form on standardized
    features; the fitted scaler is kept on the instance so raw feature rows can
    be scaled consistently at prediction time.

    Attributes set by the methods:
        data: preprocessed DataFrame (one-hot encoded, numeric, no NaN rows).
        X_train, X_test, y_train, y_test: standardized feature splits and
            the matching target splits.
        scaler: StandardScaler fitted on the training split only.
        intercept, coefficients: fitted bias term and per-feature weights.
    """

    # Categorical columns that are one-hot encoded when present in the CSV.
    CATEGORICAL_COLUMNS = ('mainroad', 'guestroom', 'basement', 'hotwaterheating',
                           'airconditioning', 'prefarea', 'furnishingstatus')

    def __init__(self, csv_file, target_column):
        """Store the CSV path and the name of the column to predict.

        Nothing is read from disk until ``load_data()`` is called.
        """
        self.csv_file = csv_file
        self.target_column = target_column
        self.data = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.coefficients = None
        self.intercept = None
        # Fitted in load_data(); lets predict() scale raw inputs on request.
        self.scaler = None

    @staticmethod
    def _add_bias(features):
        """Return ``features`` with a leading column of ones (the bias term)."""
        return np.c_[np.ones((features.shape[0], 1)), features]

    def load_data(self):
        """Read the CSV, preprocess it, and create standardized train/test splits.

        Steps: one-hot encode the known categorical columns (dropping the first
        level of each), coerce every column to numeric, drop rows containing
        NaN, split 80/20 with a fixed seed, then standardize the features.

        Raises:
            ValueError: if no rows survive preprocessing, or fewer than two
                samples remain.
        """
        self.data = pd.read_csv(self.csv_file)

        # Encode only the known categorical columns this CSV actually contains,
        # so a dataset lacking one of them does not fail with a KeyError.
        categorical_columns = [
            column for column in self.CATEGORICAL_COLUMNS if column in self.data.columns
        ]
        if categorical_columns:
            self.data = pd.get_dummies(self.data, columns=categorical_columns, drop_first=True)

        # Ensure all columns are numeric; values that cannot be parsed become NaN
        self.data = self.data.apply(pd.to_numeric, errors='coerce')

        # Drop rows with NaN values
        self.data = self.data.dropna()

        if self.data.empty:
            raise ValueError("No valid data found after preprocessing. Please check the dataset.")

        # Extract features (X) and target variable (y). The explicit float cast
        # turns the boolean dummy columns into 0.0/1.0 instead of an object array.
        X = self.data.drop(columns=[self.target_column]).astype(float).values
        y = self.data[self.target_column].values

        if len(X) < 2:
            raise ValueError("Not enough data samples for training. Please provide a larger dataset.")

        # Split BEFORE scaling so that test-set statistics never influence the
        # scaler (fitting it on the full dataset leaks test information).
        X_train, X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Standardize every feature column using training-split statistics only.
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(X_train)
        self.X_test = self.scaler.transform(X_test)

        print(f"Data loaded successfully! Training samples: {len(self.X_train)}, Testing samples: {len(self.X_test)}")
        # DataFrame.info() writes its report itself and returns None, so it must
        # not be wrapped in print().
        self.data.info()  # Check data types
        print(self.data.describe())  # Check for outliers & feature ranges

    def train_model(self):
        """Fit the intercept and coefficients by least squares on the training split.

        Raises:
            ValueError: if the training data has not been loaded.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        # Convert training data to float
        self.X_train = np.asarray(self.X_train, dtype=float)
        self.y_train = np.asarray(self.y_train, dtype=float)

        # Add bias term (column of ones)
        X_b = self._add_bias(self.X_train)

        # lstsq returns the minimum-norm least-squares solution (so singular
        # designs are still handled) without forming X^T X, which would square
        # the condition number of the problem.
        theta, *_ = np.linalg.lstsq(X_b, self.y_train, rcond=None)

        self.intercept = theta[0]
        self.coefficients = theta[1:]

        print("Model trained successfully!")

    def evaluate_model(self):
        """Return the mean squared error of the fitted model on the test split.

        Raises:
            ValueError: if the test data has not been loaded or the model has
                not been trained.
        """
        if self.X_test is None or self.y_test is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        if self.coefficients is None or self.intercept is None:
            raise ValueError("Model not trained. Call train_model() first.")

        X_b_test = self._add_bias(np.asarray(self.X_test, dtype=float))
        predictions = X_b_test.dot(np.r_[self.intercept, self.coefficients])
        mse = np.mean((np.asarray(self.y_test, dtype=float) - predictions) ** 2)
        return mse

    def predict(self, input_data, scale_input=False):
        """Predict target values for one feature row or a batch of rows.

        Args:
            input_data: a 1-D sequence (single sample) or a 2-D array-like with
                one row per sample, columns in the same order as the training
                features.
            scale_input: when False (default) the input is used as given, i.e.
                it must already be on the same scale as ``X_train``. When True
                the input is treated as raw feature values and is passed
                through the scaler fitted in ``load_data()`` first.

        Returns:
            1-D numpy array with one prediction per input row.

        Raises:
            ValueError: if the model is not trained, the number of features
                does not match, or ``scale_input`` is requested without a
                fitted scaler.
        """
        if self.coefficients is None or self.intercept is None:
            raise ValueError("Model not trained. Call train_model() first.")

        input_data = np.asarray(input_data, dtype=float)
        if input_data.ndim == 1:
            input_data = input_data.reshape(1, -1)

        n_features = len(self.coefficients)
        if input_data.ndim != 2 or input_data.shape[1] != n_features:
            raise ValueError(
                f"Expected input with {n_features} features per row, got shape {input_data.shape}."
            )

        if scale_input:
            if self.scaler is None:
                raise ValueError("No fitted scaler available. Call load_data() first.")
            input_data = self.scaler.transform(input_data)

        X_b = self._add_bias(input_data)
        return X_b.dot(np.r_[self.intercept, self.coefficients])


In [48]:


# Build the regression helper for the housing CSV; "price" is the column to predict
mlr = MultiLinearRegression(csv_file="./dataset/Housing.csv", target_column="price")

# Read and preprocess the dataset, then fit the model on the training split
mlr.load_data()
mlr.train_model()

# Score the fitted model on the held-out split and report the result
mse = mlr.evaluate_model()
print(f"Mean Squared Error: {mse}")




Data loaded successfully! Training samples: 436, Testing samples: 109
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   price                            545 non-null    int64
 1   area                             545 non-null    int64
 2   bedrooms                         545 non-null    int64
 3   bathrooms                        545 non-null    int64
 4   stories                          545 non-null    int64
 5   parking                          545 non-null    int64
 6   mainroad_yes                     545 non-null    bool 
 7   guestroom_yes                    545 non-null    bool 
 8   basement_yes                     545 non-null    bool 
 9   hotwaterheating_yes              545 non-null    bool 
 10  airconditioning_yes              545 non-null    bool 
 11  prefarea_yes                     545 non